In [1]:
import pandas as pd
import json

In [2]:
with open("option_columns.json","r") as f:
    option_columns = json.load(f)

#Find the dataset
# df = pd.read_csv("../tuxml-datasets-copie/dataset_encoded.csv", dtype={k:"int8" for k in option_columns})
df = pd.read_csv("../tuxml-datasets-copie/dataset_encoded.csv", dtype={k:"int8" for k in option_columns})

In [3]:
df.shape

(110854, 12638)

In [4]:
df.head()

Unnamed: 0,104_QUAD_8,21285_WATCHDOG,3C515,53C700_BE_BUS,53C700_LE_ON_BE,60XX_WDT,64BIT,6LOWPAN,6LOWPAN_DEBUGFS,6LOWPAN_GHC_EXT_HDR_DEST,...,ZSWAP,ZX2967_PM_DOMAINS,ZX2967_THERMAL,ZX2967_WATCHDOG,ZX_DMA,ZX_I2S,ZX_SPDIF,ZX_TDM,cid,compile_success
0,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,15000,1
1,1,0,0,0,0,2,0,1,0,1,...,0,0,2,0,2,1,1,1,15001,1
2,1,0,0,0,0,1,0,2,0,1,...,0,0,2,0,2,2,1,2,15002,1
3,1,0,0,0,0,2,0,1,0,1,...,0,0,2,0,1,1,2,2,15003,1
4,1,0,0,0,0,2,0,1,0,1,...,0,1,2,0,1,1,1,1,15004,1


In [5]:
#Only taking configs with cid >= 30000
# df = df[df["cid"] >=30000]

df.query("cid >= 30000", inplace=True)

In [6]:
df.fillna(-1, inplace=True)

In [7]:
df.shape

(96096, 12638)

In [8]:
df.query("(AIC79XX_BUILD_FIRMWARE == 1) | (AIC7XXX_BUILD_FIRMWARE == 1) | (WANXL_BUILD_FIRMWARE == 1)").sort_values(by="cid", ascending=False)[['cid', 'AIC7XXX_BUILD_FIRMWARE', 'AIC79XX_BUILD_FIRMWARE', 'WANXL_BUILD_FIRMWARE']][:10]

Unnamed: 0,cid,AIC7XXX_BUILD_FIRMWARE,AIC79XX_BUILD_FIRMWARE,WANXL_BUILD_FIRMWARE
110841,126719,0,0,1
72428,88301,1,1,0
70440,86309,0,1,1
70307,86176,0,0,1
70302,86171,0,1,0
70291,86160,0,1,0
70269,86138,0,1,0
70233,86102,0,0,1
70214,86083,1,1,0
70213,86082,0,1,0


In [9]:
len(df.query("compile_success == 0").index)

3622

In [10]:
from sklearn.model_selection import train_test_split
from sklearn import tree

In [11]:
import graphviz

def print_tree(clf, f_names, name):
    
    dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=f_names,  
                         filled=True, rounded=True,
                         special_characters=True)  
    graph = graphviz.Source(dot_data)  
    graph.render(name)

In [12]:
from sklearn.tree import _tree
def tree_to_rules(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    rules = []
    #print ("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, previous_rules):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            #print ("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], previous_rules+[name + " <= " + str(threshold)])
            #print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], previous_rules+[name + " > " + str(threshold)])
        else:
            #print(" & ".join(previous_rules) + " ---> " + str(tree_.value[node]))
            if tree_.value[node][0][0] > tree_.value[node][0][1]:
                rules.append(" & ".join(sorted(previous_rules)))
            #rules.append(" & ".join(previous_rules) + " = " + ("0" if tree_.value[node][0][0] > tree_.value[node][0][1] else "1"))

    recurse(0, [])
    return rules
    

In [13]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score, confusion_matrix

rulesTree = {}
metrics = {}

for t_size in [0.999,0.995,0.992,0.99, 0.9, 0.5, 0.1]:
    TESTING_SIZE=t_size # 0.001 
    
    shuffle_split = ShuffleSplit(test_size=TESTING_SIZE, n_splits=10)
    
    acc = []
    prec = []
    reca = []
    f1 = []
    balance_acc = []
    spec = []
    
    for train_index, test_index in shuffle_split.split(df.drop(columns="cid"), df["compile_success"]):
        X_train = df.drop(columns="cid").drop(columns="compile_success").iloc[train_index]
        y_train = df["compile_success"].iloc[train_index]
        X_test = df.drop(columns="cid").drop(columns="compile_success").iloc[test_index]
        y_test = df["compile_success"].iloc[test_index]
        
        
        clf = tree.DecisionTreeClassifier(random_state=0)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        acc.append(accuracy_score (y_test, y_pred))
        prec.append(precision_score (y_test, y_pred))
        reca.append(recall_score (y_test, y_pred))
        f1.append(f1_score (y_test, y_pred))
        balance_acc.append(balanced_accuracy_score (y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec.append(tn/(tn+fp))
    
    metrics[TESTING_SIZE] = {
        "accuracy" : acc,
        "balanced_accuracy" : balance_acc,
        "specificity" : spec
    }
    
    rulesTree[t_size] = tree_to_rules(clf, df.drop(columns="cid").drop(columns="compile_success").columns)

    print("Testing size : {}".format(t_size))
    print("Accuracy score:", pd.Series(acc).mean())
    print("Precision score:", pd.Series(prec).mean())
    print("Recall score:", pd.Series(reca).mean())
    print("F1 score:", pd.Series(f1).mean())
    print("Balance accuracy score:", pd.Series(balance_acc).mean())
    print("Specificity:", pd.Series(spec).mean())
    print_tree(clf, df.drop(columns="cid").drop(columns="compile_success").columns, "tree_"+str(t_size))
    print()

Testing size : 0.999
Accuracy score: 0.9357802083333333
Precision score: 0.9717787895850855
Recall score: 0.9612242175087442
F1 score: 0.9663069175981589
Balance accuracy score: 0.6237079438885476
Specificity: 0.28619167026835096

Testing size : 0.995
Accuracy score: 0.9879685408299865
Precision score: 0.9935791261334785
Recall score: 0.9939257448231309
F1 score: 0.9937435004171575
Balance accuracy score: 0.9149358614768719
Specificity: 0.8359459781306129

Testing size : 0.992
Accuracy score: 0.9881766112789527
Precision score: 0.9953803730221843
Recall score: 0.9923170948173192
F1 score: 0.9938387292295138
Balance accuracy score: 0.9373844250862051
Specificity: 0.882451755355091

Testing size : 0.99
Accuracy score: 0.9892753531786074
Precision score: 0.9949195282005039
Recall score: 0.9939344645434828
F1 score: 0.994420752790359
Balance accuracy score: 0.9321399031962867
Specificity: 0.8703453418490902

Testing size : 0.9
Accuracy score: 0.9962491472706881
Precision score: 0.998506468

In [14]:
for r_list in rulesTree:
    print(len(rulesTree[r_list]))

3
5
5
8
15
29
56


In [15]:
all_rules = []
for r_list in rulesTree:
    all_rules += rulesTree[r_list]

len(all_rules)

121

In [16]:
len(set(all_rules))

107

In [17]:
rules_by_tsize = {}
for r in set(all_rules):
    rules_by_tsize[r] = {r_list:(r in rulesTree[r_list]) for r_list in rulesTree}

In [18]:
def highlight_truefalse(s):
    return ['background-color: green' if v else 'red' for v in s]

In [19]:
dfRules = pd.DataFrame(rules_by_tsize).T.sort_values([0.99,0.9,0.5,0.1])
dfRules.style.apply(highlight_truefalse)

Unnamed: 0,0.1,0.5,0.9,0.99,0.992,0.995,0.999
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & NET_EMATCH_IPSET <= 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,False,False,False,False,False,True,False
COMEDI_DYNA_PCI10XX > 0.5 & PATA_CS5536 > 1.5,False,False,False,False,False,False,True
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & NET_EMATCH_IPSET > 0.5 & USB_NET2280 > 1.5 & WANXL_BUILD_FIRMWARE <= 0.5 & X86_VSYSCALL_EMULATION <= 0.5,False,False,False,False,False,True,False
COMEDI_DYNA_PCI10XX > 0.5 & PATA_CS5536 <= 1.5 & USB_DWC3_EXYNOS > 1.5,False,False,False,False,False,False,True
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & QEDF > 1.5 & WANXL_BUILD_FIRMWARE <= 0.5,False,False,False,False,True,False,False
AIC79XX_BUILD_FIRMWARE > 0.5,False,False,False,False,True,True,False
COMEDI_DYNA_PCI10XX <= 0.5,False,False,False,False,False,False,True
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & DRM_VBOXVIDEO > 1.5 & GENERIC_ALLOCATOR <= 0.5 & QEDF <= 1.5 & WANXL_BUILD_FIRMWARE <= 0.5,False,False,False,False,True,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & BRIDGE_EBT_ARPREPLY <= 1.5 & CAN_PEAK_USB > 1.5 & CRC32 > 0.5 & GENERIC_ALLOCATOR > 0.5 & IIO_ST_LSM6DSX_SPI <= 0.5 & SND_SOC_IMX_WM8962 <= 1.5 & UBSAN_SANITIZE_ALL <= 0.5 & VIDEO_RCAR_VIN > 0.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5 & XEN_PCIDEV_FRONTEND <= 0.5,True,False,False,False,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & CRC32 > 0.5 & GENERIC_ALLOCATOR > 0.5 & MQ_IOSCHED_KYBER > 0.5 & REGMAP_MMIO <= 0.5 & SND_SOC_WM8978 > 1.5 & UBSAN_SANITIZE_ALL <= 0.5 & VIDEO_RCAR_VIN > 0.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5 & XEN_PCIDEV_FRONTEND > 0.5,True,False,False,False,False,False,False


In [20]:
dfMetrics = pd.DataFrame(metrics)
dfMetrics

Unnamed: 0,0.999,0.995,0.992,0.99,0.9,0.5,0.1
accuracy,"[0.934625, 0.95003125, 0.9356875, 0.93740625, ...","[0.9868641231593038, 0.9887780287817939, 0.991...","[0.9898350956696879, 0.994219956361195, 0.9784...","[0.9786726370669357, 0.9875546585940128, 0.998...","[0.9963462716940118, 0.9967278319284979, 0.995...","[0.9983349983349983, 0.9979187479187479, 0.997...","[0.9976066597294485, 0.9987513007284079, 0.998..."
balanced_accuracy,"[0.7349862109183611, 0.7218049283962162, 0.707...","[0.8696500567139436, 0.9266607190069873, 0.927...","[0.926443385808339, 0.9350895198115932, 0.9287...","[0.9240398508888001, 0.9265681707276532, 0.979...","[0.9788765660935501, 0.979216578462386, 0.9789...","[0.9880097665064671, 0.9881289128729427, 0.990...","[0.9870473842710508, 0.9924791730947239, 0.994..."
specificity,"[0.5190924183729939, 0.4749792645839093, 0.461...","[0.742865059573289, 0.8594834768119967, 0.8578...","[0.8579006141820212, 0.8711383245198998, 0.875...","[0.8649553571428571, 0.8606077502090884, 0.959...","[0.9599876885195445, 0.9602832512315271, 0.960...","[0.9768623024830699, 0.9775342465753425, 0.982...","[0.975609756097561, 0.9857142857142858, 0.9888..."


In [23]:
import matplotlib.pyplot as plt

tsize = ["{0.3f}".format(1-m) for m in metrics]

for metric in ["accuracy","balanced_accuracy","specificity"]:
    data = [metrics[m][metric] for m in metrics]

    plt.boxplot(data, labels=tsize)
    plt.title(metric)
    plt.xlabel('Testing set size')

    plt.show()

AttributeError: 'float' object has no attribute '3f'