In [1]:
import pandas as pd
import json

In [2]:
with open("option_columns.json","r") as f:
    option_columns = json.load(f)

#Find the dataset
df = pd.read_csv("../tuxml-datasets-copie/dataset_encoded.csv", dtype={k:"int8" for k in option_columns})

In [3]:
df.shape

(110854, 12638)

In [4]:
df.head()

Unnamed: 0,104_QUAD_8,21285_WATCHDOG,3C515,53C700_BE_BUS,53C700_LE_ON_BE,60XX_WDT,64BIT,6LOWPAN,6LOWPAN_DEBUGFS,6LOWPAN_GHC_EXT_HDR_DEST,...,ZSWAP,ZX2967_PM_DOMAINS,ZX2967_THERMAL,ZX2967_WATCHDOG,ZX_DMA,ZX_I2S,ZX_SPDIF,ZX_TDM,cid,compile_success
0,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,15000,1
1,1,0,0,0,0,2,0,1,0,1,...,0,0,2,0,2,1,1,1,15001,1
2,1,0,0,0,0,1,0,2,0,1,...,0,0,2,0,2,2,1,2,15002,1
3,1,0,0,0,0,2,0,1,0,1,...,0,0,2,0,1,1,2,2,15003,1
4,1,0,0,0,0,2,0,1,0,1,...,0,1,2,0,1,1,1,1,15004,1


In [5]:
#Only taking configs with cid >= 30000
df = df[df["cid"] >=30000]

In [6]:
df.fillna(-1, inplace=True)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn import tree

In [9]:
import graphviz

def print_tree(clf, f_names, name):
    
    dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=f_names,  
                         filled=True, rounded=True,
                         special_characters=True)  
    graph = graphviz.Source(dot_data)  
    graph.render(name)

In [13]:
from sklearn.tree import _tree
def tree_to_rules(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    rules = []
    #print ("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, previous_rules):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            #print ("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], previous_rules+[name + " <= " + str(threshold)])
            #print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], previous_rules+[name + " > " + str(threshold)])
        else:
            #print(" & ".join(previous_rules) + " ---> " + str(tree_.value[node]))
            if tree_.value[node][0][0] > tree_.value[node][0][1]:
                rules.append(" & ".join(sorted(previous_rules)))
            #rules.append(" & ".join(previous_rules) + " = " + ("0" if tree_.value[node][0][0] > tree_.value[node][0][1] else "1"))

    recurse(0, [])
    return rules
    

In [14]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score, confusion_matrix

rulesTree = {}
for t_size in [0.99, 0.9, 0.5, 0.1]:
    TESTING_SIZE=t_size # 0.001 
    
    shuffle_split = ShuffleSplit(test_size=TESTING_SIZE, n_splits=10)
    
    acc = []
    prec = []
    reca = []
    f1 = []
    balance_acc = []
    spec = []
    
    for train_index, test_index in shuffle_split.split(df.drop(columns="cid"), df["compile_success"]):
        X_train = df.drop(columns="cid").drop(columns="compile_success").iloc[train_index]
        y_train = df["compile_success"].iloc[train_index]
        X_test = df.drop(columns="cid").drop(columns="compile_success").iloc[test_index]
        y_test = df["compile_success"].iloc[test_index]
        
        
        clf = tree.DecisionTreeClassifier(random_state=0)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        acc.append(accuracy_score (y_test, y_pred))
        prec.append(precision_score (y_test, y_pred))
        reca.append(recall_score (y_test, y_pred))
        f1.append(f1_score (y_test, y_pred))
        balance_acc.append(balanced_accuracy_score (y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec.append(tn/(tn+fp))

    rulesTree[t_size] = tree_to_rules(clf, df.drop(columns="cid").drop(columns="compile_success").columns)

    print("Testing size : {}".format(t_size))
    print("Accuracy score:", pd.Series(acc).mean())
    print("Precision score:", pd.Series(prec).mean())
    print("Recall score:", pd.Series(reca).mean())
    print("F1 score:", pd.Series(f1).mean())
    print("Balance accuracy score:", pd.Series(balance_acc).mean())
    print("Specificity:", pd.Series(spec).mean())
    print_tree(clf, df.drop(columns="cid").drop(columns="compile_success").columns, "tree_"+str(t_size))
    print()

Testing size : 0.99
Accuracy score: 0.9903317356205852
Precision score: 0.995326273072439
Recall score: 0.9946224752875894
F1 score: 0.9949736563803601
Balance accuracy score: 0.9377384719661975
Specificity: 0.8808544686448057

Testing size : 0.9
Accuracy score: 0.9962653346745753
Precision score: 0.9985123198234016
Recall score: 0.9976052895513933
F1 score: 0.9980585287603141
Balance accuracy score: 0.9798380423876164
Specificity: 0.9620707952238396

Testing size : 0.5
Accuracy score: 0.9979666167166169
Precision score: 0.9991452237606175
Recall score: 0.998741121454449
F1 score: 0.9989431048251337
Balance accuracy score: 0.9885105373558059
Specificity: 0.978279953257163

Testing size : 0.1
Accuracy score: 0.9985015608740895
Precision score: 0.9993836686601476
Recall score: 0.9990597893885352
F1 score: 0.999221667108894
Balance accuracy score: 0.9915652959106449
Specificity: 0.9840708024327547



In [15]:
for r_list in rulesTree:
    print(len(rulesTree[r_list]))

6
13
41
51


In [16]:
all_rules = []
for r_list in rulesTree:
    all_rules += rulesTree[r_list]

len(all_rules)

111

In [17]:
len(set(all_rules))

101

In [18]:
rules_by_tsize = {}
for r in set(all_rules):
    rules_by_tsize[r] = {r_list:(r in rulesTree[r_list]) for r_list in rulesTree}

In [19]:
def highlight_truefalse(s):
    return ['background-color: green' if v else 'red' for v in s]

In [20]:
dfRules = pd.DataFrame(rules_by_tsize).T.sort_values([0.99,0.9,0.5,0.1])
dfRules.style.apply(highlight_truefalse)

Unnamed: 0,0.1,0.5,0.9,0.99
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & CRC32 > 0.5 & DRM_PANEL_SHARP_LS043T1LE01 <= 1.5 & DVB_NETUP_UNIDVB > 1.5 & GENERIC_ALLOCATOR > 0.5 & REGMAP_MMIO <= 0.5 & SND_SOC_MSM8916_WCD_DIGITAL <= 1.5 & UBSAN_SANITIZE_ALL <= 0.5 & USB_SERIAL_VISOR <= 0.5 & VIDEO_OV5645 > 0.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & BT_HCIUART_NOKIA <= 1.5 & CRC32 > 0.5 & DRM_I915 <= 1.5 & GENERIC_ALLOCATOR > 0.5 & I2C_DESIGNWARE_SLAVE <= 0.5 & MQ_IOSCHED_KYBER <= 0.5 & NLS > 1.5 & PWRSEQ_SD8787 <= 1.5 & REGMAP_MMIO > 0.5 & SND_PDAUDIOCF <= 1.5 & UBSAN_SANITIZE_ALL <= 0.5 & USB_DWC3_ST > 1.5 & VIDEO_ATOMISP > 0.5 & VIDEO_LM3560 > 1.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & BRIDGE_EBT_NFLOG <= 1.5 & CRC32 > 0.5 & GENERIC_ALLOCATOR > 0.5 & HOTPLUG_PCI_CPCI_GENERIC > 0.5 & I2C_EG20T <= 0.5 & MQ_IOSCHED_KYBER > 0.5 & NFC_PN544_I2C <= 1.5 & NIC7018_WDT <= 0.5 & REGMAP_MMIO > 0.5 & UBSAN_SANITIZE_ALL <= 0.5 & USB_SL811_CS > 1.5 & VIDEO_RCAR_VIN > 0.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & COMMON_CLK_XLNX_CLKWZRD > 1.5 & CRC32 > 0.5 & GENERIC_ALLOCATOR > 0.5 & HOTPLUG_PCI_CPCI_GENERIC <= 0.5 & MQ_IOSCHED_KYBER > 0.5 & REGMAP_MMIO > 0.5 & SND_SOC_INTEL_BXT_RT298_MACH > 1.5 & TYPEC_WCOVE <= 1.5 & UBSAN_SANITIZE_ALL <= 0.5 & VIDEO_RCAR_VIN > 0.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & DRM_VBOXVIDEO <= 1.5 & GENERIC_ALLOCATOR <= 0.5 & INFINIBAND_SRPT > 1.5 & SCSI_SRP_ATTRS <= 1.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & BT_HCIUART_NOKIA > 1.5 & CRC32 > 0.5 & DRM_I915 <= 1.5 & GENERIC_ALLOCATOR > 0.5 & MQ_IOSCHED_KYBER <= 0.5 & PWRSEQ_SD8787 <= 1.5 & REGMAP_MMIO > 0.5 & REGULATOR_MC13783 <= 0.5 & SND_PDAUDIOCF <= 1.5 & UBSAN_SANITIZE_ALL <= 0.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & BLK_DEV_SIIMAGE > 1.5 & BRIDGE_EBT_NFLOG <= 1.5 & CRC32 > 0.5 & GENERIC_ALLOCATOR > 0.5 & HOTPLUG_PCI_CPCI_GENERIC > 0.5 & I2C_EG20T > 0.5 & INFINIBAND_QEDR <= 1.5 & MQ_IOSCHED_KYBER > 0.5 & NLS <= 1.5 & PATA_ACPI > 0.5 & REGMAP_MMIO > 0.5 & UBSAN_SANITIZE_ALL <= 0.5 & VIDEO_RCAR_VIN > 0.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5 & WW_MUTEX_SELFTEST > 1.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & BATTERY_TWL4030_MADC > 1.5 & BRIDGE_EBT_NFLOG <= 1.5 & CHARGER_AXP20X <= 1.5 & COMEDI_C6XDIGIO > 1.5 & CRC32 > 0.5 & DVB_BUDGET <= 1.5 & GENERIC_ALLOCATOR > 0.5 & HOTPLUG_PCI_CPCI_GENERIC > 0.5 & I2C_EG20T <= 0.5 & MQ_IOSCHED_KYBER > 0.5 & NFC_PN544_I2C <= 1.5 & REGMAP_MMIO > 0.5 & UBSAN_SANITIZE_ALL <= 0.5 & USB_SL811_CS <= 1.5 & VIDEO_RCAR_VIN > 0.5 & VIDEO_S5P_MIPI_CSIS > 0.5 & VIDEO_XILINX_VTC > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & FORTIFY_SOURCE > 0.5 & GENERIC_ALLOCATOR > 0.5 & INFINIBAND_ADDR_TRANS > 0.5 & IPV6 > 1.5 & PTP_1588_CLOCK_KVM > 1.5 & UBSAN_SANITIZE_ALL > 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
AIC79XX_BUILD_FIRMWARE <= 0.5 & AIC7XXX_BUILD_FIRMWARE <= 0.5 & GENERIC_ALLOCATOR > 0.5 & SND_SOC_RK3399_GRU_SOUND > 1.5 & UBSAN_SANITIZE_ALL <= 0.5 & VIDEO_MUX <= 1.5 & VIDEO_XILINX_VTC <= 0.5 & WANXL_BUILD_FIRMWARE <= 0.5,True,False,False,False
