In [1]:
import pandas as pd
import json

In [2]:
# Read the list of option column names that should be parsed as small integers.
with open("option_columns.json", "r") as columns_file:
    option_columns = json.load(columns_file)

# Load the encoded dataset, storing every listed option column as int8.
# Alternative dataset location: "../tuxml-datasets-copie/dataset_encoded.csv"
option_dtypes = dict.fromkeys(option_columns, "int8")
df = pd.read_csv("dataset_encoded.csv", dtype=option_dtypes)

In [3]:
# Dimensions of the loaded dataset as (rows, columns).
df.shape

(110854, 12638)

In [4]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,104_QUAD_8,21285_WATCHDOG,3C515,53C700_BE_BUS,53C700_LE_ON_BE,60XX_WDT,64BIT,6LOWPAN,6LOWPAN_DEBUGFS,6LOWPAN_GHC_EXT_HDR_DEST,...,ZSWAP,ZX2967_PM_DOMAINS,ZX2967_THERMAL,ZX2967_WATCHDOG,ZX_DMA,ZX_I2S,ZX_SPDIF,ZX_TDM,cid,compile_success
0,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,15000,1
1,1,0,0,0,0,2,0,1,0,1,...,0,0,2,0,2,1,1,1,15001,1
2,1,0,0,0,0,1,0,2,0,1,...,0,0,2,0,2,2,1,2,15002,1
3,1,0,0,0,0,2,0,1,0,1,...,0,0,2,0,1,1,2,2,15003,1
4,1,0,0,0,0,2,0,1,0,1,...,0,1,2,0,1,1,1,1,15004,1


In [5]:
# Only keep configurations with cid >= 30000.
# Equivalent boolean-mask form: df = df[df["cid"] >= 30000]

# inplace=True filters the existing DataFrame object rather than rebinding `df`.
df.query("cid >= 30000", inplace=True)

In [6]:
# Replace every missing value with the sentinel -1, modifying df in place.
df.fillna(-1, inplace=True)

In [7]:
# Dimensions after the cid filter and the missing-value fill.
df.shape

(96096, 12638)

In [28]:
# Show the ten highest-cid rows in which at least one of the three
# *_BUILD_FIRMWARE options equals 1, restricted to cid and those options.
firmware_columns = ['cid', 'AIC7XXX_BUILD_FIRMWARE', 'AIC79XX_BUILD_FIRMWARE', 'WANXL_BUILD_FIRMWARE']
firmware_enabled = df.query("(AIC79XX_BUILD_FIRMWARE == 1) | (AIC7XXX_BUILD_FIRMWARE == 1) | (WANXL_BUILD_FIRMWARE == 1)")
firmware_by_cid = firmware_enabled.sort_values(by="cid", ascending=False)
firmware_by_cid[firmware_columns][:10]

Unnamed: 0,cid,AIC7XXX_BUILD_FIRMWARE,AIC79XX_BUILD_FIRMWARE,WANXL_BUILD_FIRMWARE
110841,126719,0,0,1
72428,88301,1,1,0
70440,86309,0,1,1
70307,86176,0,0,1
70302,86171,0,1,0
70291,86160,0,1,0
70269,86138,0,1,0
70233,86102,0,0,1
70214,86083,1,1,0
70213,86082,0,1,0


In [32]:
# Number of rows whose compile_success value is 0.
failed_configs = df.query("compile_success == 0")
len(failed_configs)

KeyError: 'time'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree

In [None]:
import graphviz

def print_tree(clf, f_names, name):
    """Export a fitted decision tree to Graphviz and render it to disk.

    Parameters
    ----------
    clf : fitted tree estimator handed to ``tree.export_graphviz``.
    f_names : feature names used to label the split nodes.
    name : output name passed to ``graphviz.Source.render``.
    """
    export_options = dict(
        out_file=None,  # return the DOT source as a string instead of writing a file
        feature_names=f_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    dot_source = tree.export_graphviz(clf, **export_options)
    graphviz.Source(dot_source).render(name)

In [None]:
from sklearn.tree import _tree
def tree_to_rules(tree, feature_names):
    """Collect the decision paths of a fitted tree that end in a class-0 leaf.

    Parameters
    ----------
    tree : object exposing a ``tree_`` attribute with the parallel arrays
        ``feature``, ``threshold``, ``children_left``, ``children_right``
        and ``value`` (as a fitted scikit-learn decision tree does).
    feature_names : sequence of str
        Names indexed by the feature indices stored in ``tree_.feature``.

    Returns
    -------
    list of str
        One rule per leaf whose first class entry in ``tree_.value`` is
        strictly larger than its second. Each rule is the alphabetically
        sorted list of conditions on the path to that leaf, joined by " & ".
        Rules appear in depth-first order, left branch first.

    Notes
    -----
    The comparison reads two entries of ``tree_.value[node][0]``, so the tree
    must have been fitted on a two-class target.
    """
    tree_ = tree.tree_
    rules = []

    def recurse(node, previous_rules):
        left = tree_.children_left[node]
        right = tree_.children_right[node]

        # A leaf stores the same sentinel in both child slots, so differing
        # children identify a split node without needing sklearn's private
        # `_tree` constants.
        if left != right:
            name = feature_names[tree_.feature[node]]
            threshold = str(tree_.threshold[node])
            recurse(left, previous_rules + [name + " <= " + threshold])
            recurse(right, previous_rules + [name + " > " + threshold])
            return

        # Leaf: keep the path only when class index 0 outweighs class index 1.
        class_weights = tree_.value[node][0]
        if class_weights[0] > class_weights[1]:
            rules.append(" & ".join(sorted(previous_rules)))

    recurse(0, [])
    return rules
    

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score, confusion_matrix

# Evaluate a decision tree at several test-set proportions. For each proportion
# the scores are averaged over 10 random splits; the rules and the rendered
# tree come from the classifier fitted on the last split.
rulesTree = {}

# The feature matrix and target are the same for every split, so build them
# once instead of re-dropping columns from the full DataFrame on each iteration.
features = df.drop(columns=["cid", "compile_success"])
target = df["compile_success"]

for t_size in [0.99, 0.9, 0.5, 0.1]:
    TESTING_SIZE = t_size

    # Fixed seed so the splits (and therefore the reported averages) are reproducible.
    shuffle_split = ShuffleSplit(test_size=TESTING_SIZE, n_splits=10, random_state=0)

    acc = []
    prec = []
    reca = []
    f1 = []
    balance_acc = []
    spec = []

    for train_index, test_index in shuffle_split.split(features, target):
        X_train = features.iloc[train_index]
        y_train = target.iloc[train_index]
        X_test = features.iloc[test_index]
        y_test = target.iloc[test_index]

        clf = tree.DecisionTreeClassifier(random_state=0)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        reca.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        balance_acc.append(balanced_accuracy_score(y_test, y_pred))
        # labels=[0, 1] keeps the matrix 2x2 even when a split lacks one class,
        # so the four-way unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
        # Specificity: share of actual class-0 samples predicted as class 0.
        spec.append(tn/(tn+fp))

    # Rules are extracted from the classifier of the final split only.
    rulesTree[t_size] = tree_to_rules(clf, features.columns)

    print("Testing size : {}".format(t_size))
    print("Accuracy score:", pd.Series(acc).mean())
    print("Precision score:", pd.Series(prec).mean())
    print("Recall score:", pd.Series(reca).mean())
    print("F1 score:", pd.Series(f1).mean())
    print("Balance accuracy score:", pd.Series(balance_acc).mean())
    print("Specificity:", pd.Series(spec).mean())
    print_tree(clf, features.columns, "tree_"+str(t_size))
    print()

In [None]:
# Print how many rules were extracted for each testing size.
for size_rules in rulesTree.values():
    print(len(size_rules))

In [None]:
# Flatten the per-testing-size rule lists into one list (duplicates kept).
all_rules = [rule for size_rules in rulesTree.values() for rule in size_rules]

len(all_rules)

In [None]:
# Number of distinct rules across all testing sizes.
distinct_rules = set(all_rules)
len(distinct_rules)

In [None]:
# For every distinct rule, record which testing sizes produced it:
# {rule: {testing_size: True/False}}.
rules_by_tsize = {
    rule: {t_size: (rule in size_rules) for t_size, size_rules in rulesTree.items()}
    for rule in set(all_rules)
}

In [None]:
def highlight_truefalse(s):
    """Map each value of ``s`` to a CSS background declaration.

    Truthy values get a green background and falsy values a red one. The
    result has one string per element of ``s``.

    Parameters
    ----------
    s : iterable of values, e.g. the column handed over by ``Styler.apply``.

    Returns
    -------
    list of str
        Complete ``property: value`` CSS declarations; a bare colour name
        would not be a valid declaration and would leave the cell unstyled.
    """
    return ['background-color: green' if v else 'background-color: red' for v in s]

In [None]:
# One row per rule, one boolean column per testing size; rows are ordered by
# rule presence from the largest testing size to the smallest.
sort_columns = [0.99,0.9,0.5,0.1]
rule_presence = pd.DataFrame(rules_by_tsize).T
dfRules = rule_presence.sort_values(sort_columns)
dfRules.style.apply(highlight_truefalse)