In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

%matplotlib inline

# --- Notebook configuration -------------------------------------------------
# CSV file name, resolved relative to ../datasets/.
dataset = 'ObsoleteLayoutParam.csv'
# Not referenced in the cells visible here; presumably a default plot size.
figsize = (12, 6)
# Not referenced in the cells visible here; meaning cannot be confirmed from this view.
bag = 5
# Minimum energy reduction, in percent, for a fix to be labelled a gain
# (used when the `result` column is derived).
threshold = 5

# Load the raw measurements for the selected dataset.
df = pd.read_csv(f'../datasets/{dataset}')

In [2]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` to smaller dtypes.

    Integer columns are passed through ``pd.to_numeric(downcast='unsigned')``
    and float columns through ``pd.to_numeric(downcast='float')``. Columns
    that ``pd.to_numeric`` cannot downcast are left as they are.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.
    """
    # Integers are processed first, then floats, each selected from the frame
    # as it stands at that point.
    for selector, target in (('int', 'unsigned'), ('float', 'float')):
        subset = df.select_dtypes(include=[selector])
        df[subset.columns] = subset.apply(pd.to_numeric, downcast=target)

    return df

# Shrink numeric dtypes first (the helper mutates and returns the same frame).
df = downcast_dtypes(df)
# Discard the first column by position, then every row that still has a
# missing value. NOTE(review): the first column is presumably an exported
# index/id column -- confirm against the CSV.
df = df.drop(columns=[df.columns[0]]).dropna()

In [3]:
def strip_colnames(cols):
    """Normalise column labels to lower-case snake_case identifiers.

    Steps, applied in order: lower-case, remove ``#``, remove parentheses and
    dots, strip surrounding whitespace, replace each remaining space with
    ``_``.

    Parameters
    ----------
    cols : pandas.Index (or Series) of str
        The labels to clean; must expose the ``.str`` accessor.

    Returns
    -------
    Same type as ``cols``, holding the cleaned labels.
    """
    cols = cols.str.lower()
    cols = cols.str.replace('#', '', regex=False)
    # The pattern is a regex character class, so regex=True must be explicit:
    # since pandas 2.0 the default is regex=False, which would look for the
    # literal text "[().]" and leave parentheses and dots in the names.
    cols = cols.str.replace(r'[().]', '', regex=True)
    cols = cols.str.strip()
    cols = cols.str.replace(' ', '_', regex=False)

    return cols

# Rename the columns in place to cleaned snake_case labels; later cells refer
# to columns by these names (e.g. 'energy_after_fix').
df.columns = strip_colnames(df.columns)

In [4]:
# Label each row: True when the energy after the fix, expressed as a
# percentage of the energy before the fix, is at most (100 - threshold).
df['result'] = (
    df['energy_after_fix']
    .mul(100)
    .div(df['energy_before_fix'])
    .le(100 - threshold)
)

In [5]:
# Predictor columns fed to the classifier.
features = [
    'egaps_fixed',
    'hits_on_fixed_code',
    'total_method_calls',
    'diff_method_calls',
]

# Design matrix and boolean target (the derived `result` column).
X = df.loc[:, features]
y = df['result']

In [6]:
# Fixed seed so that the train/test split, the fitted tree and therefore the
# reported metrics are identical on every Restart & Run All. Without it both
# train_test_split and DecisionTreeClassifier draw fresh random state each
# run, and the scores drift between executions. Numbers recorded from earlier
# unseeded runs will not match exactly.
SEED = 42

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Fit an unpruned decision tree on the training portion and predict the
# held-out rows.
clf = DecisionTreeClassifier(random_state=SEED)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Fraction of held-out rows classified correctly.
print(metrics.accuracy_score(y_test, y_pred))

0.9151515151515152


In [7]:
# Recall on the held-out rows for the positive (True) class.
metrics.recall_score(y_true=y_test, y_pred=y_pred)

0.8090128755364807

In [8]:
from sklearn.tree import export_graphviz

# Write the fitted tree to tree.dot in Graphviz format.
# class_names are matched to the classifier's classes in ascending order, so
# with a boolean target False -> 'Loss' and True -> 'Gain'.
export_graphviz(
    clf,
    out_file='tree.dot',
    feature_names=features,
    class_names=['Loss', 'Gain'],
    rounded=True,
    proportion=False,
    precision=2,
    filled=True,
)