In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

%matplotlib inline

dataset='ObsoleteLayoutParam.csv'
figsize=(12,6)
bag=5
threshold=5

df = pd.read_csv('../datasets/' + dataset)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4950 entries, 0 to 4949
Data columns (total 9 columns):
EGAP(s)                 4950 non-null object
# EGAPs fixed           4950 non-null int64
# Hits on fixed code    4950 non-null int64
Energy (before fix)     4950 non-null float64
Energy (after fix)      4950 non-null float64
Time (before fix)       4950 non-null float64
Time (after fix)        4950 non-null float64
# Total method calls    4950 non-null int64
# Diff. method calls    4950 non-null int64
dtypes: float64(4), int64(4), object(1)
memory usage: 348.1+ KB


In [2]:
def downcast_dtypes(df):
    df_int = df.select_dtypes(include=['int'])
    df_int = df_int.apply(pd.to_numeric, downcast='unsigned')
    df[df_int.columns] = df_int

    df_float = df.select_dtypes(include=['float'])
    df_float = df_float.apply(pd.to_numeric, downcast='float')
    df[df_float.columns] = df_float
    
    return df

df = downcast_dtypes(df)
df = df.drop([df.columns[0]], axis=1)
df = df.dropna()

In [3]:
def strip_colnames(cols):
    cols = cols.str.lower()
    cols = cols.str.replace('#', '', regex=False)
    cols = cols.str.replace('[().]', '')
    cols = cols.str.strip()
    cols = cols.str.replace(' ', '_', regex=False)

    return cols

df.columns = strip_colnames(df.columns)

In [4]:
df['result'] = (df['energy_after_fix'] * 100 / df['energy_before_fix']) <= (100 - threshold)

In [5]:
features = ['egaps_fixed', 'hits_on_fixed_code', 'total_method_calls', 'diff_method_calls']

X = df[features]
y = df.result

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('DecisionTree Accuracy:', metrics.accuracy_score(y_test, y_pred))

DecisionTree Accuracy: 0.9218855218855219


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print('RandomForest Accuracy:', metrics.accuracy_score(y_test, y_pred))

RandomForest Accuracy: 0.927946127946128
