In [None]:
# this file needs to be in the same directory as "cleaned_K8.csv"
# assuming we've already run clean_data.py: run with "python3 driver.py"
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.svm import SVC
from sklearn.tree import plot_tree

In [3]:
import knn
import svm
import logistic_regression

ModuleNotFoundError: No module named 'knn'

In [None]:
# Load the cleaned K8 data. The CSV has no header row (header=None), so the
# columns are addressed by integer position; column 5408 is used as the label
# throughout this notebook.
df = pd.read_csv(
    "cleaned_K8.csv",
    header=None,
    low_memory=False,
)

In [None]:
# Bar chart of how many rows carry each label (column 5408).
# The Series is passed as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data` rather than `x`, which changes what gets plotted.
ax = sns.countplot(x=df[5408])
ax.set_xticklabels([0, 1])
ax.set_title('data distribution')
ax.set_xlabel('labels')
ax.set_ylabel('counts')
# No plt.legend() here: nothing in this plot carries a label, so the call only
# produced a "no artists with labels" warning.
plt.show()

### Oversampling Technique

In [None]:
# Count rows per label. value_counts() orders its result by frequency, not by
# label, so each class is looked up by its label value instead of relying on
# positional unpacking (which is only correct when class 0 is the majority).
label_counts = df[5408].value_counts()
class_count_0, class_count_1 = label_counts.loc[0], label_counts.loc[1]

In [None]:
# Show both class counts on one line, separated by a single space.
print(f"{class_count_0!s} {class_count_1!s}")

In [None]:
# Split the frame into one subset per label value (label is column 5408).
label_column = df[5408]
class_0 = df[label_column == 0]
class_1 = df[label_column == 1]

# Report the (rows, columns) shape of each subset.
for caption, subset in (('class 0:', class_0), ('class 1:', class_1)):
    print(caption, subset.shape)

In [None]:
# Oversample class 1 with replacement until it has class_count_0 rows.
# A fixed random_state makes the resampled frame, and every score derived from
# it, reproducible across kernel restarts.
class_1_over = class_1.sample(class_count_0, replace=True, random_state=42)

# Stack the resampled class-1 rows on top of the untouched class-0 rows.
# The result is ordered by class (all class 1 first, then all class 0).
test_over = pd.concat([class_1_over, class_0], axis=0)

# Bar chart of the label counts after oversampling.
test_over[5408].value_counts().plot(kind='bar', title='count (target)')

In [None]:
# Preview the first five rows of the oversampled frame.
test_over.head(n=5)

In [None]:
# Call the SVM helper on the oversampled frame and print whatever it returns.
# NOTE(review): `SVM` is not defined or imported in any visible cell; only the
# module `svm` is imported, and that import currently fails with
# ModuleNotFoundError. Presumably this should be `svm.SVM` and 0.7 is the
# training fraction — confirm against svm.py.
acc_svm = SVM(test_over,0.7)
print(acc_svm)

In [None]:
# Hold out 30% of the oversampled frame for testing.
# `test_over` is built by concatenating all class-1 rows followed by all
# class-0 rows, so a positional head/tail split would leave the test set with a
# single class (ROC AUC is then undefined). Shuffle with a fixed seed first.
# NOTE(review): oversampling happened before this split, so duplicated class-1
# rows can land in both train and test; the scores are optimistic.
shuffled_over = test_over.sample(frac=1, random_state=42)
size_train = int(0.7 * len(shuffled_over))
df_train, df_test = shuffled_over.iloc[:size_train], shuffled_over.iloc[size_train:]

# The last column is the label; everything before it is a feature.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

# Plain SVC: the classes were already balanced by oversampling, so no
# class_weight is used here (class_weight='balanced' would penalize mistakes
# on the minority class instead).
svc_model = SVC()
svc_model.fit(X_train, y_train)

# Hard class predictions on the held-out rows; scored in the next cell.
svc_predict = svc_model.predict(X_test)

In [None]:
# Score the SVC trained on the oversampled data against the held-out rows.
# ROC AUC is a ranking metric, so it is fed the continuous decision values;
# computed from hard 0/1 predictions it collapses to a single operating point.
svc_scores = svc_model.decision_function(X_test)
print('ROCAUC score:', roc_auc_score(y_test, svc_scores))
# Accuracy and F1 are threshold metrics and use the hard predictions.
print('Accuracy score:', accuracy_score(y_test, svc_predict))
print('F1 score:', f1_score(y_test, svc_predict))

### Penalized SVM technique

In [None]:
# Penalized SVM: train on the original (not resampled) frame and let the
# classifier re-weight the classes via class_weight='balanced'.
# NOTE(review): this is a positional 70/30 split with no shuffling — confirm
# that the row order of cleaned_K8.csv does not correlate with the label.
size_train = int(0.7 * len(df))
df_train = df.iloc[:size_train]
df_test = df.iloc[size_train:]

# The last column is the label; everything before it is a feature.
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]
X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]

# probability=True additionally fits a probability calibration during fit().
svc_model = SVC(class_weight='balanced', probability=True)
svc_model.fit(X_train, y_train)

# Hard class predictions on the held-out rows; scored in the next cell.
svc_predict = svc_model.predict(X_test)

In [None]:
# Score the class-weighted SVC against the held-out rows.
# ROC AUC is a ranking metric, so it is fed the continuous decision values;
# computed from hard 0/1 predictions it collapses to a single operating point.
svc_scores = svc_model.decision_function(X_test)
print('ROCAUC score:', roc_auc_score(y_test, svc_scores))
# Accuracy and F1 are threshold metrics and use the hard predictions.
print('Accuracy score:', accuracy_score(y_test, svc_predict))
print('F1 score:', f1_score(y_test, svc_predict))

### SVM --> RandomForest

In [None]:
# Random forest on the current X_train / y_train, i.e. whichever split cell
# above was executed last (the penalized-SVM split when run top to bottom).
# A fixed random_state keeps the forest, its scores and the tree plotted later
# reproducible.
rfc = RandomForestClassifier(random_state=42)

# Fit on the training features and labels.
rfc.fit(X_train, y_train)

# Hard predictions feed the threshold metrics; the predicted probability of the
# larger label (column 1 of predict_proba) feeds ROC AUC, which needs a
# continuous score rather than 0/1 predictions.
rfc_predict = rfc.predict(X_test)
rfc_scores = rfc.predict_proba(X_test)[:, 1]
print('ROCAUC score:', roc_auc_score(y_test, rfc_scores))
print('Accuracy score:', accuracy_score(y_test, rfc_predict))
print('F1 score:', f1_score(y_test, rfc_predict))

In [None]:
# Draw the first tree of the fitted forest on a very large canvas so the node
# text stays legible; `fig` is kept so a later cell can save it.
fig, ax = plt.subplots(figsize=(100, 100))
plot_tree(
    rfc.estimators_[0],
    # Name only the columns the forest was fitted on (the label column is not a
    # feature), as a plain list of strings: some scikit-learn releases accept
    # only a list here and reject a pandas Index.
    feature_names=[str(col) for col in X_train.columns],
    # class_names are matched to the labels in ascending order (0 first, then 1).
    # NOTE(review): confirm that label 0 really means "active" in cleaned_K8.csv.
    class_names=["active", "inactive"],
    filled=True,
    impurity=True,
    rounded=True,
    ax=ax,
)


In [None]:
# Write the tree figure to a PNG in the current working directory.
output_path = 'figure_name.png'
fig.savefig(output_path)