In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the training frame (df1) and the hold-out test frame (df2) from CSV.
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_with_best_features.csv", "test_data.csv")
)

In [3]:
X_train=df1.drop(columns="class")
Y_train=df1["class"]

In [4]:
X_train.shape, Y_train.shape

((60000, 12), (60000,))

In [5]:
x_test=df2.drop(columns="class")
y_test=df2["class"]

In [6]:
x_test.shape, y_test.shape

((16000, 12), (16000,))

In [7]:
# Seeded so the preview shows the same five rows after Restart & Run All.
X_train.sample(5, random_state=42)

Unnamed: 0,az_000,az_002,cs_001,ai_000,ba_008,cn_000,cn_008,ag_001,ed_000,dd_000,ec_00,ba_009
55766,1334.0,728.0,434.0,0.0,0.0,0.0,4014.0,0.0,1040.0,1810.0,918.64,0.0
58459,2492.0,1004.0,330.0,0.0,69950.0,0.0,3552.0,0.0,1714.0,1548.0,2.24,5746.0
51029,6396.0,2866.0,520.0,0.0,4.0,0.0,1994.0,0.0,202.0,2212.0,0.98,0.0
3146,4432.0,938.0,1200.0,0.0,0.0,0.0,8116.0,0.0,2980.0,4294.0,2468.2,0.0
47716,3706.0,838.0,1034.0,0.0,50.0,0.0,3086.0,0.0,2022.0,7008.0,2844.0,0.0


In [8]:
# Seeded so the preview shows the same five rows after Restart & Run All.
x_test.sample(5, random_state=42)

Unnamed: 0,az_000,az_002,cs_001,ai_000,ba_008,cn_000,cn_008,ag_001,ed_000,dd_000,ec_00,ba_009
9976,5022.0,5916.0,104.0,0.0,0.0,0.0,218.0,0.0,284.0,410.0,293.32,0.0
11742,610.0,368.0,164.0,0.0,0.0,0.0,806.0,0.0,408.0,626.0,316.96,0.0
8366,1704.0,280.0,20.0,0.0,0.0,0.0,70.0,0.0,60.0,84.0,93.88,0.0
250,106.0,82.0,16.0,0.0,0.0,0.0,0.0,0.0,54.0,82.0,47.28,0.0
9332,3192.0,1530.0,420.0,0.0,8046.0,0.0,952.0,0.0,1558.0,1308.0,0.9,58.0


In [9]:
# Handling the imbalanced dataset

In [10]:
smote=SMOTE()

In [11]:
x_train,y_train=smote.fit_resample(X_train,Y_train)

In [12]:
# Scaling the data

In [13]:
scale=StandardScaler()

In [14]:
scale.fit(x_train,y_train)

In [15]:
x_train_scaled=pd.DataFrame(scale.transform(x_train), columns=x_train.columns)

In [16]:
x_test_scaled=pd.DataFrame(scale.transform(x_test), columns=x_test.columns)

In [None]:
# Logistic Regression

In [30]:
lr=LogisticRegression()

In [31]:
lr.fit(x_train_scaled,y_train)

In [32]:
lr_pred=lr.predict(x_test_scaled)

In [42]:
def model_evaluation(y_test,y_predict):
    """Compute binary-classification metrics from true and predicted labels.

    Parameters
    ----------
    y_test : array-like
        Ground-truth class labels.
    y_predict : array-like
        Predicted class labels (hard predictions, e.g. from ``.predict``).

    Returns
    -------
    dict
        Plain Python floats under the keys "Accuracy", "Recall", "Precision",
        "F1", "Specificity" and "Roc_Auc_Score". A ratio whose denominator is
        zero (e.g. precision when no positives were predicted) is reported as
        0.0 instead of NaN.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the labels are not binary.

    Notes
    -----
    The ROC AUC is computed from the labels passed in ``y_predict``, not from
    probability scores.
    """
    from sklearn.metrics import confusion_matrix, roc_auc_score

    def _ratio(numerator, denominator):
        # Guard against 0/0, which would otherwise surface as NaN + RuntimeWarning.
        return numerator / denominator if denominator else 0.0

    matrix = confusion_matrix(y_test,y_predict)
    if matrix.shape != (2, 2):
        raise ValueError(
            "model_evaluation expects binary labels; got a confusion matrix "
            "of shape {}".format(matrix.shape)
        )
    # t : True, f : False, n : Negative, p : Positive
    # .ravel() flattens the 2x2 matrix row by row; cast to Python ints so the
    # returned metrics are plain floats rather than NumPy scalars.
    tn, fp, fn, tp = (int(count) for count in matrix.ravel())

    accuracy = _ratio(tp + tn, tp + fp + fn + tn)
    recall = _ratio(tp, tp + fn)
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * precision * recall, precision + recall)
    specificity = _ratio(tn, tn + fp)
    roc_auc = float(roc_auc_score(y_test,y_predict))

    results = {"Accuracy":accuracy,"Recall" : recall, "Precision" : precision,"F1" : f1,
              "Specificity" : specificity, "Roc_Auc_Score":roc_auc}
    return results

In [43]:
# Score the logistic-regression predictions against the hold-out labels.
model_evaluation(y_test=y_test, y_predict=lr_pred)

{'Accuracy': 0.954,
 'Recall': 0.8613333333333333,
 'Precision': 0.32075471698113206,
 'F1': 0.467438494934877,
 'Specificity': 0.956224,
 'Roc_Auc_Score': 0.9087786666666666}

In [None]:
# Decision Tree 

In [44]:
dt=DecisionTreeClassifier()

In [45]:
dt.fit(x_train,y_train)

In [46]:
dt_pred=dt.predict(x_test)

In [47]:
model_evaluation(y_test,dt_pred)

{'Accuracy': 0.9699375,
 'Recall': 0.6773333333333333,
 'Precision': 0.41368078175895767,
 'F1': 0.5136501516683519,
 'Specificity': 0.97696,
 'Roc_Auc_Score': 0.8271466666666667}

In [None]:
# Post Pruning

In [53]:
# Cost-complexity (post-)pruning: fit one tree per candidate alpha and record
# train/test accuracy for each, so the two curves can be compared.
path = dt.cost_complexity_pruning_path(x_train_scaled, y_train)
ccp_alpha = path['ccp_alphas']

dt_model = []
for ccp in ccp_alpha:
    # Use a separate name so the baseline `dt` fitted earlier is not overwritten.
    pruned_dt = DecisionTreeClassifier(ccp_alpha=ccp, random_state=42)
    pruned_dt.fit(x_train_scaled, y_train)
    dt_model.append(pruned_dt)

train_score = [i.score(x_train_scaled, y_train) for i in dt_model]
# The trees were trained on standardized features, so they must be scored on
# the standardized test frame as well (scoring on raw x_test compares
# thresholds against values on a different scale).
# NOTE(review): choosing alpha from test-set accuracy leaks the test set into
# model selection; consider a validation split or cross-validation instead.
test_score = [i.score(x_test_scaled, y_test) for i in dt_model]

In [None]:
# Accuracy of the pruned trees as a function of the pruning strength alpha.
# Relies on `matplotlib.pyplot` being imported as `plt` in the imports cell.
fig, ax = plt.subplots()
ax.set_xlabel("Alpha")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs alpha training and testing sets")
ax.plot(ccp_alpha, train_score, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alpha, test_score, marker='o', label="test", drawstyle="steps-post")
ax.legend()
# Render the figure explicitly instead of leaking the Legend repr as cell output.
plt.show()

In [None]:
# Hyper parameter tuning

In [55]:
# Random forest

In [69]:
rf=RandomForestClassifier(oob_score=True)

In [70]:
rf.fit(x_train,y_train)

In [71]:
rf_pred=rf.predict(x_test)

In [72]:
model_evaluation(y_test,rf_pred)

{'Accuracy': 0.9785,
 'Recall': 0.7413333333333333,
 'Precision': 0.5295238095238095,
 'F1': 0.6177777777777776,
 'Specificity': 0.984192,
 'Roc_Auc_Score': 0.8627626666666667}

In [None]:
# Hyper parameter tuning

In [None]:
# Support Vector

In [63]:
svc=SVC()

In [65]:
svc.fit(x_train,y_train)

In [66]:
svc_pred=svc.predict(x_test)

In [67]:
model_evaluation(y_test,svc_pred)

{'Accuracy': 0.94175,
 'Recall': 0.896,
 'Precision': 0.27339300244100895,
 'F1': 0.4189526184538653,
 'Specificity': 0.942848,
 'Roc_Auc_Score': 0.919424}

In [68]:
# Hyper parameter tuning

In [None]:
# Ada Boost

In [73]:
ada=AdaBoostClassifier()

In [74]:
ada.fit(x_train,y_train)

In [76]:
ada_pred=ada.predict(x_test)

In [77]:
model_evaluation(y_test,ada_pred)

{'Accuracy': 0.9475,
 'Recall': 0.8853333333333333,
 'Precision': 0.2940655447298494,
 'F1': 0.44148936170212766,
 'Specificity': 0.948992,
 'Roc_Auc_Score': 0.9171626666666667}

In [None]:
# Hyper Parameter Tuning

In [79]:
# Gradient Boosting

In [80]:
gb=GradientBoostingClassifier()

In [81]:
gb.fit(x_train,y_train)

In [83]:
gb_pred=gb.predict(x_test)

In [84]:
model_evaluation(y_test,gb_pred)

{'Accuracy': 0.9555,
 'Recall': 0.9013333333333333,
 'Precision': 0.3336623889437315,
 'F1': 0.48703170028818443,
 'Specificity': 0.9568,
 'Roc_Auc_Score': 0.9290666666666666}

In [93]:
vc=VotingClassifier([('lr', LogisticRegression(max_iter=5000)), ('gb', gb), ('ada', ada),('rf',rf)],voting="soft")

In [94]:
vc.fit(x_train, y_train)

In [95]:
vc_pred=vc.predict(x_test)

In [96]:
model_evaluation(y_test, vc_pred)

{'Accuracy': 0.9625625,
 'Recall': 0.888,
 'Precision': 0.3741573033707865,
 'F1': 0.5264822134387352,
 'Specificity': 0.964352,
 'Roc_Auc_Score': 0.9261759999999999}