# import libraries

In [98]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, auc, precision_recall_curve, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

# read data

In [4]:
train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')

In [58]:
print(len(train))
print(len(test))

97470
24368


In [5]:
x_train = train[['dt_1', 'sd_1', 'curr_1', 'dt_2', 'sd_2', 'curr_2', 'dt_3', 'sd_3', 'curr_3']]
y_train = train[['label']]

In [6]:
x_test = test[['dt_1', 'sd_1', 'curr_1', 'dt_2', 'sd_2', 'curr_2', 'dt_3', 'sd_3', 'curr_3']]
y_test = test[['label']]

In [7]:
print(x_train)
print(y_train)

            dt_1      sd_1      curr_1      dt_2      sd_2      curr_2  \
111580  0.008783  8.101429  120.785714  0.006820  2.735357   97.360714   
69119   0.008210  5.084660  109.573298  0.007052  3.294262   99.619372   
26583   0.007401  3.407595  117.047619  0.008130  7.191667  117.714286   
25994   0.006635  4.596190  125.380952  0.008313  3.578095  135.000000   
97869   0.008859  3.481591  121.795455  0.007250  9.245909  119.340909   
...          ...       ...         ...       ...       ...         ...   
55198   0.007220  4.180106  102.797872  0.007915  6.224894  123.680851   
66396   0.010214  3.330773  108.331395  0.009082  3.186163  107.141279   
8813    0.007950  3.633457  105.602469  0.007768  8.373086  118.024691   
79926   0.007546  3.304286  123.244898  0.009403  7.121633  126.591837   
80075   0.009732  3.202821  124.487179  0.009788  6.491282  126.358974   

            dt_3      sd_3     curr_3  
111580  0.005948  2.179286  88.953571  
69119   0.010659  2.407592  91.

# original dataset

## logistic regression

In [97]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [100]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test))}')
print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23273     0]
 [ 1095     0]]
accuracy: 0.9551
roc auc: 0.7082
pr auc: 0.0893


## decision tree

In [10]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

In [11]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test))}')
print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[22443   830]
 [  794   301]]
accuracy: 0.9334
roc auc: 0.6196
pr auc: 0.2868


## random forest

In [15]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [18]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test))}')
print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23143   130]
 [  858   237]]
accuracy: 0.9595
roc auc: 0.8744
pr auc: 0.4131


## xgboost

In [12]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

In [13]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test))}')
print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23095   178]
 [  833   262]]
accuracy: 0.9585
roc auc: 0.885
pr auc: 0.3879


# scaled dataset
-- attempted both StandardScaler and MinMaxScaler, no significant improvement in performances observed

In [76]:
scaler = StandardScaler()
# scaler = MinMaxScaler()

In [77]:
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

## logistic regression

In [78]:
lr = LogisticRegression()
lr.fit(x_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


In [79]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test_scaled))}')
print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23273     0]
 [ 1095     0]]
accuracy: 0.9551
roc auc: 0.7162
pr auc: 0.0981


## decision tree

In [80]:
dt = DecisionTreeClassifier()
dt.fit(x_train_scaled, y_train)

In [81]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test_scaled))}')
print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[22465   808]
 [  794   301]]
accuracy: 0.9343
roc auc: 0.6201
pr auc: 0.2894


## random forest

In [82]:
rf = RandomForestClassifier()
rf.fit(x_train_scaled, y_train)

  return fit_method(estimator, *args, **kwargs)


In [83]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test_scaled))}')
print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23140   133]
 [  851   244]]
accuracy: 0.9596
roc auc: 0.8775
pr auc: 0.4087


## xgboost

In [84]:
xgb = XGBClassifier()
xgb.fit(x_train_scaled, y_train)

In [85]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test_scaled))}')
print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test_scaled)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test_scaled)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test_scaled)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[23091   182]
 [  823   272]]
accuracy: 0.9588
roc auc: 0.884
pr auc: 0.3955


# oversampling (class 1)
-- no significant improvements for default params, mainly increase in AUC values (both ROC and PR) for decision tree, but in general lower accuracy observed across all 4 models

-- also tested ADASYN (different oversampling technique), similar results obtained but slightly worse than SMOTE

In [86]:
Counter(y_train.label)

Counter({0: 93090, 1: 4380})

In [87]:
# sm = SMOTE()
sm = ADASYN()
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

In [88]:
Counter(y_train_sm.label)

Counter({1: 93564, 0: 93090})

## logistic regression

In [89]:
lr = LogisticRegression()
lr.fit(x_train_sm, y_train_sm)

  y = column_or_1d(y, warn=True)


In [90]:
print(f'confusion matrix:\n {confusion_matrix(y_test, lr.predict(x_test))}')
print(f'accuracy: {round(accuracy_score(y_test, lr.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, lr.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, lr.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[13627  9646]
 [  323   772]]
accuracy: 0.5909
roc auc: 0.7012
pr auc: 0.0801


## decision tree

In [91]:
dt = DecisionTreeClassifier()
dt.fit(x_train_sm, y_train_sm)

In [92]:
print(f'confusion matrix:\n {confusion_matrix(y_test, dt.predict(x_test))}')
print(f'accuracy: {round(accuracy_score(y_test, dt.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, dt.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, dt.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[20988  2285]
 [  550   545]]
accuracy: 0.8837
roc auc: 0.6998
pr auc: 0.3564


## random forest

In [93]:
rf = RandomForestClassifier()
rf.fit(x_train_sm, y_train_sm)

  return fit_method(estimator, *args, **kwargs)


In [94]:
print(f'confusion matrix:\n {confusion_matrix(y_test, rf.predict(x_test))}')
print(f'accuracy: {round(accuracy_score(y_test, rf.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, rf.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, rf.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[22237  1036]
 [  510   585]]
accuracy: 0.9366
roc auc: 0.8913
pr auc: 0.3817


## xgboost

In [95]:
xgb = XGBClassifier()
xgb.fit(x_train_sm, y_train_sm)

In [96]:
print(f'confusion matrix:\n {confusion_matrix(y_test, xgb.predict(x_test))}')
print(f'accuracy: {round(accuracy_score(y_test, xgb.predict(x_test)),4)}')
print(f'roc auc: {round(roc_auc_score(y_test, xgb.predict_proba(x_test)[:,1]),4)}')

precision, recall, thresholds = precision_recall_curve(y_test, xgb.predict_proba(x_test)[:,1])
print(f'pr auc: {round(auc(recall, precision),4)}')

confusion matrix:
 [[19662  3611]
 [  288   807]]
accuracy: 0.84
roc auc: 0.8767
pr auc: 0.3738


# end