# More Models

In [59]:
# Import base libraries
import pandas as pd
import numpy as np
from scipy.io import arff

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.utils import class_weight

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras import layers
from keras import models
from keras import optimizers
from keras import regularizers

#from functions import *

from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load data
data3 = arff.loadarff('../data/3year.arff')
df3 = pd.DataFrame(data3[0])

# Change label/class type to binary
df3['class'] = df3['class'].astype('int64')

df3.shape

(10503, 65)

In [48]:
df3.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr56,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class
0,0.17419,0.41299,0.14371,1.348,-28.982,0.60383,0.21946,1.1225,1.1961,0.46359,...,0.16396,0.37574,0.83604,7e-06,9.7145,6.2813,84.291,4.3303,4.0341,0
1,0.14624,0.46038,0.2823,1.6294,2.5952,0.0,0.17185,1.1721,1.6018,0.53962,...,0.027516,0.271,0.90108,0.0,5.9882,4.1103,102.19,3.5716,5.95,0
2,0.000595,0.22612,0.48839,3.1599,84.874,0.19114,0.004572,2.9881,1.0077,0.67566,...,0.007639,0.000881,0.99236,0.0,6.7742,3.7922,64.846,5.6287,4.4581,0
3,0.024526,0.43236,0.27546,1.7833,-10.105,0.56944,0.024526,1.3057,1.0509,0.56453,...,0.048398,0.043445,0.9516,0.14298,4.2286,5.0528,98.783,3.695,3.4844,0
4,0.18829,0.41504,0.34231,1.9279,-58.274,0.0,0.23358,1.4094,1.3393,0.58496,...,0.17648,0.32188,0.82635,0.073039,2.5912,7.0756,100.54,3.6303,4.6375,0


### Remove 14 Attributes

In [49]:
# Clean, remove 14 rows

columns_to_delete = ['Attr37', 'Attr21', 'Attr27', 'Attr60', 'Attr45', 'Attr54', 'Attr64', 
                     'Attr53', 'Attr28', 'Attr24', 'Attr41', 'Attr32', 'Attr52', 'Attr47']

df3_c1 = df3.drop(columns_to_delete, axis=1)
df3_c1 = df3_c1[df3_c1.isnull().sum(axis=1) < 4]
df3_c1 = df3_c1.fillna(df3_c1.median())

In [50]:
df3_c1.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr51,Attr55,Attr56,Attr57,Attr58,Attr59,Attr61,Attr62,Attr63,class
0,0.17419,0.41299,0.14371,1.348,-28.982,0.60383,0.21946,1.1225,1.1961,0.46359,...,0.41299,127280.0,0.16396,0.37574,0.83604,7e-06,6.2813,84.291,4.3303,0
1,0.14624,0.46038,0.2823,1.6294,2.5952,0.0,0.17185,1.1721,1.6018,0.53962,...,0.44849,3387.8,0.027516,0.271,0.90108,0.0,4.1103,102.19,3.5716,0
2,0.000595,0.22612,0.48839,3.1599,84.874,0.19114,0.004572,2.9881,1.0077,0.67566,...,0.22612,20453.0,0.007639,0.000881,0.99236,0.0,3.7922,64.846,5.6287,0
3,0.024526,0.43236,0.27546,1.7833,-10.105,0.56944,0.024526,1.3057,1.0509,0.56453,...,0.35164,5012.6,0.048398,0.043445,0.9516,0.14298,5.0528,98.783,3.695,0
4,0.18829,0.41504,0.34231,1.9279,-58.274,0.0,0.23358,1.4094,1.3393,0.58496,...,0.36891,13730.0,0.17648,0.32188,0.82635,0.073039,7.0756,100.54,3.6303,0


### Remove 16 Attributes

In [51]:
# Clean, remove 16 rows

columns_to_delete = ['Attr37', 'Attr21', 'Attr27', 'Attr60', 'Attr45', 'Attr54', 'Attr64', 
                     'Attr53', 'Attr28', 'Attr24', 'Attr41', 'Attr32', 'Attr52', 'Attr47',
                    'Attr5', 'Attr61']

df3_c2 = df3.drop(columns_to_delete, axis=1)
df3_c2 = df3_c2[df3_c2.isnull().sum(axis=1) < 4]
df3_c2 = df3_c2.fillna(df3_c2.median())

In [52]:
df3_c2.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr6,Attr7,Attr8,Attr9,Attr10,Attr11,...,Attr50,Attr51,Attr55,Attr56,Attr57,Attr58,Attr59,Attr62,Attr63,class
0,0.17419,0.41299,0.14371,1.348,0.60383,0.21946,1.1225,1.1961,0.46359,0.21946,...,1.348,0.41299,127280.0,0.16396,0.37574,0.83604,7e-06,84.291,4.3303,0
1,0.14624,0.46038,0.2823,1.6294,0.0,0.17185,1.1721,1.6018,0.53962,0.17579,...,1.5874,0.44849,3387.8,0.027516,0.271,0.90108,0.0,102.19,3.5716,0
2,0.000595,0.22612,0.48839,3.1599,0.19114,0.004572,2.9881,1.0077,0.67566,0.004572,...,3.1599,0.22612,20453.0,0.007639,0.000881,0.99236,0.0,64.846,5.6287,0
3,0.024526,0.43236,0.27546,1.7833,0.56944,0.024526,1.3057,1.0509,0.56453,0.024526,...,1.4504,0.35164,5012.6,0.048398,0.043445,0.9516,0.14298,98.783,3.695,0
4,0.18829,0.41504,0.34231,1.9279,0.0,0.23358,1.4094,1.3393,0.58496,0.23881,...,1.7136,0.36891,13730.0,0.17648,0.32188,0.82635,0.073039,100.54,3.6303,0


## XGBoost Final Model with 50 attributes

In [45]:
# Assign target and predictor
y = df3_c1['class']
X = df3_c1.drop('class', axis=1)

# Sepearate data into train and test splist
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Scale/Normalize the predictor variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print('X_train shape = ', X_train.shape)
print('y_train shape = ', y_train.shape)
print('X_test shape = ', X_test.shape)
print('y_test shape = ', y_test.shape)

X_train shape =  (8356, 50)
y_train shape =  (8356,)
X_test shape =  (2089, 50)
y_test shape =  (2089,)


In [46]:
#Model 7:

xgbParams = {
    'eval_metric': 'logloss', 
    'random_state': 42,
    'scale_pos_weight': 20,
    'n_estimators': 125, 
    'max_depth': 5,
    'min_child_weight': 3,
    'gamma': 0,
    'learning_rate': 0.20,
    'max_delta_step': 0,
    'reg_lambda': 0,
    'reg_alpha': 5,
    'subsample': 1,
    'colsample_bytree': 0.7
}

In [47]:
#Run 

weigths_train = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

clf = XGBClassifier(**xgbParams) 
clf.fit(X_train, y_train, sample_weight=weigths_train) 
    
print('Training Data:\n', classification_report(y_train, clf.predict(X_train)))
print('Testing Data:\n', classification_report(y_test, clf.predict(X_test)))


Training Data:
               precision    recall  f1-score   support

           0       1.00      0.95      0.98      7961
           1       0.51      1.00      0.68       395

    accuracy                           0.96      8356
   macro avg       0.76      0.98      0.83      8356
weighted avg       0.98      0.96      0.96      8356

Testing Data:
               precision    recall  f1-score   support

           0       0.97      0.92      0.94      1993
           1       0.22      0.50      0.31        96

    accuracy                           0.90      2089
   macro avg       0.60      0.71      0.63      2089
weighted avg       0.94      0.90      0.92      2089



## XGBoost Final Model with 48 attributes

In [None]:
# Assign target and predictor
y = df3_c2['class']
X = df3_c2.drop('class', axis=1)

# Sepearate data into train and test splist
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Scale/Normalize the predictor variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print('X_train shape = ', X_train.shape)
print('y_train shape = ', y_train.shape)
print('X_test shape = ', X_test.shape)
print('y_test shape = ', y_test.shape)

In [25]:
#Run

weigths_train = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

clf = XGBClassifier(**xgbParams) 
clf.fit(X_train, y_train, sample_weight=weigths_train) 
    
print('Training Data:\n', classification_report(y_train, clf.predict(X_train)))
print('Testing Data:\n', classification_report(y_test, clf.predict(X_test)))


Training Data:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97      7961
           1       0.50      1.00      0.67       395

    accuracy                           0.95      8356
   macro avg       0.75      0.98      0.82      8356
weighted avg       0.98      0.95      0.96      8356

Testing Data:
               precision    recall  f1-score   support

           0       0.97      0.90      0.94      1993
           1       0.20      0.51      0.29        96

    accuracy                           0.88      2089
   macro avg       0.59      0.71      0.61      2089
weighted avg       0.94      0.88      0.91      2089



## Neural Network with 50 attributes

In [53]:
# Assign target and predictor
y = df3_c1['class']
X = df3_c1.drop('class', axis=1)

# Sepearate data into train and test splist
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Scale/Normalize the predictor variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print('X_train shape = ', X_train.shape)
print('y_train shape = ', y_train.shape)
print('X_test shape = ', X_test.shape)
print('y_test shape = ', y_test.shape)

X_train shape =  (8356, 50)
y_train shape =  (8356,)
X_test shape =  (2089, 50)
y_test shape =  (2089,)


In [54]:
model = Sequential()
model.add(Dense(50, activation='relu', input_shape=(50,)))
#model.add(Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.005), input_shape=(48,)))
model.add(Dense(20, activation='relu'))
#model.add(Dense(20, activation='relu', kernel_regularizer=regularizers.l2(0.005)))
model.add(Dense(1, activation='sigmoid'))

In [56]:
model.compile(loss='binary_crossentropy',
                optimizer='sgd',
                metrics=['acc'])

history = model.fit(X_train,
                    y_train,
                    epochs=50,
                    batch_size=8356,
                    validation_split=0.20,
                    verbose=False
                    )

y_train_pred = np.round(model.predict(X_train))
y_test_pred = np.round(model.predict(X_test))

print('Data 3\n')
print('Training Data:\n', classification_report(y_train, y_train_pred))
print('Testing Data:\n', classification_report(y_test, y_test_pred))

Data 3

Training Data:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98      7961
           1       0.00      0.00      0.00       395

    accuracy                           0.95      8356
   macro avg       0.48      0.50      0.49      8356
weighted avg       0.91      0.95      0.93      8356

Testing Data:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98      1993
           1       0.00      0.00      0.00        96

    accuracy                           0.95      2089
   macro avg       0.48      0.50      0.49      2089
weighted avg       0.91      0.95      0.93      2089



## Logistics Regression with 50 attributes

In [57]:
# Assign target and predictor
y = df3_c1['class']
X = df3_c1.drop('class', axis=1)

# Sepearate data into train and test splist
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Scale/Normalize the predictor variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print('X_train shape = ', X_train.shape)
print('y_train shape = ', y_train.shape)
print('X_test shape = ', X_test.shape)
print('y_test shape = ', y_test.shape)

X_train shape =  (8356, 50)
y_train shape =  (8356,)
X_test shape =  (2089, 50)
y_test shape =  (2089,)


In [60]:
#Resample

smote = SMOTE()
X_train_rs, y_train_rs = smote.fit_resample(X_train, y_train)

print('Original training data class distribution:')
print(pd.Series(y_train).value_counts())

print('Synthetic training data class distribution:')
print(pd.Series(y_train_rs).value_counts())

Original training data class distribution:
0    7961
1     395
dtype: int64
Synthetic training data class distribution:
1    7961
0    7961
dtype: int64


In [65]:
## LogisticRegression

logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_rs, y_train_rs)

print('Data3, Resampled:\n')
print('Training Data:\n', classification_report(y_train_rs, logreg.predict(X_train_rs)))
print('Testing Data:\n', classification_report(y_test, logreg.predict(X_test)))

Data3, Resampled:

Training Data:
               precision    recall  f1-score   support

           0       0.76      0.65      0.70      7961
           1       0.70      0.80      0.74      7961

    accuracy                           0.72     15922
   macro avg       0.73      0.72      0.72     15922
weighted avg       0.73      0.72      0.72     15922

Testing Data:
               precision    recall  f1-score   support

           0       0.98      0.66      0.79      1993
           1       0.08      0.66      0.15        96

    accuracy                           0.66      2089
   macro avg       0.53      0.66      0.47      2089
weighted avg       0.93      0.66      0.76      2089



## XGBoost  Default Model with 50 attributes

In [64]:
#XGBoost, Not Resampled
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
xgb.fit(X_train, y_train)

print('Data3, NOT Resampled:\n')
print('Training Data:\n', classification_report(y_train, xgb.predict(X_train)))
print('Testing Data:\n', classification_report(y_test, xgb.predict(X_test)))

Data3, NOT Resampled:

Training Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7961
           1       1.00      1.00      1.00       395

    accuracy                           1.00      8356
   macro avg       1.00      1.00      1.00      8356
weighted avg       1.00      1.00      1.00      8356

Testing Data:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1993
           1       0.69      0.11      0.20        96

    accuracy                           0.96      2089
   macro avg       0.82      0.56      0.59      2089
weighted avg       0.95      0.96      0.94      2089



In [66]:
#XGBoost, esampled

xgb = XGBClassifier(random_state=42, eval_metric='logloss')
xgb.fit(X_train_rs, y_train_rs)

print('Data3, Resampled:\n')
print('Training Data:\n', classification_report(y_train_rs, xgb.predict(X_train_rs)))
print('Testing Data:\n', classification_report(y_test, xgb.predict(X_test)))

Data3, Resampled:

Training Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7961
           1       1.00      1.00      1.00      7961

    accuracy                           1.00     15922
   macro avg       1.00      1.00      1.00     15922
weighted avg       1.00      1.00      1.00     15922

Testing Data:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      1993
           1       0.37      0.38      0.37        96

    accuracy                           0.94      2089
   macro avg       0.67      0.67      0.67      2089
weighted avg       0.94      0.94      0.94      2089



## Random Forest with 50 Attributes

In [68]:
# RandomForestClassifier, Resamapled

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_rs, y_train_rs) 

print('Training Data:\n', classification_report(y_train_rs, rf.predict(X_train_rs)))
print('Testing Data:\n', classification_report(y_test, rf.predict(X_test)))

Training Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7961
           1       1.00      1.00      1.00      7961

    accuracy                           1.00     15922
   macro avg       1.00      1.00      1.00     15922
weighted avg       1.00      1.00      1.00     15922

Testing Data:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      1993
           1       0.17      0.18      0.17        96

    accuracy                           0.92      2089
   macro avg       0.57      0.57      0.57      2089
weighted avg       0.92      0.92      0.92      2089

