# Final Model with No missing values

In [1]:
# Import base libraries
import pandas as pd
import numpy as np
from scipy.io import arff

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.utils import class_weight

#from functions import *

from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the raw ARFF file; element 0 of the loadarff result holds the records
data3 = arff.loadarff('../data/3year.arff')

# Build the frame and store the label column as int64 in one step
df3 = pd.DataFrame(data3[0]).astype({'class': 'int64'})

# Display (rows, columns) of the loaded frame
df3.shape

(10503, 65)

## Remove 14 Attributes

In [10]:
# Clean: drop 14 attribute columns, discard sparse rows, impute what is left

columns_to_delete = [
    'Attr37', 'Attr21', 'Attr27', 'Attr60', 'Attr45', 'Attr54', 'Attr64',
    'Attr53', 'Attr28', 'Attr24', 'Attr41', 'Attr32', 'Attr52', 'Attr47',
]

# NOTE(review): the medians are computed on the whole frame, before the
# train/test split done in a later cell, so test rows influence the imputed
# values -- confirm whether that leakage is acceptable.
df3_c1 = (
    df3
    .drop(columns=columns_to_delete)
    # keep only rows with fewer than 4 missing values among the remaining columns
    .loc[lambda frame: frame.isna().sum(axis=1) < 4]
    # fill the remaining gaps with each column's median
    .pipe(lambda frame: frame.fillna(frame.median()))
)

In [23]:
# Preview the first rows of the cleaned frame (14 attribute columns removed)
df3_c1.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr51,Attr55,Attr56,Attr57,Attr58,Attr59,Attr61,Attr62,Attr63,class
0,0.17419,0.41299,0.14371,1.348,-28.982,0.60383,0.21946,1.1225,1.1961,0.46359,...,0.41299,127280.0,0.16396,0.37574,0.83604,7e-06,6.2813,84.291,4.3303,0
1,0.14624,0.46038,0.2823,1.6294,2.5952,0.0,0.17185,1.1721,1.6018,0.53962,...,0.44849,3387.8,0.027516,0.271,0.90108,0.0,4.1103,102.19,3.5716,0
2,0.000595,0.22612,0.48839,3.1599,84.874,0.19114,0.004572,2.9881,1.0077,0.67566,...,0.22612,20453.0,0.007639,0.000881,0.99236,0.0,3.7922,64.846,5.6287,0
3,0.024526,0.43236,0.27546,1.7833,-10.105,0.56944,0.024526,1.3057,1.0509,0.56453,...,0.35164,5012.6,0.048398,0.043445,0.9516,0.14298,5.0528,98.783,3.695,0
4,0.18829,0.41504,0.34231,1.9279,-58.274,0.0,0.23358,1.4094,1.3393,0.58496,...,0.36891,13730.0,0.17648,0.32188,0.82635,0.073039,7.0756,100.54,3.6303,0


In [16]:
# Separate the predictors from the target column
X = df3_c1.drop(columns='class')
y = df3_c1['class']

# Split the data into train and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Scale/normalize the predictors: fit on the training split only, apply to both
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Convert the targets to plain NumPy arrays
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

# Report the shape of every split
for shape_label, split_array in (
    ('X_train shape = ', X_train),
    ('y_train shape = ', y_train),
    ('X_test shape = ', X_test),
    ('y_test shape = ', y_test),
):
    print(shape_label, split_array.shape)

X_train shape =  (8356, 50)
y_train shape =  (8356,)
X_test shape =  (2089, 50)
y_test shape =  (2089,)


In [17]:
#Model 7:

# Keyword arguments later unpacked into XGBClassifier(**xgbParams)
xgbParams = dict(
    # evaluation metric and seed
    eval_metric='logloss',
    random_state=42,
    # extra weight on the positive class
    scale_pos_weight=20,
    # ensemble size and tree shape
    n_estimators=125,
    max_depth=5,
    min_child_weight=3,
    gamma=0,
    # step-size controls
    learning_rate=0.20,
    max_delta_step=0,
    # L2 (lambda) and L1 (alpha) regularization
    reg_lambda=0,
    reg_alpha=5,
    # row / column sampling
    subsample=1,
    colsample_bytree=0.7,
)

In [18]:
#Run

# Per-sample weights for the training labels, using scikit-learn's 'balanced' mode
weigths_train = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

# NOTE(review): xgbParams also sets scale_pos_weight, so the positive class
# appears to be up-weighted twice (there and through sample_weight) -- confirm
# that this is intended.
clf = XGBClassifier(**xgbParams)
clf.fit(X_train, y_train, sample_weight=weigths_train)

# Print a classification report for the training split, then the test split
for report_title, split_X, split_y in (
    ('Training Data:\n', X_train, y_train),
    ('Testing Data:\n', X_test, y_test),
):
    print(report_title, classification_report(split_y, clf.predict(split_X)))


Training Data:
               precision    recall  f1-score   support

           0       1.00      0.95      0.98      7961
           1       0.51      1.00      0.68       395

    accuracy                           0.96      8356
   macro avg       0.76      0.98      0.83      8356
weighted avg       0.98      0.96      0.96      8356

Testing Data:
               precision    recall  f1-score   support

           0       0.97      0.92      0.94      1993
           1       0.22      0.50      0.31        96

    accuracy                           0.90      2089
   macro avg       0.60      0.71      0.63      2089
weighted avg       0.94      0.90      0.92      2089



## Remove 16 Attributes

In [20]:
# Clean: drop 16 attribute columns, discard sparse rows, impute what is left

columns_to_delete = [
    'Attr37', 'Attr21', 'Attr27', 'Attr60', 'Attr45', 'Attr54', 'Attr64',
    'Attr53', 'Attr28', 'Attr24', 'Attr41', 'Attr32', 'Attr52', 'Attr47',
    'Attr5', 'Attr61',
]

# NOTE(review): the medians are computed on the whole frame, before the
# train/test split done in a later cell, so test rows influence the imputed
# values -- confirm whether that leakage is acceptable.
df3_c2 = (
    df3
    .drop(columns=columns_to_delete)
    # keep only rows with fewer than 4 missing values among the remaining columns
    .loc[lambda frame: frame.isna().sum(axis=1) < 4]
    # fill the remaining gaps with each column's median
    .pipe(lambda frame: frame.fillna(frame.median()))
)

In [24]:
# Preview the first rows of the cleaned frame (16 attribute columns removed)
df3_c2.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr6,Attr7,Attr8,Attr9,Attr10,Attr11,...,Attr50,Attr51,Attr55,Attr56,Attr57,Attr58,Attr59,Attr62,Attr63,class
0,0.17419,0.41299,0.14371,1.348,0.60383,0.21946,1.1225,1.1961,0.46359,0.21946,...,1.348,0.41299,127280.0,0.16396,0.37574,0.83604,7e-06,84.291,4.3303,0
1,0.14624,0.46038,0.2823,1.6294,0.0,0.17185,1.1721,1.6018,0.53962,0.17579,...,1.5874,0.44849,3387.8,0.027516,0.271,0.90108,0.0,102.19,3.5716,0
2,0.000595,0.22612,0.48839,3.1599,0.19114,0.004572,2.9881,1.0077,0.67566,0.004572,...,3.1599,0.22612,20453.0,0.007639,0.000881,0.99236,0.0,64.846,5.6287,0
3,0.024526,0.43236,0.27546,1.7833,0.56944,0.024526,1.3057,1.0509,0.56453,0.024526,...,1.4504,0.35164,5012.6,0.048398,0.043445,0.9516,0.14298,98.783,3.695,0
4,0.18829,0.41504,0.34231,1.9279,0.0,0.23358,1.4094,1.3393,0.58496,0.23881,...,1.7136,0.36891,13730.0,0.17648,0.32188,0.82635,0.073039,100.54,3.6303,0


In [21]:
# Separate the predictors from the target column
X = df3_c2.drop(columns='class')
y = df3_c2['class']

# Split the data into train and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Scale/normalize the predictors: fit on the training split only, apply to both
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Convert the targets to plain NumPy arrays
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

# Report the shape of every split
for shape_label, split_array in (
    ('X_train shape = ', X_train),
    ('y_train shape = ', y_train),
    ('X_test shape = ', X_test),
    ('y_test shape = ', y_test),
):
    print(shape_label, split_array.shape)

X_train shape =  (8356, 48)
y_train shape =  (8356,)
X_test shape =  (2089, 48)
y_test shape =  (2089,)


In [25]:
#Run

# Per-sample weights for the training labels, using scikit-learn's 'balanced' mode
weigths_train = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

# NOTE(review): xgbParams also sets scale_pos_weight, so the positive class
# appears to be up-weighted twice (there and through sample_weight) -- confirm
# that this is intended.
clf = XGBClassifier(**xgbParams)
clf.fit(X_train, y_train, sample_weight=weigths_train)

# Print a classification report for the training split, then the test split
for report_title, split_X, split_y in (
    ('Training Data:\n', X_train, y_train),
    ('Testing Data:\n', X_test, y_test),
):
    print(report_title, classification_report(split_y, clf.predict(split_X)))


Training Data:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97      7961
           1       0.50      1.00      0.67       395

    accuracy                           0.95      8356
   macro avg       0.75      0.98      0.82      8356
weighted avg       0.98      0.95      0.96      8356

Testing Data:
               precision    recall  f1-score   support

           0       0.97      0.90      0.94      1993
           1       0.20      0.51      0.29        96

    accuracy                           0.88      2089
   macro avg       0.59      0.71      0.61      2089
weighted avg       0.94      0.88      0.91      2089

