In [1]:
import pandas as pd
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Read the two input tables from CSV: `predictors` supplies the model features,
# `target` supplies EquipmentID plus the `target` label column used below.
predictors = pd.read_csv('../data/predictors.csv')
target = pd.read_csv('../data/target.csv')

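**Creating the train/test split at the truck (`EquipmentID`) level rather than on individual rows, so that all rows from a given truck end up on the same side of the split. The split is stratified on whether the truck ever had a derate.**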

In [4]:
# IDs of trucks that have at least one row with target == 1.
# A single .loc[rows, column] lookup is used instead of chained indexing.
derates = (
    target.loc[target['target'] == 1, 'EquipmentID']
    .drop_duplicates()
    .to_list()
)

# One row per truck with a 0/1 `derate` flag. Building the flag with .assign
# returns a new frame instead of writing a column onto a frame derived from a
# slice of `target`, which can raise SettingWithCopyWarning.
trucks = (
    target[['EquipmentID']]
    .drop_duplicates()
    .assign(derate=lambda ids: np.where(ids['EquipmentID'].isin(derates), 1, 0))
)

# Split the truck IDs 80/20, stratified on the derate flag. The same ID series
# is passed as both inputs, so xTrain/yTrain hold the same training IDs and
# xTest/yTest hold the same test IDs.
xTrain, xTest, yTrain, yTest = train_test_split(
    trucks['EquipmentID'],
    trucks['EquipmentID'],
    test_size=.2,
    random_state=777,
    stratify=trucks['derate'],
)

In [5]:
# NOTE(review): xTrain/xTest/yTrain/yTest arrive here as collections of
# EquipmentIDs and are rebound below to row-level frames, so re-running this
# cell on its own will not reproduce the result; re-run the split cell first.
# NOTE(review): features and labels are selected independently from
# `predictors` and `target`; this presumably relies on both tables holding the
# same rows in the same order. Confirm.
def _rows_for_ids(frame, ids):
    """Return the rows of `frame` whose EquipmentID is in `ids`, re-indexed from 0."""
    keep = frame['EquipmentID'].isin(ids)
    return frame.loc[keep].reset_index(drop=True)


# Columns removed from the feature frames before modelling.
non_feature_cols = ['EquipmentID', 'EventTimeStamp']

xTrain = _rows_for_ids(predictors, xTrain).drop(columns=non_feature_cols)
xTest = _rows_for_ids(predictors, xTest).drop(columns=non_feature_cols)
yTrain = _rows_for_ids(target, yTrain)
yTest = _rows_for_ids(target, yTest)

In [7]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

**Oversampling the minority class in the training data to help with the class imbalance**

In [8]:
# Randomly duplicate minority-class training rows until the minority:majority
# ratio reaches 0.5. random_state is fixed (same seed as the train/test split)
# so the resampled training set is identical on every run; without it each
# execution draws a different sample.
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=777)

xTrain_over, yTrain_over = oversample.fit_resample(
    xTrain,
    yTrain['target'] == 1,  # boolean label: True where target equals 1
)

**Training a neural network (multi-layer perceptron) classifier on the oversampled training data**

In [12]:
# Three hidden layers of 100 tanh units each, with regularisation strength
# alpha=1 and up to 10000 training iterations. random_state is fixed (same seed
# as the train/test split) so retraining reproduces the same model; without it
# the fitted weights differ from run to run.
# NOTE(review): no feature scaling is applied in the visible cells. Confirm the
# predictors are already standardised, since MLPs are sensitive to feature scale.
nn = MLPClassifier(
    activation='tanh',
    hidden_layer_sizes=(100, 100, 100),
    alpha=1,
    max_iter=10000,
    random_state=777,
)

nn.fit(xTrain_over, yTrain_over)

In [13]:
# Score every test row and store the result next to its label. Column 1 of
# predict_proba is the probability of the second class in nn.classes_
# (presumably the True label, given the boolean training target).
test_proba = nn.predict_proba(xTest)
yTest['prediction'] = test_proba[:, 1]

In [14]:
# Score the original (non-oversampled) training rows the same way, keeping the
# probability from column 1 of predict_proba alongside each label.
train_proba = nn.predict_proba(xTrain)
yTrain['prediction'] = train_proba[:, 1]

In [15]:
# Optional export (currently disabled): writes the training rows, including
# the `prediction` column, to CSV. Uncomment to regenerate the file.
#yTrain.to_csv('../data/yTrain.csv', index = False)

In [16]:
# Optional export (currently disabled): writes the test rows, including the
# `prediction` column, to CSV. Uncomment to regenerate the file.
#yTest.to_csv('../data/yTest.csv', index = False)

In [17]:
# Compare the model's hard class predictions on the test rows with the actual
# boolean label (True where target equals 1). Rows of the matrix are actual
# classes and columns are predicted classes.
actual_derate = yTest['target'] == 1
predicted_derate = nn.predict(xTest)
conf_mat = confusion_matrix(actual_derate, predicted_derate)
print("Confusion matrix:\n", conf_mat)

Confusion matrix:
 [[116943   1822]
 [    37     77]]
