In [1]:
import pandas as pd
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Read the label table and the feature table from the shared data folder.
# 'Unnamed: 0' is discarded from both -- presumably a leftover index column
# written by an earlier to_csv; confirm against whatever produced the files.
target = pd.read_csv('../data/new_target.csv')
target = target.drop(columns=['Unnamed: 0'])

predictors = pd.read_csv('../data/new_predictors.csv')
predictors = predictors.drop(columns=['Unnamed: 0'])

In [3]:
# IDs of every truck that has at least one row with target == 1.
derates = (
    target.loc[target['target'].eq(1), 'EquipmentID']
    .drop_duplicates()
    .to_list()
)

# One row per truck, with a 0/1 flag marking membership in `derates`.
trucks = target[['EquipmentID']].drop_duplicates()
trucks['derate'] = np.where(trucks['EquipmentID'].isin(derates), 1, 0)

# The split is made over EquipmentID values, not over data rows. The ID column
# is passed as both X and y, so xTrain/yTrain (and xTest/yTest) come back as
# the same partition of IDs. Stratifying on the flag keeps the proportion of
# flagged trucks the same in both parts.
xTrain, xTest, yTrain, yTest = train_test_split(
    trucks['EquipmentID'],
    trucks['EquipmentID'],
    test_size=0.2,
    random_state=777,
    stratify=trucks['derate'],
)

In [4]:
# Expand the ID-level split into row-level tables. On entry xTrain / xTest /
# yTrain / yTest hold EquipmentID values; each name is overwritten below, so
# this cell cannot be re-run by itself without re-running the split first.
# NOTE(review): features and labels are filtered independently from two
# frames -- this assumes `predictors` and `target` list the same rows in the
# same order; confirm.

# Training rows: features lose the ID column, labels keep every column.
xTrain = (
    predictors
    .loc[predictors['EquipmentID'].isin(xTrain)]
    .drop('EquipmentID', axis='columns')
    .reset_index(drop=True)
)
yTrain = (
    target
    .loc[target['EquipmentID'].isin(yTrain)]
    .reset_index(drop=True)
)

# Held-out rows, built the same way from the test IDs.
xTest = (
    predictors
    .loc[predictors['EquipmentID'].isin(xTest)]
    .drop('EquipmentID', axis='columns')
    .reset_index(drop=True)
)
yTest = (
    target
    .loc[target['EquipmentID'].isin(yTest)]
    .reset_index(drop=True)
)

In [5]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

In [6]:
# Balance the training classes by randomly re-drawing minority-class rows.
# sampling_strategy=1 asks for a 1:1 minority-to-majority ratio after
# resampling. random_state is pinned (same seed as the train/test split) so
# that a fresh kernel run draws the same rows and downstream metrics are
# reproducible.
oversample = RandomOverSampler(sampling_strategy=1, random_state=777)

# The label passed in is boolean: True where the 'target' column equals 1.
xTrain_over, yTrain_over = oversample.fit_resample(
    xTrain,
    yTrain['target'] == 1,
)

In [7]:
# Model definition: standardise every feature, then a multi-layer perceptron
# with three hidden layers of 100 tanh units each.
#   alpha=1         -> L2 penalty strength passed to the network
#   max_iter=10000  -> upper bound on training iterations
#   random_state    -> pinned (same seed as the train/test split) so weight
#                      initialisation and batch shuffling repeat on every run;
#                      without it each fit gives a different model.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        (
            'nn',
            MLPClassifier(
                activation='tanh',
                hidden_layer_sizes=(100, 100, 100),
                alpha=1,
                max_iter=10000,
                random_state=777,
            ),
        ),
    ]
)

In [8]:
# Fit the scaler and the network on the oversampled training rows.
# The fitted pipeline is the cell's last expression, so it is also displayed.
pipeline.fit(X=xTrain_over, y=yTrain_over)

In [9]:
# Compare hard predictions on the held-out rows against the boolean label
# (True where 'target' equals 1). In scikit-learn's layout, rows are the
# actual classes and columns are the predicted classes.
conf_mat = confusion_matrix(
    y_true=yTest['target'] == 1,
    y_pred=pipeline.predict(xTest),
)
print("Confusion matrix:\n", conf_mat)

Confusion matrix:
 [[111242   1643]
 [    58     73]]


In [10]:
# Attach the model's probability output to each label table as 'prediction'.
# Column index 1 of predict_proba is kept -- with a boolean label this should
# be the probability of True (classes are ordered False, True); verify
# against pipeline.classes_ if in doubt.
# Training scores use xTrain (the rows before oversampling) so they line up
# one-to-one with yTrain.
yTrain['prediction'] = pipeline.predict_proba(xTrain)[:, 1]

# Same for the held-out rows.
yTest['prediction'] = pipeline.predict_proba(xTest)[:, 1]

In [15]:
# yTest.to_csv('../data/hg_yTest_full.csv', index = False)
# yTrain.to_csv('../data/hg_yTrain_full.csv', index = False)