In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
np.random.seed(0)

In [3]:
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [4]:
train

Unnamed: 0.1,Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,neg,76698,na,2130706438,280,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,1,neg,33058,na,0,na,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,2,neg,41040,na,228,100,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,3,neg,12,0,70,66,0,10,0,0,...,240,46,58,44,10,0,0,0,4,32
4,4,neg,60874,na,1368,458,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,59995,neg,153002,na,664,186,0,0,0,0,...,998500,566884,1290398,1218244,1019768,717762,898642,28588,0,0
59996,59996,neg,2286,na,2130706538,224,0,0,0,0,...,10578,6760,21126,68424,136,0,0,0,0,0
59997,59997,neg,112,0,2130706432,18,0,0,0,0,...,792,386,452,144,146,2622,0,0,0,0
59998,59998,neg,80292,na,2130706432,494,0,0,0,0,...,699352,222654,347378,225724,194440,165070,802280,388422,0,0


In [5]:
train.columns

Index(['Unnamed: 0', 'class', 'aa_000', 'ab_000', 'ac_000', 'ad_000', 'ae_000',
       'af_000', 'ag_000', 'ag_001',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=172)

## Dataset description

Positive class: APS-related failures from Scania's fleet

Negative class: Non-APS failures within Scania's fleet

Features: "attributes" for one of the trucks in Scania's fleet

Type 1 and Type 2 errors constitute unnecessary maintenance. We have proposed a plan to perform a classification to predict whether an instance would lead to unnecessary maintenance or cost-effective maintenance.

To get Type 1 and Type 2 errors, we'd need to first train a classification model on half of our training and validation sets. With our best-validated model, we will then classify on half of our test set and obtain misclassified samples. These samples will then be labeled as "cost-ineffective" samples, and the remainder labled as "cost-effective" samples (negative/positive). This test set will then be partitioned into train/validation/test sets for the secondary classification model.

Our initial training set will combine both "train" and "test" sets, and will be partitioned as follows:

50% training set for model 1

10% validation set for model 1

40% "test" set for model 1 -> will be re-labeled and turn into the following:

40% training set for model 2

20% validation set for model 2

40% test set for model 2

## Model 1

### Data engineering

In [6]:
data = pd.concat([train, test])

In [7]:
data = data.sample(frac=1)

In [8]:
data = data.drop(columns = ["Unnamed: 0"])

In [9]:
data

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
15690,neg,6,0,4,4,0,0,0,0,0,...,34,8,68,20,0,0,0,0,0,0
9296,neg,39262,na,102,98,0,0,0,0,0,...,396070,231104,581880,664542,11834,98,22,0,0,0
16942,neg,30196,na,852,654,0,0,0,0,0,...,284384,143496,259540,275952,232080,111920,68550,10,0,0
42206,neg,75264,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2331,neg,61506,na,0,na,0,0,0,0,0,...,492906,250046,540578,584200,374944,242698,257940,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,neg,242248,na,0,na,0,0,0,0,0,...,1956790,863294,2064142,2149416,1703842,868296,1131488,386,0,0
45891,neg,12982,na,2130706446,2124,0,0,0,0,0,...,134090,55342,146006,106300,100960,15752,4112,0,0,0
42613,neg,562564,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43567,neg,1618,0,2130706432,128,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
data = data.replace("na", np.nan)

In [11]:
y = data["class"].tolist()

In [12]:
y_list = []

for label in y:
    if label == "pos":
        y_list.append(1)
    elif label == "neg":
        y_list.append(0)

y = pd.to_numeric(y_list)

In [13]:
X = data.drop(columns = ["class"])

In [14]:
for column_name in X.columns:
    X[column_name] = pd.to_numeric(X[column_name])

In [15]:
X_train_1 = X[:38000]
X_val_1 = X[38000:45600]
X_test_1 = X[45600:]

y_train_1 = y[:38000]
y_val_1 = y[38000:45600]
y_test_1 = y[45600:]

### Model 1 (XGBoost)

In [16]:
model_1_dmatrix = xgb.DMatrix(data = X_train_1, label = y_train_1)

In [17]:
model_1 = xgb.XGBClassifier(n_classifiers = 1000, scale_pos_weight = 0.7, max_depth=20, verbosity=3, colsample_bytree = .5, random_state=0)

In [18]:
model_1.fit(X_train_1, y_train_1)

[11:21:47] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 168 extra nodes, 0 pruned nodes, max_depth=16
[11:21:48] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 152 extra nodes, 0 pruned nodes, max_depth=19
[11:21:48] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=11
[11:21:49] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 158 extra nodes, 0 pruned nodes, max_depth=14
[11:21:50] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=16
[11:21:51] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 0 pruned nodes, max_d

[11:22:24] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 0 pruned nodes, max_depth=10
[11:22:25] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=12
[11:22:26] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 0 pruned nodes, max_depth=11
[11:22:27] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 0 pruned nodes, max_depth=11
[11:22:27] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 0 pruned nodes, max_depth=11
[11:22:28] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 0 pruned nodes, max_d

[11:22:54] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=13


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=None, n_classifiers=1000,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=0.7, seed=None, silent=None,
              subsample=1, verbosity=3)

In [19]:
model_1_predictions = model_1.predict(X_val_1)

In [20]:
print(classification_report(y_val_1, model_1_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7461
           1       0.94      0.74      0.83       139

    accuracy                           0.99      7600
   macro avg       0.97      0.87      0.91      7600
weighted avg       0.99      0.99      0.99      7600



In [21]:
model_1_test_predictions = model_1.predict(X_test_1)

In [22]:
print(classification_report(y_test_1, model_1_test_predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     29823
           1       0.89      0.73      0.80       577

    accuracy                           0.99     30400
   macro avg       0.94      0.86      0.90     30400
weighted avg       0.99      0.99      0.99     30400



In [23]:
confusion_mtx = confusion_matrix(y_test_1, model_1_test_predictions).tolist()

print("Cost:", 10 * confusion_mtx[0][1] + 500 * confusion_mtx[1][0])

Cost: 78010


## Model 2

In [24]:
X_2 = X_test_1

In [25]:
y_2 = []

index = 0

for true_label in y_test_1:
    if true_label == model_1_test_predictions[index]:
        y_2.append(0)
    elif true_label != model_1_test_predictions[index]:
        y_2.append(1)
    else:
        print("error")
    index += 1
    
y_2 = pd.to_numeric(y_2)

In [26]:
len(X_2)

30400

In [27]:
X_2_train = X_2[:12160]
X_2_val = X_2[12160:18240]
X_2_test = X_2[18240:]

y_2_train = y_2[:12160]
y_2_val = y_2[12160:18240]
y_2_test = y_2[18240:]

### Model 2 (XGBoost)

In [28]:
model_2_dmatrix = xgb.DMatrix(data = X_2_train, label = y_2_train)

In [29]:
model_2 = xgb.XGBClassifier(n_classifiers = 1000, scale_pos_weight = 0.7, max_depth=20, verbosity=3, colsample_bytree = .5, random_state=0)

In [30]:
model_2.fit(X_2_train, y_2_train)

[11:22:55] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=8
[11:22:56] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=10
[11:22:56] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=9
[11:22:56] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=8
[11:22:56] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=7
[11:22:56] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=10
[1

[11:23:03] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=6
[11:23:03] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=8
[11:23:03] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=8
[11:23:03] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=9
[11:23:03] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=7
[11:23:04] INFO: C:\Jenkins\workspace\xgboost-win64_release_0.90\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=9
[11:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=None, n_classifiers=1000,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=0.7, seed=None, silent=None,
              subsample=1, verbosity=3)

In [31]:
model_2_predictions = model_2.predict(X_2_val)

In [32]:
print(classification_report(y_2_val, model_2_predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6041
           1       0.71      0.13      0.22        39

    accuracy                           0.99      6080
   macro avg       0.85      0.56      0.61      6080
weighted avg       0.99      0.99      0.99      6080



In [33]:
model_2_test_predictions = model_2.predict(X_2_test)

In [34]:
print(classification_report(y_2_test, model_2_test_predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     12084
           1       0.50      0.11      0.17        76

    accuracy                           0.99     12160
   macro avg       0.75      0.55      0.59     12160
weighted avg       0.99      0.99      0.99     12160

