# Binary Classification of Machine Failures

optimizing ROC AUC instead of PR AUC

## Index <a id="index"></a>

- [Libraries](#libraries)
- [Import Data](#import_data)
- [Data Cleaning](#data_cleaning)
- [Exploratory Data Analysis](#eda)
- [Preprocessing](#preprocessing)
- [PCA Analysis](#pca)
- [Model](#model)
- [Save Predictions](#savepreds)

## Libraries <a id="libraries"></a>

[Return to Index](#index)

In [2]:
import os

In [3]:
if 'source' not in os.listdir():
    os.chdir('..')

In [4]:
import pandas as pd
import numpy as np

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import ClassifierChain

In [6]:
import lightgbm as lgb

## Import Data <a id="import_data"></a>

[Return to Index](#index)

In [7]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [8]:
train_data.iloc[:, :20]

Unnamed: 0,id,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,EState_VSA2,ExactMolWt,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,HallKierAlpha,HeavyAtomMolWt,Kappa3,MaxAbsEStateIndex,MinEStateIndex
0,0,323.390782,9.879918,5.875576,5.875576,4.304757,4.304757,2.754513,1.749203,0.000000,11.938294,222.068080,1.181818,1.727273,2.363636,-0.24,212.163,8.170000,11.922504,0.171585
1,1,273.723798,7.259037,4.441467,5.834958,3.285046,4.485235,2.201375,1.289775,45.135471,0.000000,260.029719,1.346154,2.076923,2.769231,-0.09,247.031,3.201491,10.932338,-4.830450
2,2,521.643822,10.911303,8.527859,11.050864,6.665291,9.519706,5.824822,1.770579,15.645394,6.606882,382.131027,1.085714,1.742857,2.400000,-0.78,354.106,15.033890,11.238048,-5.066255
3,3,567.431166,12.453343,7.089119,12.833709,6.478023,10.978151,7.914542,3.067181,95.639554,0.000000,530.070277,1.162791,1.573770,2.270270,-1.30,506.124,6.724301,11.171170,-5.276575
4,4,112.770735,4.414719,2.866236,2.866236,1.875634,1.875634,1.036450,0.727664,17.980451,12.841643,118.062994,1.444444,2.111111,2.555556,-1.10,108.056,3.931272,9.855741,-1.676296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14833,14833,632.207041,10.911303,6.579933,9.179964,4.653583,6.030052,3.670528,1.770579,32.971529,6.606882,347.063084,1.562500,2.187500,2.687500,-1.87,333.112,4.879408,11.426427,-5.077909
14834,14834,62.568425,2.642734,1.446898,1.446898,0.879497,0.879497,0.174620,0.000000,0.000000,0.000000,74.024203,2.666667,2.666667,2.666667,-0.53,70.027,0.729375,5.506944,0.250000
14835,14835,981.327476,10.363081,6.146219,6.146219,4.700576,4.700576,3.064846,2.133897,17.248535,0.000000,297.089560,1.461538,2.153846,2.769231,-3.15,282.220,2.045502,12.118907,-0.446105
14836,14836,299.171248,9.949161,6.589761,7.848913,5.276568,5.476436,3.978973,2.299833,45.623794,0.000000,265.959270,0.950000,1.400000,1.950000,-0.44,257.119,8.424399,10.909292,-4.181527


In [9]:
train_data.iloc[:, 20:]

Unnamed: 0,NumHeteroatoms,PEOE_VSA10,PEOE_VSA14,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,SMR_VSA10,SMR_VSA5,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2,EC1,EC2,EC3,EC4,EC5,EC6
0,4,0.000000,91.536492,0.000000,0.000000,0.000000,17.744066,0.000000,4.794537,35.527357,0,0,1,1,0,0,0,0
1,10,24.415866,7.822697,0.000000,0.000000,0.000000,7.822697,30.705892,13.825658,44.707310,0,0,0,1,1,0,0,0
2,9,0.000000,15.645394,0.000000,53.378235,0.000000,15.645394,73.143616,17.964475,45.660120,0,0,1,1,0,0,1,0
3,19,42.727765,21.335138,0.000000,0.000000,6.420822,15.645394,62.107304,31.961948,87.509997,0,0,1,1,0,0,0,0
4,4,6.041841,11.938611,6.923737,19.386400,0.000000,11.938611,18.883484,9.589074,33.333333,2,2,1,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14833,16,17.248535,15.645394,0.000000,13.847474,6.923737,34.407699,32.607024,18.947452,61.376610,0,0,1,1,0,0,0,0
14834,0,0.000000,0.000000,0.000000,0.000000,6.066367,0.000000,6.420822,0.000000,10.000000,0,0,0,1,0,1,0,0
14835,8,34.480943,0.000000,0.000000,0.000000,23.762553,10.969244,0.000000,0.000000,66.666667,0,0,1,1,0,0,0,0
14836,7,0.000000,7.822697,19.420579,0.000000,0.000000,7.822697,108.961047,9.088795,45.583333,0,0,0,1,1,0,0,0


In [10]:
np.unique(train_data['NumHeteroatoms'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 39, 42], dtype=int64)

In [11]:
np.unique(train_data['fr_COO2'])

array([0, 1, 2, 3, 4, 5, 6, 8], dtype=int64)

In [12]:
np.unique(train_data['fr_COO2'])

array([0, 1, 2, 3, 4, 5, 6, 8], dtype=int64)

## Data Cleaning <a id="data_cleaning"></a>

[Return to Index](#index)

## Exploratory Data Analysis <a id="eda"></a>

[Return to Index](#index)

Define numerical and categorical columns

## Preprocessing <a id="preprocessing"></a>

[Return to Index](#index)

Selection of final columns

In [13]:
train_data.columns

Index(['id', 'BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v',
       'Chi4n', 'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
       'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
       'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
       'fr_COO', 'fr_COO2', 'EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6'],
      dtype='object')

In [14]:
target_vars = ['EC1', 'EC2']
exclude_features = ['id', 'EC3', 'EC4', 'EC5', 'EC6', 'y']
categorical_features = ['fr_COO', 'fr_COO2']
numerical_features = [col for col in train_data.columns if col not in target_vars + exclude_features + categorical_features]

In [15]:
train_data['y'] = train_data.apply(
    lambda x:
    '-'.join([str(int(x[target_var])) for target_var in target_vars]),
    axis = 1
)

### Train-Test Split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    train_data[numerical_features + categorical_features],
    train_data['y'],
    random_state = 23,
    stratify = train_data['y']
)

## Model <a id="model"></a>

[Return to Index](#index)

CV Split

In [17]:
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 23)

### LightGBM <a id="lightgbm"></a>

In [18]:
lgbm_grid = [
    {
        'lgbm__max_depth': [-1],
        'lgbm__num_leaves': [50, 75, 100],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.1],
        'lgbm__n_estimators': [50, 70, 100]
    },
    {
        'lgbm__max_depth': [3],
        'lgbm__num_leaves': [10, 20, 30],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.1],
        'lgbm__n_estimators': [50, 70, 100]
    },
    {
        'lgbm__max_depth': [5],
        'lgbm__num_leaves': [20, 30, 40],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.1],
        'lgbm__n_estimators': [50, 70, 100]
    },
    {
        'lgbm__max_depth': [7],
        'lgbm__num_leaves': [30, 40, 50],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.1],
        'lgbm__n_estimators': [50, 70, 100]
    },
    {
        'lgbm__max_depth': [-1],
        'lgbm__num_leaves': [50, 75, 100],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.05],
        'lgbm__n_estimators': [70, 100, 120]
    },
    {
        'lgbm__max_depth': [3],
        'lgbm__num_leaves': [10, 20, 30],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.05],
        'lgbm__n_estimators': [70, 100, 120]
    },
    {
        'lgbm__max_depth': [5],
        'lgbm__num_leaves': [20, 30, 40],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.05],
        'lgbm__n_estimators': [70, 100, 120]
    },
    {
        'lgbm__max_depth': [7],
        'lgbm__num_leaves': [30, 40, 50],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.05],
        'lgbm__n_estimators': [70, 100, 120]
    },
    {
        'lgbm__max_depth': [-1],
        'lgbm__num_leaves': [50, 75, 100],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.1],
        'lgbm__n_estimators': [50, 70, 100]
    },
    {
        'lgbm__max_depth': [3],
        'lgbm__num_leaves': [10, 20, 30],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.01],
        'lgbm__n_estimators': [100, 120, 150]
    },
    {
        'lgbm__max_depth': [5],
        'lgbm__num_leaves': [20, 30, 40],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.01],
        'lgbm__n_estimators': [100, 120, 150]
    },
    {
        'lgbm__max_depth': [7],
        'lgbm__num_leaves': [30, 40, 50],
        'lgbm__subsample': [0.5, 0.7, 1],
        'lgbm__colsample_bytree': [0.6, 0.8, 1],
        'lgbm__max_bin': [150, 200, 255],
        'lgbm__learning_rate': [0.01],
        'lgbm__n_estimators': [100, 120, 150]
    },
]

In [19]:
lgbm_estimator = Pipeline(
    [
        (
            'column_transformer',
            ColumnTransformer(
                [
                    (
                        'numeric',
                        Pipeline(
                            [
                                ('impute', SimpleImputer(strategy = 'mean')),
                                ('scale', StandardScaler())
                            ]
                        ),
                        numerical_features
                    ),
                    (
                        'categoric',
                        Pipeline(
                            [
                                ('impute', SimpleImputer(strategy = 'most_frequent')),
                                (
                                    'encode', OrdinalEncoder(
                                        handle_unknown = 'use_encoded_value',
                                        unknown_value = -1
                                    )
                                )
                            ]
                        ),
                        categorical_features
                    )
                ]
            )
        ),
        (
            'lgbm',
            lgb.LGBMClassifier(random_state = 23, is_unbalance = True)
        )
    ]
)

In [24]:
lgbm = RandomizedSearchCV(
    lgbm_estimator,
    n_iter = 50,
    param_distributions = lgbm_grid,
    scoring = 'roc_auc_ovr',
    cv = cv,
    random_state = 23,
    refit = True,
    n_jobs = -1
)

In [25]:
lgbm.fit(X_train, y_train)

In [26]:
lgbm.best_params_

{'lgbm__subsample': 0.7,
 'lgbm__num_leaves': 20,
 'lgbm__n_estimators': 70,
 'lgbm__max_depth': 3,
 'lgbm__max_bin': 255,
 'lgbm__learning_rate': 0.05,
 'lgbm__colsample_bytree': 0.6}

### Final Model <a id="finalmodel"></a>

[Back to Model](#model)

### Evaluate Model <a id="evaluate"></a>

[Back to Model](#model)

### Save Predictions <a id="savepreds"></a>

[Back to Model](#model)

In [33]:
final_model = lgbm.best_estimator_

In [34]:
test_data

Unnamed: 0,id,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,...,PEOE_VSA14,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,SMR_VSA10,SMR_VSA5,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2
0,14838,344.632371,7.283603,4.473966,5.834958,3.412257,4.651530,2.096558,1.116433,49.458581,...,13.512441,0.000000,0.000000,0.000000,26.809272,24.539800,4.794537,47.304082,1,1
1,14839,1432.410201,10.663869,7.079026,8.065215,5.297097,5.297097,3.924155,2.569694,0.000000,...,0.000000,34.947374,98.323987,9.606882,0.000000,53.378235,0.000000,43.166667,0,0
2,14840,83.352608,3.931852,1.774215,1.774215,1.073446,1.073446,0.467830,0.170838,5.969305,...,5.969305,0.000000,0.000000,6.420822,11.752550,13.344559,9.589074,24.666667,1,1
3,14841,150.255712,5.912790,3.548812,3.548812,2.595128,2.595128,1.642813,0.694113,0.000000,...,59.935299,0.000000,0.000000,0.000000,17.744066,32.290168,4.794537,26.778866,0,0
4,14842,1817.276351,24.910940,15.540529,20.047314,12.535886,17.730988,11.979618,4.431173,84.554972,...,23.468091,25.609359,0.000000,37.099000,69.141353,38.704130,50.697492,102.583333,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9888,24726,246.422865,4.036581,2.816709,2.816709,1.875634,1.875634,1.235986,0.362743,24.146543,...,11.938611,0.000000,0.000000,0.000000,11.938611,12.207933,9.589074,30.000000,2,2
9889,24727,591.069706,8.770857,5.682461,5.682461,4.050440,4.050440,2.167855,1.770579,0.000000,...,0.000000,18.199101,37.107112,17.696186,10.969244,0.000000,0.000000,53.166667,0,0
9890,24728,378.113435,6.310349,3.402334,4.317724,2.817428,4.071978,1.970236,1.165747,36.705949,...,7.822697,0.000000,0.000000,0.000000,7.822697,24.099010,4.736863,50.652870,0,0
9891,24729,737.653518,9.949161,7.337949,7.337949,4.428511,5.948361,3.972459,2.160881,36.992053,...,0.000000,0.000000,0.000000,6.196844,0.000000,12.462662,9.589074,50.250000,0,0


In [35]:
X = test_data[numerical_features + categorical_features]

In [36]:
y = model.predict_proba(X)

In [37]:
prediction = pd.DataFrame(
    {
        'id': test_data['id'],
        'EC1': y[:, 2] + y[:, 3],
        'EC2': y[:, 1] + y[:, 3]
    }
)

In [38]:
prediction.to_csv('predictions/prediction.csv', index = False)