In [1]:
# https://nbviewer.jupyter.org/github/alexander-de-leeuw/innoplexus-online-hackathon/blob/master/Innoplexus%20-%20online%20hackathon.ipynb?flush_cache=true

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import random
import joblib

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import svm

In [2]:
df = pd.read_csv('../data/CoilData.csv')
coils = pd.read_csv('../data/output.csv')
coil_list = list(map(int,list(coils.columns)))
lst = []
for i in df['coil']:
    if i in coil_list:
        lst.append(1)
    else:
        lst.append(0)
df['contracted'] = lst
df['analyse_main'] = [i[0:3] for i in df['analyse']]
dummies_analyse_main = pd.get_dummies(df['analyse_main'], dtype=float)
df = df.drop(columns=['coil', 'analyse', 'analyse_main'])
data = df
# data = df.join(dummies_analyse_main)

In [3]:
data['Thickness profile'] = data['Thickness profile'].apply(lambda x: x.replace('*******', ''))
data = data.replace('', np.nan, regex=True).dropna().astype(float)

## Data selection and partitioning

In [4]:
# Making balanced datasets
len_coil_list = len(coil_list)
df_good_coils = data[data.contracted == 0].sample(len_coil_list)
df_bad_coils = data[data.contracted == 1]

# concat even number of good and bad coils in df and reshuffle the dataframe so we randomize the the data when we
# split it. 
df_balanced_coils = pd.concat([df_good_coils, df_bad_coils]).sample(frac=1).reset_index(drop=True)
print(df_balanced_coils.head().T)

X = df_balanced_coils.iloc[:,:-1]
y = df_balanced_coils.contracted
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

                                         0         1         2        3  \
furnace Number                        1.00      1.00      4.00      1.0   
Hardness_1                         9771.00  12833.00  12406.00  11963.0   
Hardness_2                           98.00    134.00    129.00    106.0   
Width                               943.50    807.40   1497.90    913.2   
Temperature before finishing mill  1149.00   1180.00   1154.00   1154.0   
Temperature after finishing mill    936.00    863.00    906.00    952.0   
Thickness                             3.78      3.02      3.82      2.8   
Thickness profile                    17.00     28.00     40.00     28.0   
c                                   356.00    665.00    676.00     18.0   
mn                                 2034.00  14040.00   8772.00   7009.0   
si                                  125.00   1328.00    133.00    699.0   
nb                                    0.00    249.00    431.00     32.0   
p                        

## Baseline modeling

In [5]:
pipe_lr = Pipeline([('scaler', StandardScaler()),
                    ('clf', LogisticRegression(random_state=42))])

pipe_dt = Pipeline([('scaler', StandardScaler()),
                    ('decsT', DecisionTreeClassifier())])

pipe_rf = Pipeline([('scaler', StandardScaler()),
                    ('clf', RandomForestClassifier(random_state=42))])

pipe_svm = Pipeline([('scaler', StandardScaler()),
                     ('clf', svm.SVC(random_state=42))])

In [6]:
# Set grid search params
param_range = [9, 10]
param_range_fl = [1.0, 0.5]
max_depth_range = np.arange(3,15)
min_samples_leaf_range = ...

grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range_fl,
        'clf__solver': ['liblinear']}] 

grid_params_dt = [{'decsT__max_depth': ['gini', 'entropy'],
                  'decsT__max_depth': max_depth_range}]

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': param_range,
        'clf__min_samples_split': param_range}]

grid_params_svm = [{'clf__kernel': ['linear', 'rbf'], 
        'clf__C': param_range}]

In [7]:
LR = GridSearchCV(estimator=pipe_lr,
            param_grid=grid_params_lr,
            scoring='accuracy',
            cv=10)

DT = GridSearchCV(estimator=pipe_dt,
                 param_grid=grid_params_dt,
                 scoring='accuracy')

RF = GridSearchCV(estimator=pipe_rf,
                 param_grid=grid_params_rf,
                 scoring='accuracy',
                 cv=10)

SVM = GridSearchCV(estimator=pipe_svm,
                  param_grid=grid_params_svm,
                  scoring='accuracy',
                  cv=10)

In [8]:
grids = [LR, DT, RF, SVM]

# Creating a dict for our reference
grid_dict = {0: 'Logistic Regression',
            1: 'Decision Tree Classifier',
            2: 'Random Forest Classifier',
            3: 'Support Vector Machine'}

In [None]:
# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    gs.fit(x_train, y_train)
    print('Best params are : %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(x_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

# Save best grid search pipeline to file
dump_file = 'best_grid_search_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

Performing model optimizations...

Estimator: Logistic Regression
Best params are : {'clf__C': 0.5, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best training accuracy: 0.784
Test set accuracy score for best params: 0.763 

Estimator: Decision Tree Classifier
Best params are : {'decsT__max_depth': 9}
Best training accuracy: 0.795
Test set accuracy score for best params: 0.783 

Estimator: Random Forest Classifier
Best params are : {'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__min_samples_split': 10}
Best training accuracy: 0.830
Test set accuracy score for best params: 0.806 

Estimator: Support Vector Machine


In [None]:
# # # Train model
# pipe = Pipeline(['scaler', StandardScaler(),
#        'clf', LogisticRegression()])

# pipe.fit(x_train, y_train)
# print(pipe.score(x_test, y_test))

# # Make predictions
# y_train_pred_proba = pipe.predict_proba(x_train)[:,1]
# y_test_pred_proba = pipe.predict_proba(x_test)[:,1]

In [None]:
# # Observe general model performance
# print('train roc auc: {:.2f}'.format(roc_auc_score(y_train, y_train_pred_proba)))
# print('test roc auc:   {:.2f}'.format(roc_auc_score(y_test, y_test_pred_proba)))

In [None]:
# # Observe general model performance
# y_train_pred = y_train_pred_proba > 0.5
# y_test_pred = y_test_pred_proba > 0.5

# print('train f1:      {:.2f}'.format(f1_score(y_train, y_train_pred)))
# print('test f1:        {:.2f}'.format(f1_score(y_test, y_test_pred)))