In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import random
import joblib

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import svm

In [2]:
df = pd.read_csv('../data/CoilData.csv')
coils = pd.read_csv('../data/output.csv')
coil_list = list(map(int,list(coils.columns)))
lst = []
for i in df['coil']:
    if i in coil_list:
        lst.append(1)
    else:
        lst.append(0)
df['contracted'] = lst
df['analyse_main'] = [i[0:3] for i in df['analyse']]
dummies_analyse_main = pd.get_dummies(df['analyse_main'], dtype=float)
df = df.drop(columns=['coil', 'analyse', 'analyse_main'])
data = df
# data = df.join(dummies_analyse_main)

In [3]:
data['Thickness profile'] = data['Thickness profile'].apply(lambda x: x.replace('*******', ''))
data = data.replace('', np.nan, regex=True).dropna().astype(float)

## Data selection and partitioning

In [4]:
# Making balanced datasets
len_coil_list = len(coil_list)
df_good_coils = data[data.contracted == 0].sample(len_coil_list)
df_bad_coils = data[data.contracted == 1]

# concat even number of good and bad coils in df and reshuffle the dataframe so we randomize the the data when we
# split it. 
df_balanced_coils = pd.concat([df_good_coils, df_bad_coils]).sample(frac=1).reset_index(drop=True)
print(df_balanced_coils.head().T)

X = df_balanced_coils.iloc[:,:-1]
y = df_balanced_coils.contracted
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

                                            0         1         2         3  \
furnace Number                         1.0000      1.00      1.00      1.00   
Hardness_1                         11815.0000  10081.00  12052.00  12431.00   
Hardness_2                           123.0000    101.00    125.00    111.00   
Width                               1601.8000   1428.50    981.00   1755.20   
Temperature before finishing mill   1181.0000   1188.00   1184.00   1195.00   
Temperature after finishing mill     903.0000    865.00    908.00    942.00   
Thickness                              5.1600      1.55      4.16      3.58   
Thickness profile                     25.0092     25.00     20.00     30.00   
c                                    720.0000    500.00    715.00     27.00   
mn                                  5474.0000   1908.00   5490.00   5120.00   
si                                    75.0000     84.00     97.00     55.00   
nb                                   306.0000      0

## Baseline modeling

In [5]:
pipe_lr = Pipeline([('scaler', StandardScaler()),
                    ('clf', LogisticRegression(random_state=42))])

pipe_dt = Pipeline([('scaler', StandardScaler()),
                    ('decsT', DecisionTreeClassifier())])

pipe_rf = Pipeline([('scaler', StandardScaler()),
                    ('clf', RandomForestClassifier(random_state=42))])

pipe_svm = Pipeline([('scaler', StandardScaler()),
                     ('clf', svm.SVC(random_state=42))])

In [6]:
# Set grid search params
param_range = [9, 10]
param_range_fl = [1.0, 0.5]
max_depth_range = np.arange(3,15)
min_samples_leaf_range = ...

grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range_fl,
        'clf__solver': ['liblinear']}] 

grid_params_dt = [{'decsT__max_depth': ['gini', 'entropy'],
                  'decsT__max_depth': max_depth_range}]

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': param_range,
        'clf__min_samples_split': param_range}]

grid_params_svm = [{'clf__kernel': ['linear', 'rbf'], 
        'clf__C': param_range}]

In [7]:
LR = GridSearchCV(estimator=pipe_lr,
            param_grid=grid_params_lr,
            scoring='accuracy',
            cv=10)

DT = GridSearchCV(estimator=pipe_dt,
                 param_grid=grid_params_dt,
                 scoring='accuracy')

RF = GridSearchCV(estimator=pipe_rf,
                 param_grid=grid_params_rf,
                 scoring='accuracy',
                 cv=10)

SVM = GridSearchCV(estimator=pipe_svm,
                  param_grid=grid_params_svm,
                  scoring='accuracy',
                  cv=10)

In [8]:
grids = [LR, DT, RF, SVM]

# Creating a dict for our reference
grid_dict = {0: 'Logistic Regression',
            1: 'Decision Tree Classifier',
            2: 'Random Forest Classifier',
            3: 'Support Vector Machine'}

In [9]:
# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    gs.fit(x_train, y_train)
    print('Best params are : %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(x_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

# Save best grid search pipeline to file
dump_file = 'best_grid_search_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

Performing model optimizations...

Estimator: Logistic Regression
Best params are : {'clf__C': 1.0, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}
Best training accuracy: 0.778
Test set accuracy score for best params: 0.785 

Estimator: Decision Tree Classifier
Best params are : {'decsT__max_depth': 8}
Best training accuracy: 0.788
Test set accuracy score for best params: 0.791 

Estimator: Random Forest Classifier
Best params are : {'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__min_samples_split': 9}
Best training accuracy: 0.824
Test set accuracy score for best params: 0.820 

Estimator: Support Vector Machine
Best params are : {'clf__C': 9, 'clf__kernel': 'rbf'}
Best training accuracy: 0.813
Test set accuracy score for best params: 0.819 

Classifier with best test set accuracy: Random Forest Classifier

Saved Random Forest Classifier grid search pipeline to file: best_grid_search_pipeline.pkl
