In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import random
import joblib

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

In [2]:
df = pd.read_csv('../data/CoilData.csv')
coils = pd.read_csv('../data/output.csv')
coil_list = list(map(int,list(coils.columns)))
lst = []
for i in df['coil']:
    if i in coil_list:
        lst.append(1)
    else:
        lst.append(0)
df['contracted'] = lst
df['analyse_main'] = [i[0:4] for i in df['analyse']]
dummies_analyse_main = pd.get_dummies(df['analyse_main'], dtype=float)
df = df.drop(columns=['coil', 'analyse', 'analyse_main', 'furnace Number', 'Temperature before finishing mill', 'Temperature after finishing mill'])
data = df.copy()

In [3]:
# replace string values in Thickness profile column
data['Thickness profile'] = data['Thickness profile'].apply(lambda x: x.replace('*******', ''))
data = data.replace('', np.nan, regex=True).dropna().astype(float)
data = data[data['Thickness profile'] >= 0]

In [4]:
# Check negative values in Thickness profile
# neg_val_thick_profile = len(data['Thickness profile'][data['Thickness profile'] < 0])
# print(neg_val_thick_profile)

In [5]:
# Standardize data
scaler = StandardScaler()
selection_standardize = data.iloc[:,0:5]
list_columns = selection_standardize.columns
scaled_selection = pd.DataFrame(data=scaler.fit_transform(selection_standardize), columns=[list_columns])
print(scaled_selection.head())

  Hardness_1 Hardness_2     Width Thickness Thickness profile
0  -0.864475  -0.593122  0.143819  0.748022          0.910067
1  -0.775493  -0.593122  0.075519  0.757278          1.273254
2  -0.837039  -0.533462  0.127606  0.812812          0.365286
3  -0.686510  -0.533462  0.120018  0.822067          0.637677
4  -0.857802  -0.652781  0.113463  0.368543          0.456083


In [8]:
# Transform to log data and replace -inf values with min value != 0 
# in each column and get the log of that number divided by 1000
log_selection = data.iloc[:,5:19]
for column in list(log_selection.columns):
    min_value_per_column = min(i for i in log_selection.loc[:,column] if i > 0)
    log_selection.loc[:,column] = np.log(log_selection.loc[:,column]).replace(-np.inf, np.log(min_value_per_column/1000))

In [9]:
# Join dataframes and adding OneHotEncoding for Categorical values of 'Analyse' column
data = log_selection.join(scaled_selection).join(dummies_analyse_main).join(df['contracted']).dropna()
# data = log_selection.join(dummies_analyse_main).join(df['contracted']).dropna()
print(data.head())

            c        mn        si        nb         p         s        al  \
513  0.107368  0.779721  0.635883  0.176186  0.631191  0.364448  0.590557   
605  0.670636  0.821765  0.735643 -0.311839  0.542217  0.019588  0.568902   
606  0.664946  0.820326  0.732402 -1.118914  0.532236  0.145198  0.573764   
607  0.664946  0.820326  0.732402 -1.118914  0.532236  0.145198  0.573764   
608  0.650477  0.827498  0.721197 -0.133998  0.491695  0.244001  0.566924   

           ma         b         n  ...  TB31  TB32  TB41  TB43  TB51  TB53  \
513  0.586263 -0.539228  0.133470  ...   0.0   0.0   0.0   0.0   0.0   0.0   
605  0.564894 -2.363952  0.281215  ...   0.0   0.0   0.0   0.0   0.0   0.0   
606  0.570284 -1.118914  0.244001  ...   0.0   0.0   0.0   0.0   0.0   0.0   
607  0.570284 -1.118914  0.244001  ...   0.0   0.0   0.0   0.0   0.0   0.0   
608  0.563110 -2.363952  0.310462  ...   0.0   0.0   0.0   0.0   0.0   0.0   

     TB61  TB63  TB71  contracted  
513   0.0   0.0   0.0           

## Data selection and partitioning

In [10]:
# Making balanced datasets
len_coil_list = len(coil_list)
df_good_coils = data[data.contracted == 0].sample(len_coil_list)
df_bad_coils = data[data.contracted == 1]

# concat even number of good and bad coils in df and reshuffle the dataframe so we randomize the the data when we
# split it. 
df_balanced_coils = pd.concat([df_good_coils, df_bad_coils]).sample(frac=1).reset_index(drop=True)

X = df_balanced_coils.iloc[:,:-1]
y = df_balanced_coils.contracted
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Baseline modeling

In [11]:
lr = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(random_state=42)
svm = svm.SVC(random_state=42)

In [12]:
# Set grid search params
max_depth_range = np.arange(3,15,3)
max_iter = np.arange(100,1000,100)

grid_params_lr = [{'multi_class': ['auto', 'ovr', 'multinomial'],
                  'max_iter': [1000,2000,3000]}]

grid_params_dt = [{'criterion': ['gini', 'entropy'],
                  'max_depth': max_depth_range}]

grid_params_rf = [{'criterion': ['gini', 'entropy'],
        'max_depth': max_depth_range,
        'min_samples_split': max_depth_range}]

grid_params_svm = [{'kernel': ['linear', 'rbf'], 
        'C': max_depth_range}]

In [13]:
LR = GridSearchCV(lr,
            param_grid=grid_params_lr,
            scoring='accuracy',
            cv=3)

DT = GridSearchCV(dt,
                 param_grid=grid_params_dt,
                 scoring='accuracy',
                 cv=3)

RF = GridSearchCV(rf,
                 param_grid=grid_params_rf,
                 scoring='accuracy',
                 cv=3)

SVM = GridSearchCV(svm,
                  param_grid=grid_params_svm,
                  scoring='accuracy',
                  cv=3)

In [14]:
grids = [LR, DT, RF, SVM]

# Creating a dict for our reference
grid_dict = {0: 'Logistic Regression',
            1: 'Decision Tree Classifier',
            2: 'Random Forest Classifier',
            3: 'Support Vector Machine'}

In [None]:
# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    gs.fit(x_train, y_train)
    print('Best params are : %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(x_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

# Save best grid search pipeline to file
dump_file = 'best_grid_search_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

Performing model optimizations...

Estimator: Logistic Regression
Best params are : {'max_iter': 1000, 'multi_class': 'multinomial'}
Best training accuracy: 0.902
Test set accuracy score for best params: 0.907 

Estimator: Decision Tree Classifier
Best params are : {'criterion': 'gini', 'max_depth': 3}
Best training accuracy: 0.902
Test set accuracy score for best params: 0.910 

Estimator: Random Forest Classifier
Best params are : {'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 6}
Best training accuracy: 0.906
Test set accuracy score for best params: 0.912 

Estimator: Support Vector Machine


## Visualization of GridSearchCV parameters