In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import random
import joblib

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

In [2]:
df = pd.read_csv('../data/CoilData.csv')
coils = pd.read_csv('../data/output.csv')
coil_list = list(map(int,list(coils.columns)))
lst = []
for i in df['coil']:
    if i in coil_list:
        lst.append(1)
    else:
        lst.append(0)
df['contracted'] = lst
df['analyse_main'] = [i[0:4] for i in df['analyse']]
dummies_analyse_main = pd.get_dummies(df['analyse_main'], dtype=float)

### Bad coils exploration

In [3]:
bad_c = df[df['contracted']==1]
bad_c['analyse_main'] = [i[:-2] for i in bad_c['analyse']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bad_c['analyse_main'] = [i[:-2] for i in bad_c['analyse']]


In [4]:
len_main = len(bad_c.analyse_main.value_counts())
top_ninety_percentile = bad_c.analyse_main.value_counts(ascending=False).head(int(len_main/5)).sum()
the_rest = bad_c.analyse_main.value_counts(ascending=True).head(len_main - int(len_main/5)).sum()
ratio = 1 - (the_rest/top_ninety_percentile)

print(f'The top 20% of most recurring main categories count for {ratio * 100}% of the total number of contracted coils')

The top 20% of most recurring main categories count for 77.17055971793741% of the total number of contracted coils


In [5]:
# get list of 20% most recurring categories
top_bad_main_categories = bad_c.analyse_main.value_counts(ascending=False).head(int(len_main/5)).index.tolist()

# add column if coil in category
in_main_category = []
for coil in bad_c['analyse']:
    if coil[:-2] in top_bad_main_categories:
        in_main_category.append(1)
    else:
        in_main_category.append(0)

bad_c['is_in_main_top_main_categories'] = in_main_category
bad_coil_list = list(bad_c.coil[bad_c['is_in_main_top_main_categories'] == 1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bad_c['is_in_main_top_main_categories'] = in_main_category


In [6]:
# removing columns
coil_list = df.coil
df = df.drop(columns=['coil', 'analyse', 'analyse_main', 'furnace Number', 'Temperature before finishing mill', 
                      'Temperature after finishing mill'])
data = df.copy()
print(data.head())

   Hardness_1  Hardness_2   Width  Thickness Thickness profile    c    mn  \
0       10003         101  1302.1       4.36                31  355  2162   
1       10123         101  1282.3       4.37                35  551  1985   
2       10040         102  1297.4       4.43                25  457  1895   
3       10243         102  1295.2       4.44                28  697  2008   
4       10012         100  1293.3       3.95                26  477  1936   

    si  nb    p    s   al   ma  b   n  ti   cr  va  mo  contracted  
0   49   0  133  143  304  291  1  34   6  302   0  25           1  
1  101   0  118   90  395  384  1  33  12  189  25   7           0  
2   60   0  108  115  476  463  1  20  11  288   0  40           0  
3   69   0  139   98  306  296  1  21   9  253   0   9           0  
4   52   0  112  121  340  329  1  28   8  297   0  23           0  


In [7]:
# replace string values in Thickness profile column
data['Thickness profile'] = data['Thickness profile'].apply(lambda x: x.replace('*******', ''))
data = data.replace('', np.nan, regex=True).dropna().astype(float)
data = data[data['Thickness profile'] >= 0]

## Transform data

In [8]:
# Standardize data
# scaler = StandardScaler()
# selection_standardize = data.iloc[:,0:5]
# list_columns = selection_standardize.columns
# scaled_selection = pd.DataFrame(data=scaler.fit_transform(selection_standardize), columns=[list_columns])
# print(scaled_selection)

      Hardness_1 Hardness_2     Width Thickness Thickness profile
0      -0.864475  -0.593122  0.143819  0.748022          0.910067
1      -0.775493  -0.593122  0.075519  0.757278          1.273254
2      -0.837039  -0.533462  0.127606  0.812812          0.365286
3      -0.686510  -0.533462  0.120018  0.822067          0.637677
4      -0.857802  -0.652781  0.113463  0.368543          0.456083
...          ...        ...       ...       ...               ...
56506  -1.029093  -0.772100 -0.863782 -1.584390         -0.088698
56507  -1.015746  -0.772100 -0.861713 -1.584390         -0.179494
56508  -0.967547  -0.712441 -0.868612 -1.584390         -0.361088
56509  -0.837780  -0.652781 -0.854469 -1.584390          0.002099
56510  -0.984602  -0.772100 -0.868267 -1.584390         -0.451885

[56511 rows x 5 columns]


In [9]:
# Transform to log data and replace -inf values with min value != 0 
# in each column and get the log of that number divided by 1000
log_selection = data.iloc[:,0:19]
for column in list(log_selection.columns):
    min_value_per_column = min(i for i in log_selection.loc[:,column] if i > 0)
    log_selection.loc[:,column] = np.log(log_selection.loc[:,column]).replace(-np.inf, np.log(min_value_per_column/1000))

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [10]:
# Join dataframes and adding OneHotEncoding for Categorical values of 'Analyse' column
# data = scaled_selection.join(log_selection).join(dummies_analyse_main).join(df['contracted']).join(coil_list).dropna()
data = log_selection.join(dummies_analyse_main).join(df['contracted']).join(coil_list).dropna()

## Data selection and partitioning

In [11]:
# Making balanced datasets
in_bad_coil_list = []
for coil in data['coil']:
    if coil in bad_coil_list:
        in_bad_coil_list.append(1)
    else:
        in_bad_coil_list.append(0)
data['in_bad_coil_list'] = in_bad_coil_list
df_bad_coils = data[data['in_bad_coil_list']==1].reset_index().drop(columns=['coil',
                                                                             'in_bad_coil_list',
                                                                             'index'])
len_bad_coil_list = len(df_bad_coils)
print(len_bad_coil_list)
print(df_bad_coils.head().T)

df_good_coils = data[data.contracted == 0].sample(len_bad_coil_list).reset_index().drop(columns=['coil',
                                                                                                 'in_bad_coil_list',
                                                                                                 'index'])
df_good_coils.to_csv('df_good_coils.csv', index=True, header=True)
df_good_coils = pd.read_csv('df_good_coils.csv').drop(columns=['Unnamed: 0']).dropna()
print(df_good_coils.head().T)

2261
                          0         1          2         3         4
Hardness_1         9.210640  9.213635   9.200189  9.254453  9.222467
Hardness_2         4.615121  4.605170   4.595120  4.653960  4.615121
Width              7.171734  7.375882   7.344525  7.242726  7.170196
Thickness          1.472472  1.420696   1.337629  0.924259  0.841567
Thickness profile  3.433987  1.609438 -11.036691  2.197225  2.484907
...                     ...       ...        ...       ...       ...
TB53               0.000000  0.000000   0.000000  0.000000  0.000000
TB61               0.000000  0.000000   0.000000  0.000000  0.000000
TB63               0.000000  0.000000   0.000000  0.000000  0.000000
TB71               0.000000  0.000000   0.000000  0.000000  0.000000
contracted         1.000000  1.000000   1.000000  1.000000  1.000000

[242 rows x 5 columns]
                          0         1          2         3         4
Hardness_1         9.248310  9.246479   9.448727  9.212139  9.218309
Hardn

In [12]:
# concat even number of good and bad coils in df and reshuffle the dataframe so we randomize the the data when we
# split it. 
df_balanced_coils = pd.concat([df_good_coils, df_bad_coils]).sample(frac=1).reset_index(drop=True).dropna()
X = df_balanced_coils.iloc[:,:-2]
y = df_balanced_coils.contracted
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(len(df_bad_coils), len(df_good_coils))


print(np.any(np.isnan(df_balanced_coils)))

2261 2261
False


## Baseline modeling

In [13]:
lr = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(random_state=42)
svm = svm.SVC(random_state=42)

In [14]:
# Set grid search params
max_depth_range = np.arange(3,15,3)
max_iter = np.arange(100,1000,100)

grid_params_lr = [{'multi_class': ['auto', 'ovr', 'multinomial'],
                  'max_iter': [1000]}]

grid_params_dt = [{'criterion': ['gini', 'entropy'],
                  'max_depth': max_depth_range}]

grid_params_rf = [{'criterion': ['gini', 'entropy'],
        'max_depth': max_depth_range,
        'min_samples_split': max_depth_range}]

grid_params_svm = [{'kernel': ['linear', 'rbf'], 
        'C': max_depth_range}]

In [15]:
LR = GridSearchCV(lr,
            param_grid=grid_params_lr,
            scoring='accuracy',
            cv=3)

DT = GridSearchCV(dt,
                 param_grid=grid_params_dt,
                 scoring='accuracy',
                 cv=3)

RF = GridSearchCV(rf,
                 param_grid=grid_params_rf,
                 scoring='accuracy',
                 cv=3)

SVM = GridSearchCV(svm,
                  param_grid=grid_params_svm,
                  scoring='accuracy',
                  cv=3)

In [16]:
grids = [LR, DT, RF, SVM]

# Creating a dict for our reference
grid_dict = {0: 'Logistic Regression',
            1: 'Decision Tree Classifier',
            2: 'Random Forest Classifier',
            3: 'Support Vector Machine'}

In [17]:
# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    gs.fit(x_train, y_train)
    print('Best params are : %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(x_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

# Save best grid search pipeline to file
dump_file = 'best_grid_search_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

Performing model optimizations...

Estimator: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best params are : {'max_iter': 1000, 'multi_class': 'multinomial'}
Best training accuracy: 0.841
Test set accuracy score for best params: 0.855 

Estimator: Decision Tree Classifier
Best params are : {'criterion': 'gini', 'max_depth': 9}
Best training accuracy: 0.841
Test set accuracy score for best params: 0.860 

Estimator: Random Forest Classifier
Best params are : {'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 3}
Best training accuracy: 0.858
Test set accuracy score for best params: 0.873 

Estimator: Support Vector Machine
Best params are : {'C': 9, 'kernel': 'linear'}
Best training accuracy: 0.843
Test set accuracy score for best params: 0.848 

Classifier with best test set accuracy: Random Forest Classifier

Saved Random Forest Classifier grid search pipeline to file: best_grid_search_pipeline.pkl


In [18]:
# Now we can create k train-test splits using KFold
from sklearn.model_selection import KFold

# Using KFold instead of calling multiple times train_test_split to ensure that each
# sample goes into a single split only
kf = KFold(n_splits=5, random_state=45, shuffle=True)

split = 0
scores = []
for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    result = single_grid_search(X_train, y_train)
    
    decision_tree = result.best_estimator_
    score = decision_tree.score(X_test, y_test)
    scores.append(score)
    print("### Split {}: Accuracy is {:.2f}% ###".format(split := split + 1, score*100))
    
print("The mean generalization accuracy of the model is {:.2f}% (+/- {:.2f}%)".format(np.mean(scores) * 100, np.std(scores) * 100))

KeyError: "None of [Int64Index([   0,    1,    4,    6,    8,    9,   10,   11,   12,   13,\n            ...\n            4509, 4511, 4512, 4513, 4514, 4515, 4516, 4518, 4520, 4521],\n           dtype='int64', length=3617)] are in the [columns]"

## Visualization of GridSearchCV parameters