In [1]:
import pandas as pd
import pickle
pd.set_option('display.max_columns', 55)
pd.set_option('max_colwidth', 100)
pd.set_option('max_rows', 600)

In [2]:
import numpy as np

import itertools
from mlxtend.plotting import plot_decision_regions
from mlxtend.classifier import StackingClassifier # <-- note: this is not from sklearn!


from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, fbeta_score

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score, KFold
from sklearn.datasets import make_classification
import seaborn as sns
from sklearn import svm, datasets

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
df_data = pd.read_pickle('./data/df_steel_grades.pkl')

In [5]:
X = df_data.iloc[:, :-1].to_numpy()
y = df_data.iloc[:, -1:].to_numpy()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                        random_state=42)

In [7]:
def kfold_test(model, X_train, y_train, n_val=5, shuffle_val=True, random_val=42, average_val='micro', beta_val = 0.5):
    kf = KFold(n_splits=n_val, shuffle=shuffle_val, random_state=random_val)
    model_results = []
    for train_ind, val_ind in kf.split(X_train,y_train):

        X_train_k, y_train_k = X_train[train_ind], y_train[train_ind]
        X_val, y_val = X_train[val_ind], y_train[val_ind] 

        model.fit(X_train_k, y_train_k)
        model_results.append(fbeta_score(model.predict(X_val), y_val, average=average_val, beta=beta_val))
    return model_results
    

In [8]:
model = KNeighborsClassifier(
    n_neighbors=3,
    weights='distance',
    algorithm='auto',
    n_jobs=-1)

### Precision Micro
**Precision** values minimal false positives.  In mfg, rather over spec a product than risk production failure

**Micro** Imbalanced dataset with multiple classes.  Aggregate the contributions of all classes to compute the average metric.

In [9]:
res = kfold_test(model, X_train, y_train, n_val=5, shuffle_val=True, random_val=42)

In [10]:
res

[0.8658536585365854,
 0.8170731707317073,
 0.8780487804878049,
 0.8518518518518519,
 0.8641975308641975]

In [11]:
np.mean(res)

0.8554049984944294

### SMOTE - Synthetic Minority Oversampling TEchnique

In [12]:
from imblearn.over_sampling import SMOTE
X_smoted, y_smoted = SMOTE(random_state=42).fit_sample(X_train,y_train)

Using TensorFlow backend.


In [13]:
from collections import Counter

In [14]:
Counter(y_smoted)

Counter({'Austenitic': 193,
         'Duplex': 193,
         'Ferritic': 193,
         'Superaustenitic': 193,
         'Martensitic': 193})

In [15]:
res_smote = kfold_test(model, X_smoted, y_smoted, n_val=5, shuffle_val=True, random_val=42)

In [16]:
res_smote

[0.9792746113989637,
 0.9740932642487047,
 0.9326424870466321,
 0.9533678756476682,
 0.9585492227979274]

In [17]:
np.mean(res_smote)

0.9595854922279792

### GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [19]:
hyperparameters = {'n_neighbors' : range(1,50),
             'algorithm':['auto', 'ball_tree','kd_tree','brute'], 
             'weights':['uniform', 'distance']}

In [20]:
model_results = []
grid_srch = RandomizedSearchCV(model, hyperparameters, n_iter=100, cv=5, scoring='precision_micro', iid=False)
grid_srch.fit(X_smoted, y_smoted)
print(grid_srch.best_score_)
param = grid_srch.best_params_

0.9606477732793521


In [21]:
param

{'weights': 'distance', 'n_neighbors': 2, 'algorithm': 'brute'}

### KNN with Grid Search Results

In [22]:
model = KNeighborsClassifier(
    n_neighbors=param['n_neighbors'],
    weights=param['weights'],
    algorithm=param['algorithm'],
    n_jobs=-1)


In [23]:
gridsearch_res = kfold_test(model, X_smoted, y_smoted, n_val=5, shuffle_val=True, random_val=42)

In [24]:
gridsearch_res

[0.9740932642487047,
 0.9637305699481866,
 0.9481865284974094,
 0.9689119170984457,
 0.9637305699481866]

In [25]:
np.mean(gridsearch_res)

0.9637305699481866

### Adasyn

In [26]:
from imblearn.over_sampling import ADASYN
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(X,y)

In [27]:
adasyn_res = kfold_test(model, X_adasyn, y_adasyn, n_val=5, shuffle_val=True, random_val=42)

In [28]:
adasyn_res

[0.9653679653679653,
 0.9437229437229436,
 0.9565217391304348,
 0.9434782608695652,
 0.9173913043478261]

In [29]:
np.mean(adasyn_res)

0.945296442687747

In [31]:
model_results = []
grid_srch = RandomizedSearchCV(model, hyperparameters, n_iter=50, cv=5, scoring='precision_micro', iid=False)
grid_srch.fit(X_adasyn, y_adasyn)
print(grid_srch.best_score_)
param = grid_srch.best_params_

0.8809989383597114


In [32]:
param

{'weights': 'distance', 'n_neighbors': 3, 'algorithm': 'ball_tree'}

In [34]:
model = KNeighborsClassifier(
    n_neighbors=param['n_neighbors'],
    weights=param['weights'],
    algorithm=param['algorithm'],
    n_jobs=-1)


In [35]:
gridsearch_adasyn_res = kfold_test(model, X_adasyn, y_adasyn, n_val=5, shuffle_val=True, random_val=42)

In [36]:
gridsearch_adasyn_res

[0.9523809523809524,
 0.9393939393939394,
 0.9434782608695652,
 0.9391304347826086,
 0.9347826086956522]

In [37]:
np.mean(gridsearch_adasyn_res)

0.9418332392245435