In [None]:
import numpy as np
import pandas as pd

from sklearn.utils import shuffle

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from catboost import CatBoostClassifier

from sklearn.metrics import classification_report

# Loading and preparing dataframes for model fitting

In [None]:
# Load the pre-split train/test tables and shuffle their rows.
# A fixed random_state keeps the row order identical on every
# "Restart & Run All", so the cross-validation folds built from this order
# (and therefore the selected hyper-parameters) are reproducible.
train_df = pd.read_csv('../../data/ResultTrain.csv')
train_df = shuffle(train_df, random_state=42)

test_df = pd.read_csv('../../data/ResultTest.csv')
test_df = shuffle(test_df, random_state=42)

# Features: every column except the target and the photo identifier.
X_train = train_df.drop(['Concentration', 'Photo Name'], axis=1)
# Target is selected by position (second column) and cast to int.
# NOTE(review): presumably this is the 'Concentration' column - confirm the CSV layout.
y_train = train_df[train_df.columns[1]].astype('int')

X_test = test_df.drop(['Concentration', 'Photo Name'], axis=1)
y_test = test_df[test_df.columns[1]].astype('int')

# Training Classification Machine Learning Models

* ### Scikit-Learn

* * RandomForestClassifier

In [None]:
# Exhaustive grid search over RandomForest hyper-parameters, scored by
# accuracy with 3-fold cross-validation.
# random_state pins the forest's internal randomness so repeated runs of the
# search score every candidate identically.
RandomForestClassifier_classifier = RandomForestClassifier(random_state=42)

# 2 * 10 * 3 * 3 * 6 = 1080 candidates, i.e. 3240 fits with cv=3.
param_grid_RandomForest_classifier = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200],
}

RandomForestClassifier_model = GridSearchCV(estimator=RandomForestClassifier_classifier,
                                            param_grid=param_grid_RandomForest_classifier,
                                            cv=3,
                                            scoring="accuracy",
                                            n_jobs=-1,  # use all available cores
                                            verbose=2)

RandomForestClassifier_model.fit(X_train, y_train)

In [None]:
# Keep the best hyper-parameter combination found by the grid search and
# show it as the cell output.
RandomForestClassifier_great_params = RandomForestClassifier_model.best_params_
RandomForestClassifier_great_params

In [None]:
# Predict the test set with the fitted grid search and print the
# per-class classification report.
pred_test_RandomForestClassifier = RandomForestClassifier_model.predict(X_test)
print(
    classification_report(
        y_true=y_test.values,
        y_pred=pred_test_RandomForestClassifier,
    )
)

In [None]:
# Row-normalised confusion matrix for the RandomForest predictions.
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the "Actual" / "Predicted" axis
# labels below claim. (The arguments were previously swapped, transposing the plot.)
cm_RandomForestClassifier = metrics.confusion_matrix(y_test, pred_test_RandomForestClassifier)
# Divide each row by its total so a row shows how one actual class is
# distributed over the predicted classes.
cm_normalizes_RandomForestClassifier = np.round(
    cm_RandomForestClassifier / cm_RandomForestClassifier.sum(axis=1, keepdims=True), 2)

# Tick labels for the seven classes, applied to both axes.
concentration_tick_labels = ["0%", "5%", "12.5%", "25%", "50%", "75%", "96%"]

cm_plot_RandomForestClassifier = sns.heatmap(cm_normalizes_RandomForestClassifier, cmap="OrRd_r", annot=True)
cm_plot_RandomForestClassifier.yaxis.set_ticklabels(concentration_tick_labels)
cm_plot_RandomForestClassifier.xaxis.set_ticklabels(concentration_tick_labels)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("RandomForestClassifier Confusion Matrix")
plt.savefig('../../data/graphs/RandomForestClassifier_cm')
plt.show()

* * GradientBoostingClassifier

In [None]:
# Exhaustive grid search over GradientBoosting hyper-parameters, scored by
# accuracy with 3-fold cross-validation.
# random_state pins the estimator's randomness (the grid uses subsample < 1
# and max_features) so the search is reproducible.
GradientBoostingClassifier_classifier = GradientBoostingClassifier(random_state=42)

# 8 * 12 * 12 * 3 * 2 * 7 * 3 = 145152 candidates, i.e. 435456 fits with cv=3.
parameters_gradient_boosting_classifier = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25],
    # Float values: interpreted by scikit-learn as fractions of the sample count.
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth": [3, 5, 8],
    "max_features": ["log2", "sqrt"],
    "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators": [1, 5, 10],
}

GradientBoostingClassifier_model = GridSearchCV(GradientBoostingClassifier_classifier,
                                                parameters_gradient_boosting_classifier,
                                                scoring="accuracy",
                                                cv=3,
                                                verbose=2,
                                                n_jobs=-1)  # use all available cores

GradientBoostingClassifier_model.fit(X_train, y_train)

In [None]:
# Keep the best hyper-parameter combination found by the grid search and
# show it as the cell output.
GradientBoostingClassifier_great_params = GradientBoostingClassifier_model.best_params_
GradientBoostingClassifier_great_params

In [None]:
# Predict the test set with the fitted grid search and print the
# per-class classification report.
pred_GradientBoostingClassifier_model = GradientBoostingClassifier_model.predict(X_test)
print(
    classification_report(
        y_true=y_test.values,
        y_pred=pred_GradientBoostingClassifier_model,
    )
)

In [None]:
# Row-normalised confusion matrix for the GradientBoosting predictions.
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the "Actual" / "Predicted" axis
# labels below claim. (The arguments were previously swapped, transposing the plot.)
cm_GradientBoostingClassifier = metrics.confusion_matrix(y_test, pred_GradientBoostingClassifier_model)
# Divide each row by its total so a row shows how one actual class is
# distributed over the predicted classes.
cm_normalizes_GradientBoostingClassifier = np.round(
    cm_GradientBoostingClassifier / cm_GradientBoostingClassifier.sum(axis=1, keepdims=True), 2)

# Tick labels for the seven classes, applied to both axes.
concentration_tick_labels = ["0%", "5%", "12.5%", "25%", "50%", "75%", "96%"]

cm_plot_GradientBoosting = sns.heatmap(cm_normalizes_GradientBoostingClassifier, cmap="OrRd_r", annot=True)
cm_plot_GradientBoosting.yaxis.set_ticklabels(concentration_tick_labels)
cm_plot_GradientBoosting.xaxis.set_ticklabels(concentration_tick_labels)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("GradientBoostingClassifier Confusion Matrix")
plt.savefig('../../data/graphs/GradientBoostingClassifier_cm')
plt.show()

* * KNeighborsClassifier

In [None]:
# Exhaustive grid search over k-nearest-neighbours hyper-parameters, scored by
# accuracy with 3-fold cross-validation.
KNeighbors_classifier = KNeighborsClassifier()

weight_options_kneighbors = ['uniform', 'distance']
k_range = list(range(1, 100))
# Only distinct distance functions are listed: in scikit-learn 'l2' is an
# alias of 'euclidean', and 'cityblock' / 'l1' are aliases of 'manhattan', so
# including them only re-fitted identical candidates (3 of the 7 former
# entries were duplicates).
metric_options_kneighbors = ['euclidean', 'manhattan', 'cosine', 'nan_euclidean']

# 99 * 2 * 4 = 792 candidates, i.e. 2376 fits with cv=3.
parameters_kneighbors_classifier = dict(n_neighbors=k_range,
                                        weights=weight_options_kneighbors,
                                        metric=metric_options_kneighbors)

KNeighbors_model = GridSearchCV(estimator=KNeighbors_classifier,
                                param_grid=parameters_kneighbors_classifier,
                                scoring="accuracy",
                                cv=3,
                                verbose=2,
                                n_jobs=-1)  # use all available cores

KNeighbors_model.fit(X_train, y_train)

In [None]:
# Keep the best hyper-parameter combination found by the grid search and
# show it as the cell output.
KNeighbors_model_great_params = KNeighbors_model.best_params_
KNeighbors_model_great_params

In [None]:
# Predict the test set with the fitted grid search and print the
# per-class classification report.
pred_KNeighbors_model = KNeighbors_model.predict(X_test)
print(
    classification_report(
        y_true=y_test.values,
        y_pred=pred_KNeighbors_model,
    )
)

In [None]:
# Row-normalised confusion matrix for the KNeighbors predictions.
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the "Actual" / "Predicted" axis
# labels below claim. (The arguments were previously swapped, transposing the plot.)
cm_KNeighbors = metrics.confusion_matrix(y_test, pred_KNeighbors_model)
# Divide each row by its total so a row shows how one actual class is
# distributed over the predicted classes.
cm_normalizes_KNeighbors = np.round(
    cm_KNeighbors / cm_KNeighbors.sum(axis=1, keepdims=True), 2)

# Tick labels for the seven classes, applied to both axes.
concentration_tick_labels = ["0%", "5%", "12.5%", "25%", "50%", "75%", "96%"]

cm_plot_KNeighbors = sns.heatmap(cm_normalizes_KNeighbors, cmap="OrRd_r", annot=True)
cm_plot_KNeighbors.yaxis.set_ticklabels(concentration_tick_labels)
cm_plot_KNeighbors.xaxis.set_ticklabels(concentration_tick_labels)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("KNeighborsClassifier Confusion Matrix")
plt.savefig('../../data/graphs/KNeighborsClassifier_cm')
plt.show()

* ### CatBoost

* * CatBoostClassifier

In [None]:
# Exhaustive grid search over CatBoost hyper-parameters, scored by accuracy
# with 3-fold cross-validation.
CatBoostClassifier_classifier = CatBoostClassifier()

# 6 * 5 * 10 * 7 * 7 = 14700 candidates, i.e. 44100 fits with cv=3.
catboost_params = {
    "max_depth": [1, 2, 3, 5, 7, 9],
    "learning_rate": [0.001, 0.01, 0.05, 0.1, 0.3],
    "iterations": [10, 50, 100, 200, 300, 400, 500, 700, 1000, 1500],
    "border_count": [5, 10, 20, 50, 100, 200, 250],
    "l2_leaf_reg": [1, 3, 5, 10, 50, 100, 150],
}

# error_score=0 scores a candidate whose fit raises as 0 accuracy instead of
# aborting the search.
# NOTE(review): this also hides genuine configuration errors - confirm it is intended.
# NOTE(review): n_jobs=-1 runs fits in parallel while CatBoost may itself be
# multi-threaded - verify this does not oversubscribe the CPU.
CatBoostClassifier_model = GridSearchCV(
    estimator=CatBoostClassifier_classifier,
    param_grid=catboost_params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    error_score=0,
)

CatBoostClassifier_model.fit(X_train, y_train)

In [None]:
# Predict the test set with the fitted grid search and print the
# per-class classification report.
pred_CatBoostClassifier_model = CatBoostClassifier_model.predict(X_test)
print(
    classification_report(
        y_true=y_test.values,
        y_pred=pred_CatBoostClassifier_model,
    )
)

In [None]:
# Row-normalised confusion matrix for the CatBoost predictions.
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the "Actual" / "Predicted" axis
# labels below claim. (The arguments were previously swapped, transposing the plot.)
cm_CatBoostClassifier = metrics.confusion_matrix(y_test, pred_CatBoostClassifier_model)
# Divide each row by its total so a row shows how one actual class is
# distributed over the predicted classes.
cm_normalizes_CatBoostClassifier = np.round(
    cm_CatBoostClassifier / cm_CatBoostClassifier.sum(axis=1, keepdims=True), 2)

# Tick labels for the seven classes, applied to both axes.
concentration_tick_labels = ["0%", "5%", "12.5%", "25%", "50%", "75%", "96%"]

cm_plot_CatBoostClassifier = sns.heatmap(cm_normalizes_CatBoostClassifier, cmap="OrRd_r", annot=True)
cm_plot_CatBoostClassifier.yaxis.set_ticklabels(concentration_tick_labels)
cm_plot_CatBoostClassifier.xaxis.set_ticklabels(concentration_tick_labels)

plt.xlabel("Predicted")
plt.ylabel("Actual")
# NOTE(review): "ClassifierClassifier" in the title and file name looks like a
# typo; both are left unchanged so the saved graph path stays the same.
plt.title("CatBoostClassifierClassifier Confusion Matrix")
plt.savefig('../../data/graphs/CatBoostClassifierClassifier_cm')
plt.show()