In [None]:
import os

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from skimage.io import imread
from skimage.transform import resize

from sklearn.utils import shuffle

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

# Creating an array with information about each pixel of each image

In [None]:
# Root folders of the image data base; each one is expected to contain one
# sub-folder per entry of `categories`.
input_path_train, input_path_test = (
    '../data/data_base/train/',
    '../data/data_base/test/',
)

# Class folder names. The position of a name in this list is the integer class
# label ("Concentration Index") used throughout the notebook.
categories = [
    '0_water',
    '5_alcohol',
    '12.5_alcohol',
    '25_alcohol',
    '50_alcohol',
    '75_alcohol',
    '96_alcohol',
]

### Train dataset

In [None]:
# Build the training feature matrix: every image found under
# input_path_train/<category>/ is resized to 64x64, flattened into one row of
# `data_train`, and labelled with the index of its category in `categories`.
data_train = []
labels_train = []

# Wrap the list itself (not the enumerate object) so tqdm knows the total
# number of categories and can render a real progress bar.
for category_idx, category in enumerate(tqdm(categories, desc='train categories')):
    print(category)
    category_dir = os.path.join(input_path_train, category)
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting table reproducible between runs and machines.
    for file in sorted(os.listdir(category_dir)):
        img_path = os.path.join(category_dir, file)
        # Skip sub-directories (e.g. `.ipynb_checkpoints`), which imread cannot open.
        if not os.path.isfile(img_path):
            continue
        img = imread(img_path)
        img = resize(img, (64, 64))
        data_train.append(img.flatten())
        labels_train.append(category_idx)

data_train = np.asarray(data_train)
labels_train = np.asarray(labels_train)

### Test dataset

In [None]:
# Build the test feature matrix: every image found under
# input_path_test/<category>/ is resized to 64x64, flattened into one row of
# `data_test`, and labelled with the index of its category in `categories`.
data_test = []
labels_test = []

# Wrap the list itself (not the enumerate object) so tqdm knows the total
# number of categories and can render a real progress bar.
for category_idx, category in enumerate(tqdm(categories, desc='test categories')):
    print(category)
    category_dir = os.path.join(input_path_test, category)
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting table reproducible between runs and machines.
    for file in sorted(os.listdir(category_dir)):
        img_path = os.path.join(category_dir, file)
        # Skip sub-directories (e.g. `.ipynb_checkpoints`), which imread cannot open.
        if not os.path.isfile(img_path):
            continue
        img = imread(img_path)
        img = resize(img, (64, 64))
        data_test.append(img.flatten())
        labels_test.append(category_idx)

data_test = np.asarray(data_test)
labels_test = np.asarray(labels_test)

# Creating a dataframe by concatenating two arrays

### Train dataset

In [None]:
# Pixel features: one row per training image, one column per flattened value.
df1 = pd.DataFrame(data_train)

# Class labels as a single-column frame named 'Concentration Index'.
df2 = pd.DataFrame(labels_train)
df2 = df2.rename(columns={df2.columns[0]: 'Concentration Index'})

# Put the label column to the right of the pixel columns and persist the table.
# The default index is written too, so the CSV gains a leading unnamed column.
df = pd.concat((df1, df2), axis='columns')
df.to_csv('../../data/tables/pixels_table_train.csv')

### Test dataset

In [None]:
# Pixel features: one row per test image, one column per flattened value.
df1 = pd.DataFrame(data_test)

# Class labels as a single-column frame named 'Concentration Index'.
df2 = pd.DataFrame(labels_test)
df2 = df2.rename(columns={df2.columns[0]: 'Concentration Index'})

# Put the label column to the right of the pixel columns and persist the table.
# The default index is written too, so the CSV gains a leading unnamed column.
df = pd.concat((df1, df2), axis='columns')
df.to_csv('../../data/tables/pixels_table_test.csv')

# Getting Started with Machine Learning Models

In [None]:
# Load the pixel tables from the location the table-building cells write them
# to, so the notebook runs top to bottom on a fresh kernel. Because those
# tables are saved with their index, read_csv exposes it as 'Unnamed: 0'.
df_train = pd.read_csv('../../data/tables/pixels_table_train.csv')
# Fixed random_state keeps the row order (and therefore the CV folds) reproducible.
df_train = shuffle(df_train, random_state=42)

df_test = pd.read_csv('../../data/tables/pixels_table_test.csv')
df_test = shuffle(df_test, random_state=42)

In [None]:
# Split each table into pixel features (X) and the integer class label (y).
# 'Unnamed: 0' is the saved row index picked up by read_csv, not a feature.
# The label is selected by name, matching the column dropped from X, instead of
# relying on it happening to be the last column.
X_train = df_train.drop(columns=['Concentration Index', 'Unnamed: 0'])
y_train = df_train['Concentration Index'].astype('int')

X_test = df_test.drop(columns=['Concentration Index', 'Unnamed: 0'])
y_test = df_test['Concentration Index'].astype('int')

* ### Scikit-Learn

* * RandomForestClassifier

In [None]:
# Exhaustive grid search over random-forest hyper-parameters with 2-fold CV.
# The grid below has 2*3*1*2*3*3*3 = 324 candidates, i.e. 648 fits plus the
# final refit on the whole training set.
# random_state is fixed so that repeated runs select the same best parameters.
RandomForestClassifier_cls = RandomForestClassifier(random_state=42)

parameters_random_forest_classifier = {'bootstrap': [True, False],
                                        'max_depth': [None, 5, 10],
                                        'max_features': ['sqrt'],
                                        'criterion': ['gini', 'entropy'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 500]}

RandomForestClassifier_model = GridSearchCV(estimator=RandomForestClassifier_cls,
                                            param_grid=parameters_random_forest_classifier,
                                            cv=2, n_jobs=-1, verbose=2)
RandomForestClassifier_model.fit(X_train, y_train)

# Best hyper-parameter combination found by the search.
RandomForestClassifier_great_params = RandomForestClassifier_model.best_params_

In [None]:
# Predict the test set with the fitted grid search and print per-class
# precision / recall / F1.
pred_RandomForestClassifier_model = RandomForestClassifier_model.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=pred_RandomForestClassifier_model))

In [None]:
# Tick labels in the same order as the integer class indices 0..6.
concentration_ticks = ["0%", "5%", "12.5%", "25%", "50%", "75%", "96%"]
class_ids = list(range(len(concentration_ticks)))

# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the axis labels below claim.
# Passing `labels` keeps the matrix 7x7 even if a class is missing from the
# test set or is never predicted, so the tick labels always line up.
cm_RandomForestClassifier = metrics.confusion_matrix(
    y_test, pred_RandomForestClassifier_model, labels=class_ids)
# normalize='true' divides every row by the number of actual samples of that
# class (per-class recall on the diagonal) and is handled inside scikit-learn,
# avoiding a manual division by a possibly-zero row sum.
cm_normalizes_RandomForestClassifier = np.round(
    metrics.confusion_matrix(y_test, pred_RandomForestClassifier_model,
                             labels=class_ids, normalize='true'), 2)

cm_plot_RandomForestClassifier = sns.heatmap(cm_normalizes_RandomForestClassifier, cmap="OrRd_r", annot=True)
cm_plot_RandomForestClassifier.yaxis.set_ticklabels(concentration_ticks)
cm_plot_RandomForestClassifier.xaxis.set_ticklabels(concentration_ticks)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("RandomForestClassifier Confusion Matrix")
plt.savefig('../../data/graphs/RandomForestClassifier_cm_pixels.png')
plt.show()

* * DecisionTreeClassifier

In [None]:
# Exhaustive grid search over decision-tree hyper-parameters with 2-fold CV
# (3*3*3 = 27 candidates, 54 fits plus the final refit).
# random_state is fixed because the tree permutes features at each split, so
# unseeded runs may pick different, equally good splits.
DecisionTreeClassifier_cls = DecisionTreeClassifier(random_state=42)

parameters_decision_tree_classifier = {'max_depth': [None, 5, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'min_samples_leaf': [1, 2, 4]}

DecisionTreeClassifier_model = GridSearchCV(estimator=DecisionTreeClassifier_cls,
                                            param_grid=parameters_decision_tree_classifier,
                                            cv=2, n_jobs=-1, verbose=2)
DecisionTreeClassifier_model.fit(X_train, y_train)

# Best hyper-parameter combination found by the search.
DecisionTreeClassifier_great_params = DecisionTreeClassifier_model.best_params_

In [None]:
# Predict the test set with the fitted grid search and print per-class
# precision / recall / F1.
pred_DecisionTreeClassifier_model = DecisionTreeClassifier_model.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=pred_DecisionTreeClassifier_model))

In [None]:
# Tick labels in the same order as the integer class indices 0..6.
concentration_ticks = ["0%", "5%", "12.5%", "25%", "50%", "75%", "96%"]
class_ids = list(range(len(concentration_ticks)))

# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the axis labels below claim.
# Passing `labels` keeps the matrix 7x7 even if a class is missing from the
# test set or is never predicted, so the tick labels always line up.
cm_DecisionTreeClassifier = metrics.confusion_matrix(
    y_test, pred_DecisionTreeClassifier_model, labels=class_ids)
# normalize='true' divides every row by the number of actual samples of that
# class (per-class recall on the diagonal) and is handled inside scikit-learn,
# avoiding a manual division by a possibly-zero row sum.
cm_normalizes_DecisionTreeClassifier = np.round(
    metrics.confusion_matrix(y_test, pred_DecisionTreeClassifier_model,
                             labels=class_ids, normalize='true'), 2)

cm_plot_DecisionTreeClassifier = sns.heatmap(cm_normalizes_DecisionTreeClassifier, cmap="OrRd_r", annot=True)
cm_plot_DecisionTreeClassifier.yaxis.set_ticklabels(concentration_ticks)
cm_plot_DecisionTreeClassifier.xaxis.set_ticklabels(concentration_ticks)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("DecisionTreeClassifier Confusion Matrix")
plt.savefig('../../data/graphs/DecisionTreeClassifier_cm_pixels.png')
plt.show()

* * KNeighborsClassifier

In [None]:
# Hyper-parameter grid for k-nearest-neighbours: neighbourhood size, vote
# weighting, and Minkowski power (p=1 Manhattan, p=2 Euclidean) —
# 3*2*2 = 12 candidates.
parameters_kneighbors_classifier = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

KNeighborsClassifier_cls = KNeighborsClassifier()

# 2-fold cross-validated exhaustive search using all available cores;
# fit() returns the fitted search object itself.
KNeighborsClassifier_model = GridSearchCV(
    KNeighborsClassifier_cls,
    parameters_kneighbors_classifier,
    cv=2,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Best hyper-parameter combination found by the search.
KNeighborsClassifier_great_params = KNeighborsClassifier_model.best_params_

In [None]:
# Predict the test set with the fitted grid search and print per-class
# precision / recall / F1.
pred_KNeighborsClassifier_model = KNeighborsClassifier_model.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=pred_KNeighborsClassifier_model))

In [None]:
# Tick labels in the same order as the integer class indices 0..6.
concentration_ticks = ["0%", "5%", "12.5%", "25%", "50%", "75%", "96%"]
class_ids = list(range(len(concentration_ticks)))

# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the axis labels below claim.
# Passing `labels` keeps the matrix 7x7 even if a class is missing from the
# test set or is never predicted, so the tick labels always line up.
cm_KNeighborsClassifier = metrics.confusion_matrix(
    y_test, pred_KNeighborsClassifier_model, labels=class_ids)
# normalize='true' divides every row by the number of actual samples of that
# class (per-class recall on the diagonal) and is handled inside scikit-learn,
# avoiding a manual division by a possibly-zero row sum.
cm_normalizes_KNeighborsClassifier = np.round(
    metrics.confusion_matrix(y_test, pred_KNeighborsClassifier_model,
                             labels=class_ids, normalize='true'), 2)

cm_plot_KNeighborsClassifier = sns.heatmap(cm_normalizes_KNeighborsClassifier, cmap="OrRd_r", annot=True)
cm_plot_KNeighborsClassifier.yaxis.set_ticklabels(concentration_ticks)
cm_plot_KNeighborsClassifier.xaxis.set_ticklabels(concentration_ticks)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("KNeighborsClassifier Confusion Matrix")
plt.savefig('../../data/graphs/KNeighborsClassifier_cm_pixels.png')
plt.show()

* ### CatBoost

In [None]:
# Exhaustive grid search over CatBoost hyper-parameters with 2-fold CV
# (3**5 = 243 candidates, 486 fits plus the final refit), trained on the GPU.
CatBoostClassifier_cls = CatBoostClassifier(task_type='GPU')

parameters_catboost_classifier = {'iterations': [300, 500, 750],
                                    'learning_rate': [0.01, 0.1, 0.5],
                                    'depth': [3, 5, 7],
                                    'l2_leaf_reg': [1, 3, 5],
                                    'border_count': [32, 64, 128]}

# n_jobs=1: every candidate trains on the GPU, so launching one worker process
# per CPU core (n_jobs=-1) would make them all compete for the same device and
# its memory. The fits are run one after another instead.
CatBoostClassifier_model = GridSearchCV(estimator=CatBoostClassifier_cls,
                                        param_grid=parameters_catboost_classifier,
                                        cv=2, n_jobs=1, verbose=2)
CatBoostClassifier_model.fit(X_train, y_train)

# Best hyper-parameter combination found by the search.
CatBoostClassifier_great_params = CatBoostClassifier_model.best_params_

In [None]:
# Predict the test set with the fitted grid search and print per-class
# precision / recall / F1.
# NOTE(review): CatBoost is believed to return multiclass predictions as an
# (n_samples, 1) column — confirm against the CatBoost docs. np.ravel flattens
# it to the 1-D label vector the scikit-learn metrics expect (and that the
# other models in this notebook produce); it is a no-op for 1-D input.
pred_CatBoostClassifier_model = np.ravel(CatBoostClassifier_model.predict(X_test))
print(metrics.classification_report(y_test, pred_CatBoostClassifier_model))

In [None]:
# Tick labels in the same order as the integer class indices 0..6.
concentration_ticks = ["0%", "5%", "12.5%", "25%", "50%", "75%", "96%"]
class_ids = list(range(len(concentration_ticks)))

# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the axis labels below claim.
# Passing `labels` keeps the matrix 7x7 even if a class is missing from the
# test set or is never predicted, so the tick labels always line up.
cm_CatBoostClassifier = metrics.confusion_matrix(
    y_test, pred_CatBoostClassifier_model, labels=class_ids)
# normalize='true' divides every row by the number of actual samples of that
# class (per-class recall on the diagonal) and is handled inside scikit-learn,
# avoiding a manual division by a possibly-zero row sum.
cm_normalizes_CatBoostClassifier = np.round(
    metrics.confusion_matrix(y_test, pred_CatBoostClassifier_model,
                             labels=class_ids, normalize='true'), 2)

cm_plot_CatBoostClassifier = sns.heatmap(cm_normalizes_CatBoostClassifier, cmap="OrRd_r", annot=True)
cm_plot_CatBoostClassifier.yaxis.set_ticklabels(concentration_ticks)
cm_plot_CatBoostClassifier.xaxis.set_ticklabels(concentration_ticks)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("CatBoostClassifier Confusion Matrix")
plt.savefig('../../data/graphs/CatBoostClassifier_cm_pixels.png')
plt.show()