In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from mahalanobis import MahalanobisClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [2]:
# Locations of the training and validation splits, relative to the notebook.
train_csv_path = "AI201-Aerosol-Classification/train_set.csv"
valid_csv_path = "AI201-Aerosol-Classification/valid_set.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(train_csv_path)
valid = pd.read_csv(valid_csv_path)

In [3]:
# Target column: the aerosol type of each row.
train_labels = train["aerosol_type"]
valid_labels = valid["aerosol_type"]

# Feature matrices: every remaining column except the target and "date".
train_samps = train.drop(columns=["aerosol_type", "date"])
valid_samps = valid.drop(columns=["aerosol_type", "date"])

## I. Mahalanobis Distance Classifier

References:

- https://github.com/mavroudisv/Mahalanobis-Classifier/blob/master/main.py
- https://www.machinelearningplus.com/statistics/mahalanobis-distance/
- https://www.cs.princeton.edu/courses/archive/fall08/cos436/Duda/PR_Mahal/M_metric.htm
- https://scikit-learn.org/stable/auto_examples/covariance/plot_mahalanobis_distances.html
- https://stackoverflow.com/questions/34643548/how-to-use-mahalanobis-distance-in-sklearn-distancemetrics/34650347#34650347

In [4]:
# Build the Mahalanobis distance classifier from the training samples and
# their labels (the class comes from the local `mahalanobis` module).
clf = MahalanobisClassifier(
    train_samps,
    train_labels,
)

# Classify the validation samples; the result is scored against
# `valid_labels` in the next cell.
valid_pred_mdc = clf.predict_class(
    valid_samps,
)

In [5]:
# Fraction of validation rows whose predicted class matches the true label.
acc_mdc = accuracy_score(valid_labels, valid_pred_mdc)
print("(1) Mahalanobis Distance Classifier (MDC) Accuracy -", acc_mdc)

(1) Mahalanobis Distance Classifier (MDC) Accuracy - 0.8116120113662545


## II. k-Nearest Neighbors

References:

https://python-course.eu/machine-learning/k-nearest-neighbor-classifier-with-sklearn.php

In [6]:
# Baseline KNN: constructed with no arguments, i.e. the library defaults.
knn = KNeighborsClassifier()
knn.fit(train_samps, train_labels)

# Predicted label for every validation sample.
valid_pred_knn = knn.predict(valid_samps)

In [7]:
# Validation accuracy of the untuned KNN baseline.
acc_knn = accuracy_score(valid_labels, valid_pred_knn)
print("(2A) k-Nearest Neighbors (KNN) Accuracy -", acc_knn)

(2A) k-Nearest Neighbors (KNN) Accuracy - 0.9047692189539974


#### Miscellaneous - Hyperparameter Tuning

In [8]:
# Candidate neighbourhood sizes: k = 1, 2, ..., 24.
param_grid_knn = {"n_neighbors": np.arange(1, 25)}

# 5-fold cross-validated grid search, fitted on the training split only.
# `fit` returns the search object itself, so the call can be chained.
knn_gscv = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5).fit(
    train_samps, train_labels
)

# Display the winning hyperparameters as the cell output.
knn_gscv.best_params_

{'n_neighbors': 1}

In [9]:
# The grid search above was created without a `refit` argument, so it uses
# scikit-learn's default (refit=True): once the search finishes, the best
# configuration is refitted on the whole training set and exposed as
# `best_estimator_`. Reuse that model rather than constructing and fitting an
# identical KNeighborsClassifier a second time.
knn_tuned = knn_gscv.best_estimator_
valid_pred_knn_tuned = knn_tuned.predict(valid_samps)

# Validation accuracy of the tuned KNN model.
print("(2B) k-Nearest Neighbors (KNN) Tuned Accuracy -", accuracy_score(valid_labels, valid_pred_knn_tuned))

(2B) k-Nearest Neighbors (KNN) Tuned Accuracy - 0.981414637892635


## III. Naive Bayes Classifier 

References:

https://www.geeksforgeeks.org/multiclass-classification-using-scikit-learn/#:~:text=%23%20training%20a%20Naive%20Bayes%20classifier

In [10]:
# Baseline Gaussian Naive Bayes: constructed with no arguments, i.e. the
# library defaults.
gnb = GaussianNB()
gnb.fit(train_samps, train_labels)

# Predicted label for every validation sample.
valid_pred_gnb = gnb.predict(valid_samps)

In [11]:
# Validation accuracy of the untuned Naive Bayes baseline.
acc_gnb = accuracy_score(valid_labels, valid_pred_gnb)
print("(3A) Naive Bayes Accuracy -", acc_gnb)

(3A) Naive Bayes Accuracy - 0.7277474848321942


#### Miscellaneous - Hyperparameter Tuning

In [12]:
# Candidate smoothing values: 100 points log-spaced from 1e0 down to 1e-9.
param_grid_gnb = {"var_smoothing": np.logspace(0, -9, num=100)}

# 5-fold cross-validated grid search, fitted on the training split only.
# `fit` returns the search object itself, so the call can be chained.
gnb_gscv = GridSearchCV(GaussianNB(), param_grid_gnb, cv=5).fit(
    train_samps, train_labels
)

# Display the winning hyperparameters as the cell output.
gnb_gscv.best_params_

{'var_smoothing': 6.579332246575683e-05}

In [13]:
# The grid search above was created without a `refit` argument, so it uses
# scikit-learn's default (refit=True): once the search finishes, the best
# configuration is refitted on the whole training set and exposed as
# `best_estimator_`. Reuse that model rather than constructing and fitting an
# identical GaussianNB a second time.
gnb_tuned = gnb_gscv.best_estimator_
valid_pred_gnb_tuned = gnb_tuned.predict(valid_samps)

# Validation accuracy of the tuned Naive Bayes model.
print("(3B) Naive Bayes Tuned Accuracy -", accuracy_score(valid_labels, valid_pred_gnb_tuned))

(3B) Naive Bayes Tuned Accuracy - 0.7277474848321942
