In [1]:
import pandas as pd
import numpy as np
import pickle
import time

from sklearn.model_selection import GridSearchCV
from mahalanobis import MahalanobisClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the training set from the CSV file next to this notebook.
train = pd.read_csv("train_set.csv")

# Target vector: the "aerosol_type" column.
train_labels = train["aerosol_type"]
# Feature matrix: every column except the target and the "date" column.
train_samps = train.drop(columns=["aerosol_type", "date"])

## Model 1 - Mahalanobis Distance Classifier (MDC)

In [3]:
# Build the Mahalanobis Distance Classifier from the training samples and labels.
# NOTE(review): there is no explicit .fit() call here; presumably the constructor
# performs the fitting — confirm against the `mahalanobis` module.
mdc = MahalanobisClassifier(train_samps, train_labels)

# Persist the model to a pickle file. The `with` block guarantees the file handle
# is closed even if pickle.dump raises. open() does not create directories, so
# 'models/' must already exist.
with open('models/mdc_pickle', 'wb') as mdcPickle:
    pickle.dump(mdc, mdcPickle)

## Model 2A - k-Nearest Neighbors (KNN)

In [4]:
# Fit a k-Nearest Neighbors classifier with the library's default hyperparameters.
knn = KNeighborsClassifier().fit(train_samps, train_labels)

# Persist the fitted model to a pickle file. The `with` block guarantees the file
# handle is closed even if pickle.dump raises. open() does not create directories,
# so 'models/' must already exist.
with open('models/knn_pickle', 'wb') as knnPickle:
    pickle.dump(knn, knnPickle)

## Model 2B - k-Nearest Neighbors with Hyperparameter Tuning (KNN-tuned)

In [5]:
# Hyperparameter search for KNN: try every n_neighbors value from 1 through 24.
param_grid_knn = {"n_neighbors": np.arange(1, 25)}

# Wrap a fresh, untuned estimator in a 5-fold cross-validated grid search and
# run it on the training data.
knn_gscv = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5)
knn_gscv.fit(train_samps, train_labels)

# Show the parameter combination the search selected.
knn_gscv.best_params_

{'n_neighbors': 1}

In [6]:
# Fit a k-Nearest Neighbors classifier using the hyperparameters selected by the
# grid search (knn_gscv.best_params_).
knn_tuned = KNeighborsClassifier(**knn_gscv.best_params_).fit(train_samps, train_labels)

# Persist the fitted model to a pickle file. The `with` block guarantees the file
# handle is closed even if pickle.dump raises. open() does not create directories,
# so 'models/' must already exist.
with open('models/knn_tuned_pickle', 'wb') as knntunedPickle:
    pickle.dump(knn_tuned, knntunedPickle)

## Model 3A - Naive Bayes Classifier (NB)

In [7]:
# Fit a Gaussian Naive Bayes classifier with the library's default hyperparameters.
gnb = GaussianNB().fit(train_samps, train_labels)

# Persist the fitted model to a pickle file. The `with` block guarantees the file
# handle is closed even if pickle.dump raises. open() does not create directories,
# so 'models/' must already exist.
with open('models/gnb_pickle', 'wb') as gnbPickle:
    pickle.dump(gnb, gnbPickle)

## Model 3B - Naive Bayes Classifier with Hyperparameter Tuning (NB-tuned)

In [None]:
# Hyperparameter search for Gaussian Naive Bayes: 100 var_smoothing candidates,
# log-spaced from 10**0 down to 10**-9.
param_grid_gnb = {"var_smoothing": np.logspace(0, -9, num=100)}

# Wrap a fresh, untuned estimator in a 5-fold cross-validated grid search and
# run it on the training data.
gnb_gscv = GridSearchCV(GaussianNB(), param_grid_gnb, cv=5)
gnb_gscv.fit(train_samps, train_labels)

# Show the parameter combination the search selected.
gnb_gscv.best_params_

In [None]:
# Fit a Gaussian Naive Bayes classifier using the hyperparameters selected by the
# grid search (gnb_gscv.best_params_).
gnb_tuned = GaussianNB(**gnb_gscv.best_params_).fit(train_samps, train_labels)

# Persist the fitted model to a pickle file. The `with` block guarantees the file
# handle is closed even if pickle.dump raises. open() does not create directories,
# so 'models/' must already exist.
with open('models/gnb_tuned_pickle', 'wb') as gnbtunedPickle:
    pickle.dump(gnb_tuned, gnbtunedPickle)

### References

#### Mahalanobis Distance Classifier
- https://github.com/mavroudisv/Mahalanobis-Classifier/blob/master/main.py
- https://www.machinelearningplus.com/statistics/mahalanobis-distance/
- https://www.cs.princeton.edu/courses/archive/fall08/cos436/Duda/PR_Mahal/M_metric.htm
- https://scikit-learn.org/stable/auto_examples/covariance/plot_mahalanobis_distances.html
- https://stackoverflow.com/questions/34643548/how-to-use-mahalanobis-distance-in-sklearn-distancemetrics/34650347#34650347

#### k-Nearest Neighbors Classifier
- https://python-course.eu/machine-learning/k-nearest-neighbor-classifier-with-sklearn.php

#### Naive Bayes Classifier
- https://www.geeksforgeeks.org/multiclass-classification-using-scikit-learn/#:~:text=%23%20training%20a%20Naive%20Bayes%20classifier