In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [8]:
data = pd.read_csv("final_data.csv")

In [9]:
#removed unnecessary columns
final_data = data.drop(data.columns[2:5], axis=1)


In [10]:
#split the data into X, the inputs (table of VOC values), and Y, the output (Risk- High or Low)

X = final_data.iloc[:, 2:].copy()
Y = final_data.iloc[:, 1].copy()

In [11]:
#Replacing "High" with 1 and "Low" with 0 in order to later use classification algorithms

for x in range(0, len(Y)):
    if Y[x] == "High":
        Y[x] = 1
    else:
        Y[x] = 0

In [60]:
# X_train: input datframe for training the classifcation models
# X_test: input dataframe for testing the classification models
# Y_train: output datframe for training the classification models
# Y_test: output datframe for testing the classification models

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 19)

In [61]:
#converting output from "object" to int

Y_train = Y_train.astype("int")
Y_test = Y_test.astype("int")

In [62]:
#dictionary containing classifiers that use different algorithms. 
#We will iterate through this dictionary to find the most optimal classifier.
#keys are names, values are instances of the classifiers

dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=1000),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "Neural Net": MLPClassifier(alpha = 1),
    "Naive Bayes": GaussianNB()}

In [63]:
#defining a function that fits each classifier in dict_classifiers to X_train and Y_train
#the time spent on fitting the classifier to the data is also recorded
# test_score calculated for each classifier. This is the accuracy score found by using the model with X_test and Y_test
#for each classifier, in the dictionary dict_models, an entry is added. 
#This entry is a dictionary containing the classifier, test_score, and time_spent
#after iterating through dict_classifiers, dict_models is returned

def batch_classify(X_train, Y_train, X_test, Y_test,verbose = True):
  
    
    dict_models = {}
    
    for classifier_name, classifier in list(dict_classifiers.items()):
        t_start = time.perf_counter()
        classifier.fit(X_train, Y_train)
        t_end = time.perf_counter()

        time_spent = t_end - t_start
        test_score = classifier.score(X_test, Y_test)

        dict_models[classifier_name] = {'model': classifier, 'test_score': test_score, 'train_time': time_spent}

    return dict_models

In [64]:
#function to display the classifiers' results in a table, sorted by test_score
#lists made of the classifiers, their corresponding test_scores, and their corresponding training_time


def display_dict_models(dict_models, sort_by='test_score'):
    classifier_list = [x for x in dict_models.keys()]
    test_score = [dict_models[key]['test_score'] for key in classifier_list]
    training_time = [dict_models[key]['train_time'] for key in classifier_list]
    
    #initalized dataframe with correct dimensions and column labels
    table = pd.DataFrame(data=np.zeros(shape=(len(classifier_list),3)), columns = ['classifier', 'test_score', 'train_time'])
    
    #for loop to set the values in each row
    for x in range(0,len(classifier_list)):
        table.loc[x, 'classifier'] = classifier_list[x]
        table.loc[x, 'test_score'] = test_score[x]
        table.loc[x, 'train_time'] = training_time[x]
    
    #function displays the table sorted by test_score, in descending order
    display(table.sort_values(by=sort_by, ascending=False))

In [65]:
#def multiple:
 #   for x in range(10):
   # X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
  #  Y_train = Y_train.astype("int")
    #Y_test = Y_test.astype("int")
    
    #batch_classify(X_train, Y_train, X_test, Y_test,verbose = True)
        

In [66]:
dict_models = batch_classify(X_train, Y_train, X_test, Y_test)
display_dict_models(dict_models)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,classifier,test_score,train_time
6,Neural Net,0.684211,0.114065
4,Decision Tree,0.631579,0.002139
7,Naive Bayes,0.631579,0.003241
5,Random Forest,0.578947,1.011021
0,Logistic Regression,0.526316,0.031241
1,Nearest Neighbors,0.526316,0.004719
2,Linear SVM,0.473684,0.004653
3,Gradient Boosting Classifier,0.421053,0.804293
