In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import pickle
import os
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Load the dataset and keep only the first row seen for each Synset.
data = (
    pd.read_csv("hollink_reproduction.csv")
    .drop_duplicates(subset="Synset", keep="first", ignore_index=True)
)

# Binary target: 1 where Labels == "b", 0 for every other row.
data["class"] = (data["Labels"] == "b").astype("int64")

# Drop rows labelled "none", then any row that still has a missing value.
data = data.loc[data["Labels"] != "none"].dropna()

# Features are the columns from position 2 up to (not including) the last one;
# the target is the last column.
# NOTE(review): assumes the CSV has no pre-existing "class" column, so the
# target added above is the final column, and that the first two columns are
# non-feature columns (presumably Synset and Labels) — confirm against the file.
X = data.iloc[:, 2:-1]
y = data.iloc[:, -1]

# Fixed 80/20 split; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
    shuffle=True,
)

In [5]:
def Classification_Report(confusion_matrix):
    """Compute accuracy, precision, recall and F1 from a 2x2 confusion matrix.

    The matrix is read as ``confusion_matrix[true][predicted]`` with class 1
    treated as the positive class: ``[1][1]`` are true positives, ``[0][1]``
    false positives and ``[1][0]`` false negatives.

    Parameters
    ----------
    confusion_matrix : array-like
        Counts per (true, predicted) pair. Anything ``np.asarray`` accepts
        works (ndarray, nested lists, ...).

    Returns
    -------
    list of float
        ``[accuracy, precision, recall, f1_score]``. A metric whose
        denominator is zero (e.g. precision when class 1 is never predicted)
        is reported as ``0.0`` instead of NaN or raising.
    """
    cm = np.asarray(confusion_matrix, dtype=float)

    def _ratio(numerator, denominator):
        # Guard against 0/0 so downstream tables never receive NaN.
        return float(numerator / denominator) if denominator else 0.0

    true_neg, false_pos = cm[0][0], cm[0][1]
    false_neg, true_pos = cm[1][0], cm[1][1]
    test_size = cm.sum()

    accuracy = _ratio(true_neg + true_pos, test_size)
    precision = _ratio(true_pos, true_pos + false_pos)
    recall = _ratio(true_pos, true_pos + false_neg)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    return [accuracy, precision, recall, f1_score]

In [6]:

# Number of repeated tune / fit / evaluate runs per classifier.
N = 10

# One row per run; 4 columns to match the 4 values Classification_Report returns.
score_table = np.zeros(shape=(N, 4))
# One 2x2 confusion matrix per run.
confM_table = np.zeros(shape=(N, 2, 2))

# Fitted models are pickled under <cwd>/hollink_reproduced_models/.
# The trailing '/' is kept because later cells append the file name directly.
cwd = os.getcwd()
model_folder = cwd + '/' + 'hollink_reproduced_models/'
# Create the folder up front so the first pickle.dump does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(model_folder, exist_ok=True)


# Random Forest

In [68]:

# Hyper-parameter grid searched for the random forest.
grid_param = {
    'max_depth': [3, 5, 10, 15, 20, 30],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

# Repeat the tune / refit / evaluate cycle N times. Each repetition writes one
# row of score_table and one slice of confM_table.
# NOTE(review): no random_state is passed to the forest, so repetitions (and
# notebook re-runs) can differ — confirm that this variability is intended.
for ind in range(N):
    rforestClassifier = RandomForestClassifier()
    gd_sr = GridSearchCV(estimator=rforestClassifier,
                         param_grid=grid_param,
                         scoring='accuracy',
                         cv=10,
                         n_jobs=-1)
    gd_sr.fit(X_train, y_train)
    best_parameters = gd_sr.best_params_

    # Fit a fresh forest on the whole training split with the winning settings.
    testforestClassifier = RandomForestClassifier(
        max_depth=best_parameters["max_depth"],
        criterion=best_parameters["criterion"],
        bootstrap=best_parameters["bootstrap"])
    testforestClassifier.fit(X_train, y_train)

    # Save the fitted model; the context manager guarantees the file handle is
    # flushed and closed (the bare open() previously leaked it).
    filename = model_folder + 'rf_' + str(ind + 1) + '.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(testforestClassifier, model_file)

    # Evaluate on the held-out split. labels=[0, 1] forces a 2x2 matrix even if
    # only one class shows up, so it always fits confM_table's (2, 2) slot.
    y_predict = testforestClassifier.predict(X_test)
    confM = confusion_matrix(y_test, y_predict, labels=[0, 1])
    aprf = Classification_Report(confM)
    confM_table[ind, :, :] = confM
    score_table[ind, :] = aprf

    

'\ngrid_param = {\n     \'max_depth\': [3,5,10,15,20,30],\n    \'criterion\': [\'gini\', \'entropy\'],\n    \'bootstrap\': [True, False]\n}\n\nfor ind in range(N):\n    \n\n    rforestClassifier= RandomForestClassifier()\n    gd_sr = GridSearchCV(estimator=rforestClassifier,\n                         param_grid=grid_param,\n                         scoring=\'accuracy\',\n                         cv=10,\n                         n_jobs=-1)\n\n    gd_sr.fit(X_train, y_train)\n    best_parameters = gd_sr.best_params_\n    testforestClassifier= RandomForestClassifier(max_depth=best_parameters["max_depth"], criterion=best_parameters["criterion"], bootstrap= best_parameters["bootstrap"])\n    testforestClassifier.fit(X_train, y_train)\n    #########################################################################\n    # Save model here\n    filename= model_folder+\'rf_\'+ str(ind+1)+\'.pickle\'\n    pickle.dump(testforestClassifier, open(filename, "wb"))\n    #################################

# Decision Tree Classifier

In [69]:

# Hyper-parameter grid searched for the decision tree.
grid_param = {
    'max_depth': [3, 5, 10, 15, 20, 30],
    'criterion': ['gini', 'entropy']
}
dTreeClassifier = DecisionTreeClassifier()

# Repeat the tune / refit / evaluate cycle N times. Each repetition writes one
# row of score_table and one slice of confM_table.
for ind in range(N):
    gd_sr = GridSearchCV(estimator=dTreeClassifier,
                         param_grid=grid_param,
                         scoring='accuracy',
                         cv=10,
                         n_jobs=-1)
    gd_sr.fit(X_train, y_train)
    best_parameters = gd_sr.best_params_

    # Fit a fresh tree on the whole training split with the winning settings.
    testTreeClassifier = DecisionTreeClassifier(
        max_depth=best_parameters["max_depth"],
        criterion=best_parameters["criterion"])
    testTreeClassifier.fit(X_train, y_train)

    # Save the fitted model; the context manager guarantees the file handle is
    # flushed and closed (the bare open() previously leaked it).
    filename = model_folder + 'dt_' + str(ind + 1) + '.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(testTreeClassifier, model_file)

    # Evaluate on the held-out split. labels=[0, 1] forces a 2x2 matrix even if
    # only one class shows up, so it always fits confM_table's (2, 2) slot.
    y_predict = testTreeClassifier.predict(X_test)
    confM = confusion_matrix(y_test, y_predict, labels=[0, 1])
    aprf = Classification_Report(confM)
    confM_table[ind, :, :] = confM
    score_table[ind, :] = aprf


'\ngrid_param = {\n     \'max_depth\': [3,5,10,15,20,30],\n    \'criterion\': [\'gini\', \'entropy\']\n}\ndTreeClassifier = DecisionTreeClassifier()\n\nfor ind in range(N):\n        gd_sr = GridSearchCV(estimator=dTreeClassifier,\n                             param_grid=grid_param,\n                             scoring=\'accuracy\',\n                             cv=10,\n                             n_jobs=-1)\n\n        gd_sr.fit(X_train, y_train)\n        best_parameters = gd_sr.best_params_\n        testTreeClassifier= DecisionTreeClassifier(max_depth=best_parameters["max_depth"], criterion=best_parameters["criterion"])\n        testTreeClassifier.fit(X_train, y_train)\n      \n        #########################################################################\n        # Save model here\n        filename= model_folder+\'dt_\'+ str(ind+1)+\'.pickle\'\n        pickle.dump(testTreeClassifier, open(filename, "wb"))\n        ##################################################################

# SVM

In [80]:


svmClassifier = SVC()
# Hyper-parameter grid searched for the support vector classifier.
grid_param = {
    "C": [0.1, 1, 10],
    'kernel': ['linear', "poly", "rbf", "sigmoid"]
}

# Repeat the tune / refit / evaluate cycle N times. The loop bound is N (not a
# literal 10) so it stays in step with the sizes of score_table / confM_table.
for ind in range(N):
    gd_sr = GridSearchCV(estimator=svmClassifier,
                         param_grid=grid_param,
                         scoring='accuracy',
                         cv=10,
                         n_jobs=-1)
    gd_sr.fit(X_train, y_train)
    best_parameters = gd_sr.best_params_

    # Fit a fresh SVC on the whole training split with the winning settings.
    testSVMClassifier = SVC(C=best_parameters["C"],
                            kernel=best_parameters["kernel"])
    testSVMClassifier.fit(X_train, y_train)

    # Save the fitted model; the context manager guarantees the file handle is
    # flushed and closed (the bare open() previously leaked it).
    filename = model_folder + 'svm_' + str(ind + 1) + '.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(testSVMClassifier, model_file)

    # Evaluate on the held-out split. labels=[0, 1] forces a 2x2 matrix even if
    # only one class shows up, so it always fits confM_table's (2, 2) slot.
    y_predict = testSVMClassifier.predict(X_test)
    confM = confusion_matrix(y_test, y_predict, labels=[0, 1])
    aprf = Classification_Report(confM)
    confM_table[ind, :, :] = confM
    score_table[ind, :] = aprf


# LDA

In [None]:


ldaClassifier = LinearDiscriminantAnalysis()
# Hyper-parameter grid searched for linear discriminant analysis.
grid_param = {
    "solver": ["svd", "lsqr", "eigen"]
}

# Repeat the tune / refit / evaluate cycle N times. Each repetition writes one
# row of score_table and one slice of confM_table.
for ind in range(N):
    gd_sr = GridSearchCV(estimator=ldaClassifier,
                         param_grid=grid_param,
                         scoring='accuracy',
                         cv=10,
                         n_jobs=-1)
    gd_sr.fit(X_train, y_train)
    best_parameters = gd_sr.best_params_

    # Fit a fresh LDA on the whole training split with the winning solver.
    testLDAClassifier = LinearDiscriminantAnalysis(
        solver=best_parameters["solver"])
    testLDAClassifier.fit(X_train, y_train)

    # Save the fitted model; the context manager guarantees the file handle is
    # flushed and closed (the bare open() previously leaked it).
    filename = model_folder + 'lda_' + str(ind + 1) + '.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(testLDAClassifier, model_file)

    # Evaluate on the held-out split. labels=[0, 1] forces a 2x2 matrix even if
    # only one class shows up, so it always fits confM_table's (2, 2) slot.
    y_predict = testLDAClassifier.predict(X_test)
    confM = confusion_matrix(y_test, y_predict, labels=[0, 1])
    aprf = Classification_Report(confM)
    confM_table[ind, :, :] = confM
    score_table[ind, :] = aprf


# KNN

In [7]:


knn = KNeighborsClassifier()
# Hyper-parameter grid searched for k-nearest neighbours.
grid_param = {
    'n_neighbors': [3, 5, 7, 11],  # Different values of K (number of neighbors)
    'weights': ['uniform', 'distance'],  # Weighting schemes for neighbors
}

# Repeat the tune / refit / evaluate cycle N times. Each repetition writes one
# row of score_table and one slice of confM_table.
for ind in range(N):
    gd_sr = GridSearchCV(estimator=knn,
                         param_grid=grid_param,
                         scoring='accuracy',
                         cv=10,
                         n_jobs=-1)
    gd_sr.fit(X_train, y_train)
    best_parameters = gd_sr.best_params_

    # Fit a fresh KNN on the whole training split with the winning settings.
    testKNNClassifier = KNeighborsClassifier(
        n_neighbors=best_parameters["n_neighbors"],
        weights=best_parameters["weights"])
    testKNNClassifier.fit(X_train, y_train)

    # Save the fitted model; the context manager guarantees the file handle is
    # flushed and closed (the bare open() previously leaked it).
    filename = model_folder + 'knn_' + str(ind + 1) + '.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(testKNNClassifier, model_file)

    # Evaluate on the held-out split. labels=[0, 1] forces a 2x2 matrix even if
    # only one class shows up, so it always fits confM_table's (2, 2) slot.
    y_predict = testKNNClassifier.predict(X_test)
    confM = confusion_matrix(y_test, y_predict, labels=[0, 1])
    aprf = Classification_Report(confM)
    confM_table[ind, :, :] = confM
    score_table[ind, :] = aprf
