In [None]:
# Load packages and libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix,accuracy_score,classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from tensorflow import keras
import random
import os
import pickle
import matplotlib.pyplot as plt

In [None]:
# Read the feature table into a DataFrame. Every column except the last one is
# treated as a feature and the last column as the label; the train/test split
# is done in a later cell.
fname = "mills_sph_d50_fs.csv"
data = pd.read_csv(fname, sep=",")
ncols = data.shape[1]   # total number of columns, label included
X = data.iloc[:, :-1]
y = data.iloc[:, -1]


In [None]:
# Wraps a fitted model and finds the decision threshold that maximizes TPR - FPR

class ClassifierWithThreshold:
    """Wrap a fitted binary classifier so predictions can use a custom score threshold.

    The wrapped ``model`` must provide ``predict`` and ``predict_proba``.
    Column 1 of ``predict_proba`` is used as the score of the positive class.
    """

    def __init__(self,model):
        self.model=model

    def predict(self, X, threshold=None):
        """Predict labels for ``X``.

        Parameters
        ----------
        X : samples accepted by the wrapped model.
        threshold : float, array-like or None
            ``None`` delegates to the wrapped model's own ``predict``.
            Otherwise a sample is labelled 1 when its positive-class score is
            ``>= threshold`` and 0 when it is not.

        Returns
        -------
        Whatever ``model.predict`` returns when ``threshold`` is None, else an
        integer array of 0/1 labels.
        """
        # `is None` rather than `== None`: the comparison must not be delegated
        # to the threshold object (an array-valued threshold would make the
        # `if` ambiguous and raise).
        if threshold is None:
            return self.model.predict(X)

        y_scores = self.model.predict_proba(X)[:, 1]
        return (y_scores >= threshold).astype(int)

    def threshold_from_optimal_f_score(self, X, y):
        """Return the ROC threshold that maximizes ``TPR - FPR`` on ``(X, y)``.

        Despite the method name, no F-score is computed here: the criterion is
        the difference between true-positive rate and false-positive rate.

        Returns
        -------
        (threshold, tpr_minus_fpr) : the selected threshold and the value of
        ``TPR - FPR`` reached at that threshold.
        """
        y_scores = self.model.predict_proba(X)[:, 1]
        fpr, tpr, thresholds = roc_curve(y, y_scores)

        # np.argmax returns the first maximum when several thresholds tie.
        optimal_idx = np.argmax(tpr-fpr)

        return thresholds[optimal_idx], tpr[optimal_idx] - fpr[optimal_idx]

In [None]:
# Hyper-parameter search, model fitting and confusion-matrix generation

# Model names accepted by ClassificationMethod. Checked up front so that a typo
# fails immediately with a clear message.
_SUPPORTED_MODELS = (
    "LogisticRegression",
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "SVC",
    "MLP",
)


def _search_space(model_name):
    """Return ``(estimator_class, fixed_kwargs, param_grid, final_only_kwargs)`` for a scikit-learn model.

    ``fixed_kwargs`` are used both during the grid search and for the final
    fit; ``final_only_kwargs`` are added only to the final estimator.
    """
    # Candidate weights for class label 1.
    class_weights = [{1: w} for w in (1.1, 1.2, 1.3, 1.4, 1.5)]
    c_values = [10**-3, 10**-2, 10**-1, 1, 10, 100]
    depths = [3, 5, 10, 15, 20, 30]
    criteria = ['gini', 'entropy']

    if model_name == "LogisticRegression":
        # The liblinear solver handles both the l1 and the l2 penalty.
        # Elastic-net is left out: it needs the saga solver plus an l1_ratio.
        grid = {'penalty': ["l1", "l2"], 'C': c_values, "class_weight": class_weights}
        return LogisticRegression, {"solver": "liblinear"}, grid, {}

    if model_name == "DecisionTreeClassifier":
        grid = {'max_depth': depths, 'criterion': criteria, "class_weight": class_weights}
        return DecisionTreeClassifier, {}, grid, {}

    if model_name == "RandomForestClassifier":
        grid = {'max_depth': depths, 'criterion': criteria,
                'bootstrap': [True, False], "class_weight": class_weights}
        return RandomForestClassifier, {}, grid, {}

    if model_name == "SVC":
        grid = {"C": c_values, 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
        # predict_proba is only needed on the final model, so probability
        # estimation is switched on for the final fit only.
        return SVC, {}, grid, {"probability": True}

    raise ValueError(f"No scikit-learn search space defined for model_name={model_name!r}")


def _fit_mlp(X_train, y_train):
    """Build and train the dense Keras network used for the "MLP" option."""
    from keras.layers import LeakyReLU

    # Take the input width from the training data itself, not from a notebook global.
    n_features = np.shape(X_train)[1]

    model = Sequential()
    model.add(Dense(120 ,input_dim=n_features, activation='relu'))
    model.add(Dense(80 ,activation=LeakyReLU(alpha=0.05)))
    model.add(Dense(80 ,activation='relu'))
    model.add(Dense(40 ,activation='relu'))
    model.add(Dense(20 ,activation=LeakyReLU(alpha=0.05)))
    model.add(Dense(20 ,activation='relu'))
    model.add(Dense(1 ,activation="sigmoid"))

    callback = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=20, validation_split=0.2, callbacks=[callback])
    return model


def _mlp_scores(model, X):
    """Return the network outputs for ``X`` flattened to a 1-D score array."""
    return np.asarray(model.predict(X)).ravel()


def _tpr_minus_fpr_threshold(y_true, scores):
    """Return the ROC threshold at which ``TPR - FPR`` is largest."""
    fpr, tpr, thresholds = roc_curve(y_true, scores)
    return thresholds[np.argmax(tpr - fpr)]


def ClassificationMethod(X_train, y_train,X_test,y_test,model_name:str, cv:int=5, scoring:str="accuracy",
                         X_val=None, y_val=None):
    """Train the requested model and return its test confusion matrix.

    For the scikit-learn models a ``GridSearchCV`` selects the hyper-parameters
    on the training data, and a final estimator is then fitted with *all* of
    the selected parameters. For ``"MLP"`` a fixed Keras network is trained.
    In both cases the decision threshold is the one maximizing ``TPR - FPR``.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : features and labels used for the confusion matrix.
    model_name : one of ``"LogisticRegression"``, ``"DecisionTreeClassifier"``,
        ``"RandomForestClassifier"``, ``"SVC"`` or ``"MLP"``.
    cv : number of cross-validation folds for the grid search (unused for MLP).
    scoring : scoring name passed to ``GridSearchCV`` (unused for MLP).
    X_val, y_val : optional data used to choose the decision threshold.
        NOTE(review): when omitted, the threshold is chosen on the test set
        itself (the original behaviour), so the reported scores are optimistic.
        Pass a held-out validation split to avoid that.

    Returns
    -------
    (confusion_matrix, model) : the confusion matrix of the thresholded test
    predictions and the fitted estimator / Keras model.

    Raises
    ------
    ValueError : unknown ``model_name``, or only one of ``X_val``/``y_val`` given.
    """
    if model_name not in _SUPPORTED_MODELS:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {', '.join(_SUPPORTED_MODELS)}"
        )
    if (X_val is None) != (y_val is None):
        raise ValueError("X_val and y_val must be provided together")

    # Data on which the decision threshold is tuned.
    if X_val is None:
        thr_X, thr_y = X_test, y_test
    else:
        thr_X, thr_y = X_val, y_val

    if model_name == "MLP":
        model = _fit_mlp(X_train, y_train)
        test_scores = _mlp_scores(model, X_test)
        # Reuse the test scores when the threshold is tuned on the test set.
        thr_scores = test_scores if X_val is None else _mlp_scores(model, X_val)
        threshold = _tpr_minus_fpr_threshold(thr_y, thr_scores)
        y_predict = (test_scores >= threshold).astype(int)
    else:
        estimator_cls, fixed_kwargs, grid_param, final_only_kwargs = _search_space(model_name)

        # refit=False: the final estimator is fitted explicitly below, so the
        # search does not need to refit the best candidate itself.
        gd_sr = GridSearchCV(estimator=estimator_cls(**fixed_kwargs),
                             param_grid=grid_param,
                             scoring=scoring,
                             cv=cv,
                             n_jobs=-1,
                             refit=False)
        gd_sr.fit(X_train, y_train)

        # Pass every tuned parameter on, so nothing found by the search is dropped.
        model = estimator_cls(**fixed_kwargs, **gd_sr.best_params_, **final_only_kwargs)
        model.fit(X_train, y_train)

        thresholded = ClassifierWithThreshold(model)
        threshold, _ = thresholded.threshold_from_optimal_f_score(thr_X, thr_y)
        y_predict = thresholded.predict(X_test, threshold)

    return confusion_matrix(y_test,y_predict), model

                                
       


In [None]:
# Accuracy, precision, recall and F1 score computed from a 2x2 confusion matrix
def Classification_Report(confusion_matrix):
    """Compute accuracy, precision, recall and F1 from a 2x2 confusion matrix.

    Parameters
    ----------
    confusion_matrix : array-like of shape (2, 2)
        Read as ``[[tn, fp], [fn, tp]]``: element ``[1][1]`` is the
        true-positive count, ``[0][1]`` the false positives and ``[1][0]`` the
        false negatives.

    Returns
    -------
    list of float : ``[accuracy, precision, recall, f1_score]``. A metric whose
    denominator is zero (e.g. precision when nothing was predicted positive)
    is reported as 0.0 instead of NaN.

    Raises
    ------
    ValueError : if the input is not a 2x2 matrix.
    """
    cm = np.asarray(confusion_matrix, dtype=float)
    if cm.shape != (2, 2):
        raise ValueError(f"expected a 2x2 confusion matrix, got shape {cm.shape}")

    (tn, fp), (fn, tp) = cm

    def _ratio(numerator, denominator):
        # Guard against empty denominators so one degenerate run does not
        # poison averaged score tables with NaN.
        return float(numerator / denominator) if denominator else 0.0

    accuracy = _ratio(tn + tp, cm.sum())
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    return [accuracy, precision, recall, f1_score]

In [None]:
# Train the selected model N times on one fixed train/test split and collect
# the confusion matrix and [accuracy, precision, recall, f1] of every run.
N=10
score_table = np.zeros(shape=(N,4))
confM_table = np.zeros(shape=(N,2,2))
cwd= os.getcwd()
# NOTE(review): absolute, machine-specific path; only used when SAVE_MODELS is True.
model_folder= "/auto/k2/aykut3/Basic_Level_NLP/emirhan_kod/mills_sph_k5_our_models/"

model_name="SVC" # Select the model name
SAVE_MODELS = False  # set to True to write every trained model to model_folder

# File-name prefix per model, so saved files are named after the model that
# produced them.
MODEL_FILE_PREFIX = {
    "LogisticRegression": "lr_",
    "DecisionTreeClassifier": "dt_",
    "RandomForestClassifier": "rf_",
    "SVC": "svm_",
    "MLP": "mlp_",
}

# The split uses a fixed random_state, so it is identical for every run and is
# computed once instead of inside the loop.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15,shuffle=True)

for r in range(N):
    confM, model = ClassificationMethod(X_train, y_train, X_test, y_test,
                                        model_name=model_name, cv=10, scoring="accuracy")
    print(r)

    # Models are numbered from 1; pickled models get a ".pickle" suffix.
    filename = model_folder + MODEL_FILE_PREFIX[model_name] + str(r+1)
    if model_name != "MLP":
        filename = filename + ".pickle"

    if SAVE_MODELS:
        os.makedirs(model_folder, exist_ok=True)
        if model_name == "MLP":
            # NOTE(review): newer Keras releases may require an explicit file
            # extension here; confirm against the installed version.
            model.save(filename)
        else:
            with open(filename, 'wb') as model_file:
                pickle.dump(model, model_file)

    aprf = Classification_Report(confM)
    confM_table[r,:,:] = confM
    score_table[r,:] = aprf

   

In [None]:
np.mean(score_table, axis=0)  # column-wise average of the scores over all runs

In [None]:
# Per-run scores: one row per run, columns are [accuracy, precision, recall, f1_score]
score_table

In [None]:
'''
To work with the already-trained models, load them and test them. The cells below
load the saved pre-trained models and evaluate them on the data.
'''

# Load Model

In [29]:
# Read the feature table into a DataFrame. Every column except the last one is
# treated as a feature and the last column as the label; the split into train
# and test happens in a later cell.
# NOTE(review): this file is read with sep=" " although it has a .csv extension;
# confirm the delimiter matches the file.
fname = "hollink_sph_k5_d50_fs.csv"
data = pd.read_csv(fname, sep=" ")
ncols = data.shape[1]   # total number of columns, label included
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X

Unnamed: 0,variances,average of distances,word_length,dim_4,dim_15,dim_29,dim_30,dim_38
0,-0.132160,1.230367,0.666691,0.130456,0.357828,0.734222,0.473118,0.151744
1,-0.140866,0.219362,-1.136010,0.363061,0.203963,0.541632,0.383208,0.221336
2,-0.648756,-1.158770,0.666691,0.025421,0.584410,0.953403,0.564589,0.359397
3,0.315375,1.397787,-1.136010,0.133144,0.623704,0.871840,0.547083,0.515146
4,-1.679824,0.131328,0.924220,0.238872,0.751556,0.437070,0.621916,0.392965
...,...,...,...,...,...,...,...,...
513,-0.004329,1.919992,-1.651067,0.295048,0.243623,0.483896,0.149568,0.578824
514,0.011744,1.747807,-1.136010,0.419581,0.397190,0.544186,0.313384,0.689214
515,1.204477,1.378808,-1.393539,0.553086,0.237050,0.183724,0.182269,0.226344
516,-1.049795,0.186689,0.151634,0.423051,0.000000,0.781844,0.275958,0.356652


In [47]:
def load_and_test(path,model_id,N,x_test,Y_test,mode=None):
    """Load ``N`` saved models and score each of them on ``(x_test, Y_test)``.

    Model ``r`` (1-based) is read from ``path + model_id + str(r)``. In
    ``"mlp"`` mode it is loaded with ``keras.models.load_model``; in every
    other mode ``".pickle"`` is appended and the file is unpickled. For each
    model the decision threshold maximizing ``TPR - FPR`` is chosen and the
    thresholded predictions are scored.

    NOTE(review): the threshold is tuned on the same data that is scored, so
    the reported metrics are optimistic.

    Parameters
    ----------
    path : folder prefix of the saved models (concatenated as-is, so it should
        end with a path separator).
    model_id : file-name prefix, e.g. ``"lr_"``.
    N : number of saved models to load; must be at least 1.
    x_test, Y_test : evaluation features and labels.
    mode : ``"mlp"`` for Keras models; anything else selects the pickle path.

    Returns
    -------
    (aprf, ConfM_table, Score_table, y_pred_avg) :
        ``aprf`` is ``[accuracy, precision, recall, f1]`` of the *last* model,
        ``ConfM_table`` has shape (N, 2, 2), ``Score_table`` has shape (N, 4),
        and ``y_pred_avg`` is the mean raw network output over the N models in
        ``"mlp"`` mode (0.0 in every other mode).

    Raises
    ------
    ValueError : if ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be at least 1")

    Score_table = np.zeros(shape=(N,4))
    ConfM_table = np.zeros(shape=(N,2,2))
    y_pred_sum = 0
    aprf = None

    for r in range(N):
        model_path = path + model_id + str(r+1)

        if mode == "mlp":
            network = keras.models.load_model(model_path)
            raw_pred = network.predict(x_test)   # predicted once, reused below
            y_pred_sum = raw_pred + y_pred_sum

            scores = np.asarray(raw_pred).ravel()
            fpr, tpr, thresholds = roc_curve(Y_test, scores)
            threshold = thresholds[np.argmax(tpr - fpr)]
            y_predict = (scores >= threshold).astype(int)
        else:
            # SECURITY: unpickling executes arbitrary code; only load model
            # files that come from a trusted source.
            with open(model_path + ".pickle", 'rb') as model_file:
                estimator = pickle.load(model_file)

            thresholded = ClassifierWithThreshold(estimator)
            threshold, _ = thresholded.threshold_from_optimal_f_score(x_test, Y_test)
            y_predict = thresholded.predict(x_test, threshold)

        confM = confusion_matrix(Y_test, y_predict)
        aprf = Classification_Report(confM)
        ConfM_table[r,:,:] = confM
        Score_table[r,:] = aprf

    return aprf, ConfM_table, Score_table, y_pred_sum/N


In [53]:

# Folder holding the saved models, relative to the current working directory.
cwd = os.getcwd()
model_folder = cwd + "/hollink_sph_k5_d50_our_models_v2/"

# 80/20 split with a fixed random_state so the test set is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15, shuffle=True
)

# Any mode other than "mlp" makes load_and_test read "<model_id><k>.pickle" files.
aprf, confM_table, score_table, y_pred_avg = load_and_test(
    path=model_folder,
    model_id="lr_",
    N=10,
    x_test=X_test,
    Y_test=y_test,
    mode="svm",
)

In [54]:
np.mean(score_table, axis=0)

array([0.75      , 0.6       , 0.89189189, 0.7173913 ])