In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
import numpy as np
import pandas as pd
import pickle
from sklearn import datasets

In [6]:
class DummyEstimator(BaseEstimator):
        def fit(self): pass
        def score(self): pass

In [7]:
class ModelSelector:

    def __init__(self, model_params, x_train, y_train):
        self.pipeline = Pipeline([('clf', DummyEstimator())])
        self.model_params = model_params
        self.x_train = x_train
        self.y_train = y_train
        self.score_summary
        
    def score_summary(self):
        gs = GridSearchCV(self.pipeline, self.model_params, n_jobs=5)
        gs.fit(self.x_train, self.y_train)
        frame = pd.DataFrame(gs.cv_results_)
        frame = frame.sort_values(['mean_test_score'], ascending=False)
        self.score_summary = frame
        return frame

In [18]:
class SupervisedModel:

    def __init__(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train
        
        self.clf = None
        self.accuracy = None

    def getBestModel(self, model:str = None):
    
        if model is None:
        
            model_param = [
                {'clf': [LogisticRegression(max_iter=1000)], 
                 'clf__penalty': ['l2'],
                 'clf__solver': ['newton-cg', 'lbfgs', 'liblinear'],
                 'clf__C': [10, 1.0, 0.1]},

                {'clf': [KNeighborsClassifier()], 
                 'clf__n_neighbors': [5,10,15,20,25],
                 'clf__metric': ['euclidean', 'manhattan', 'minkowski']
                },

                {'clf': [RandomForestClassifier()], 
                'clf__n_estimators': [10, 50, 100, 250, 500],
                'clf__max_depth': [2,5,10,12, None],
                'clf__max_samples': [0.2,0.4,0.6, None]
                },

                {'clf': [GaussianNB()], 
                 'clf__var_smoothing': np.logspace(0,-9, num=4)},

                {'clf': [SVC()], 
                 'clf__kernel': ['linear','poly','rbf'],
                 'clf__C': [10**-1, 10**1]
                }
            ]
                
    
        if model == 'logistic_regression':
        
            model_param = {'clf': [LogisticRegression(max_iter=1000)], 
                           'clf__penalty': ['l2'],
                           'clf__C': [1,0.5,10]
                           }
                           
        if model == 'random_forest':
            
            model_param = {'clf': [RandomForestClassifier()], 
                           'clf__n_estimators': [10, 50, 100, 250],
                           'clf__max_depth': [2,5,10,12, None],
                           'clf__max_samples': [0.2,0.4,0.6, None]
                           }
                           
        if model == 'knn':
        
            model_param = {'clf': [KNeighborsClassifier()], 
                           'clf__n_neighbors': [5,10,15,20,25]
                           }
                          
        if model == 'gaussian_nb':
        
            model_param = {'clf': [GaussianNB()], 
                           'clf__var_smoothing': [0, -3]
                           }
        
        if model == 'svm':
        
            model_param = {'clf': [SVC()], 
                           'clf__kernel': ['rbf'],
                           'clf__C': [10**-1, 10**1]
                           }
                           
        
        modelSelector = ModelSelector(model_param, self.x_train, self.y_train)
        return modelSelector.score_summary()

In [9]:
with open("../features/oversampled_count_vectorizer.pkl",'rb') as f:
    cv = pickle.load(f)
cv_y = cv['categories']
cv.drop(columns=['categories'], inplace=True)

In [10]:
supervisedModel = SupervisedModel(cv, cv_y)


In [11]:
print("Working on Logistic Regression.....")
result_lr = supervisedModel.getBestModel(model='logistic_regression')
result_lr.to_csv("../results/supervised/cv_logistic_regression.csv", index=False)
result_lr

Working on Logistic Regression.....


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__C,param_clf__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,559.344515,3.004255,0.408443,0.155621,"LogisticRegression(C=10, max_iter=1000)",10.0,l2,"{'clf': LogisticRegression(C=10, max_iter=1000...",0.842464,0.857536,0.849352,0.562201,0.68401,0.759113,0.117637,1
0,529.528347,27.331125,1.258396,0.115615,"LogisticRegression(C=10, max_iter=1000)",1.0,l2,"{'clf': LogisticRegression(C=10, max_iter=1000...",0.85316,0.868395,0.858833,0.549801,0.641219,0.754282,0.13291,2
1,394.272959,16.384409,0.739185,0.138971,"LogisticRegression(C=10, max_iter=1000)",0.5,l2,"{'clf': LogisticRegression(C=10, max_iter=1000...",0.859481,0.871394,0.862966,0.541049,0.622498,0.751478,0.14099,3


In [12]:
print("Working on Random Forest.....")
result_rf = supervisedModel.getBestModel(model='random_forest')
result_rf.to_csv("../results/supervised/cv_random_forest.csv", index=False)
result_rf

Working on Random Forest.....


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__max_depth,param_clf__max_samples,param_clf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
79,207.330096,3.556949,3.795626,0.908951,RandomForestClassifier(n_estimators=250),,,250,{'clf': RandomForestClassifier(n_estimators=25...,0.846272,0.873825,0.863614,0.588540,0.570224,0.748495,0.138482,1
78,86.418430,1.988623,1.618516,0.172441,RandomForestClassifier(n_estimators=250),,,100,{'clf': RandomForestClassifier(n_estimators=25...,0.842788,0.873501,0.862075,0.589189,0.557176,0.744946,0.140951,2
77,43.991679,1.152513,0.795719,0.048425,RandomForestClassifier(n_estimators=250),,,50,{'clf': RandomForestClassifier(n_estimators=25...,0.833306,0.863776,0.855592,0.586433,0.561310,0.740083,0.136309,3
74,61.020923,1.266365,1.359138,0.168728,RandomForestClassifier(n_estimators=250),,0.6,100,{'clf': RandomForestClassifier(n_estimators=25...,0.834198,0.863209,0.857699,0.567226,0.533187,0.731104,0.148414,4
75,152.447080,3.827303,4.187943,0.441656,RandomForestClassifier(n_estimators=250),,0.6,250,{'clf': RandomForestClassifier(n_estimators=25...,0.837358,0.868314,0.858995,0.566902,0.520626,0.730439,0.153450,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,2.203373,0.040429,0.153955,0.006290,RandomForestClassifier(n_estimators=250),2,,50,{'clf': RandomForestClassifier(n_estimators=25...,0.364506,0.499595,0.421070,0.195073,0.000000,0.296049,0.178655,76
8,1.295472,0.029348,0.110945,0.005817,RandomForestClassifier(n_estimators=250),2,0.6,10,{'clf': RandomForestClassifier(n_estimators=25...,0.402269,0.475284,0.427553,0.160062,0.000081,0.293050,0.182617,77
12,1.294597,0.021864,0.103542,0.002247,RandomForestClassifier(n_estimators=250),2,,10,{'clf': RandomForestClassifier(n_estimators=25...,0.375284,0.379173,0.485575,0.183807,0.000567,0.284881,0.172274,78
4,1.236327,0.023860,0.099981,0.006394,RandomForestClassifier(n_estimators=250),2,0.4,10,{'clf': RandomForestClassifier(n_estimators=25...,0.342382,0.467990,0.330389,0.146932,0.002026,0.257944,0.163899,79


In [13]:
print("Working on KNN.....")
result_knn = supervisedModel.getBestModel(model='knn')
result_knn.to_csv("../results/supervised/cv_knn.csv", index=False)
result_knn

Working on KNN.....


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.926054,0.009145,63.602984,0.459177,KNeighborsClassifier(),5,"{'clf': KNeighborsClassifier(), 'clf__n_neighb...",0.498622,0.518639,0.548703,0.264446,0.285031,0.423088,0.122345,1
1,0.972702,0.107058,44.019371,2.803238,KNeighborsClassifier(),10,"{'clf': KNeighborsClassifier(), 'clf__n_neighb...",0.463857,0.491572,0.523582,0.230002,0.222546,0.386312,0.132051,2
2,1.247784,0.114736,41.41844,2.81973,KNeighborsClassifier(),15,"{'clf': KNeighborsClassifier(), 'clf__n_neighb...",0.437763,0.470259,0.50624,0.215982,0.204717,0.366992,0.129769,3
3,1.286967,0.301525,49.182856,3.938136,KNeighborsClassifier(),20,"{'clf': KNeighborsClassifier(), 'clf__n_neighb...",0.424635,0.459076,0.497002,0.210957,0.185509,0.355436,0.130629,4
4,3.662989,1.230794,178.055218,6.517874,KNeighborsClassifier(),25,"{'clf': KNeighborsClassifier(), 'clf__n_neighb...",0.406726,0.447974,0.486548,0.201313,0.171003,0.342713,0.130648,5


In [14]:
print("Working on Gaussian NB.....")
result_nb = supervisedModel.getBestModel(model='gaussian_nb')
result_nb.to_csv("../results/supervised/cv_gaussian_nb.csv", index=False)
result_nb

Working on Gaussian NB.....


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,9.348581,0.497576,5.782762,1.171772,GaussianNB(var_smoothing=0),0,"{'clf': GaussianNB(var_smoothing=0), 'clf__var...",0.200324,0.194165,0.144652,0.175136,0.0,0.142855,0.074011,1
1,7.270507,0.309858,2.416506,0.190454,GaussianNB(var_smoothing=0),-3,"{'clf': GaussianNB(var_smoothing=0), 'clf__var...",0.200324,0.194165,0.144652,0.175136,0.0,0.142855,0.074011,1


In [None]:
print("Working on SVM.....")
result_svm = supervisedModel.getBestModel(model='svm')
result_svm.to_csv("../results/supervised/cv_svm.csv", index=False)
result_svm

Working on SVM.....
