In [112]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt

In [113]:
# Data sources list: SamplingApproach(num) reads its CSV files from folder[num]
folder = ['Amazon','FlipKart','Combine','Walmart']
class SamplingApproach:
    """Text-classification experiments on one data source, with optional oversampling.

    The pipeline implemented by the methods is: read the train/test CSV files
    (openFile), turn the text into TF-IDF vectors (tfidfConvert), optionally
    oversample the training vectors (ros / SMOTE), then keep the best chi2
    features and fit/evaluate a classifier (MNBC / SVC).
    """

    def __init__(self,num):
        """Remember which entry of the module-level ``folder`` list to use.

        Args:
            num: index into ``folder`` (0='Amazon', 1='FlipKart', 2='Combine',
                3='Walmart').
        """
        self.num = num

    @staticmethod
    def _alignAndDropMissing(data, label, textColumn):
        """Return (text, category) with rows whose text is NaN removed from both."""
        # Concatenating on the shared row index keeps each description next to
        # its label, so dropping a row removes it from both Series at once.
        df = pd.concat([data[[textColumn]], label[['category']]], axis = 1)
        df = df.dropna(subset=[textColumn])
        return df[textColumn], df['category']

    # initialise the train and test data
    def openFile(self):
        """Read X_train/y_train/X_test/y_test CSV files of the selected source.

        Rows whose text column ('X_train' / 'X_test') is NaN are dropped
        together with their label for every data source. Previously this was
        done only for sources 0 and 2, so a missing description in any other
        source was handed on to the vectorizer unchanged.

        Returns:
            Tuple ``(X_train, y_train, X_test, y_test)`` of pandas Series: the
            text columns and the matching 'category' columns. The original row
            index is kept (it is not reset after dropping rows).
        """
        num = self.num
        print("----- OPEN "+folder[num]+" data ----- \n")
        trainData = pd.read_csv(folder[num]+'/X_train.csv')
        trainLabel = pd.read_csv(folder[num]+'/y_train.csv')
        testData = pd.read_csv(folder[num]+'/X_test.csv')
        testLabel = pd.read_csv(folder[num]+'/y_test.csv')
        X_train, y_train = self._alignAndDropMissing(trainData, trainLabel, 'X_train')
        X_test, y_test = self._alignAndDropMissing(testData, testLabel, 'X_test')
        return X_train,y_train,X_test,y_test

    def tfidfConvert(self,Xtrain, Xtest):
        """Vectorize the text with TF-IDF (min_df=5, max_df=0.7).

        The vectorizer is fitted on ``Xtrain`` only and then applied to
        ``Xtest``, so the test documents do not influence the vocabulary.

        Returns:
            Tuple ``(train_matrix, test_matrix, total_feature)`` where
            ``total_feature`` is the number of columns of the train matrix.
        """
        tfidfconverter = TfidfVectorizer(min_df=5, max_df=0.7)
        print("----- Convert train data and test data to vector ----- \n")
        originTrainFeatures = tfidfconverter.fit_transform(Xtrain)
        originTestFeatures = tfidfconverter.transform(Xtest)
        total_feature = originTrainFeatures.shape[1]
        print("----- Total # features: "+str(total_feature)+" ----- \n")
        return originTrainFeatures,originTestFeatures,total_feature

    def selectBestfeatureViaChi2(self,Xtrain, Xtest,yTrain,num):
        """Keep the ``num`` features with the highest chi2 score.

        The selector is fitted on ``Xtrain``/``yTrain`` and the same columns
        are then taken from ``Xtest``.

        Args:
            num: number of features to keep, or the string ``'all'``. A number
                larger than the available feature count is capped to that
                count, because scikit-learn releases that validate ``k``
                reject such a request.

        Returns:
            Tuple ``(select_feature, test_features)``: reduced train and test
            matrices.
        """
        print("----- Select Best "+str(num)+" features ----- \n")
        k = num
        if not isinstance(num, str):
            k = min(num, np.shape(Xtrain)[1])
        selectBest = SelectKBest(chi2, k=k).fit(Xtrain, yTrain)
        select_feature = selectBest.transform(Xtrain)
        test_features = selectBest.transform(Xtest)
        return select_feature, test_features

    def ros (self, Xtrain, Ytrain):
        """Oversample with RandomOverSampler(random_state=42).

        Returns:
            Tuple ``(X_resampled, y_resampled)``.
        """
        sampler = RandomOverSampler(random_state=42)
        X_resampled, y_resampled = sampler.fit_resample(Xtrain, Ytrain)
        return X_resampled, y_resampled

    def SMOTE (self,Xtrain, Ytrain, sampling_strategy='minority', k_neighbors=5):
        """Oversample with SMOTE(random_state=42).

        Args:
            sampling_strategy: forwarded to SMOTE. The default 'minority'
                is the value this method always used.
            k_neighbors: forwarded to SMOTE. The default 5 is the value this
                method always used.

        Returns:
            Tuple ``(X_resampled, y_resampled)``.
        """
        # Inside a method body the bare name SMOTE resolves to the module-level
        # imported class, not to this method (class attributes are not in the
        # enclosing scope of a function).
        smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy,k_neighbors=k_neighbors)
        X_resampled, y_resampled = smote.fit_resample(Xtrain, Ytrain)
        return X_resampled,y_resampled

    def _fitAndReport(self, classifier, banner, Xtrain, Xtest, Ytrain, Ytest, num):
        """Select features, fit ``classifier``, print the report, return accuracy."""
        select_feature, test_features = self.selectBestfeatureViaChi2(Xtrain, Xtest,Ytrain,num)
        print(banner)
        classifier.fit(select_feature,Ytrain)
        # Report on the classes seen in training, in sorted order.
        main_category = np.unique(Ytrain)
        # Predict once and reuse the result for both the report and the score.
        y_pred = classifier.predict(test_features)
        print(classification_report(Ytest, y_pred,labels=main_category))
        return float(np.mean(y_pred == np.asarray(Ytest)))

    def MNBC(self, Xtrain, Xtest, Ytrain, Ytest,num):
        """Evaluate a ComplementNB classifier on the ``num`` best chi2 features.

        Prints the per-class classification report for the test set.

        Returns:
            Accuracy on the test set (fraction of correct predictions).
        """
        return self._fitAndReport(ComplementNB(), "----- CNBC fitting -----",
                                  Xtrain, Xtest, Ytrain, Ytest, num)

    def SVC(self,  Xtrain, Xtest, Ytrain, Ytest,num):
        """Evaluate LinearSVC(random_state=42, class_weight="balanced") on the ``num`` best chi2 features.

        Prints the per-class classification report for the test set.

        Returns:
            Accuracy on the test set (fraction of correct predictions).
        """
        svc = LinearSVC(random_state=42,class_weight="balanced")
        return self._fitAndReport(svc, "----- SVC fitting -----",
                                  Xtrain, Xtest, Ytrain, Ytest, num)

In [114]:
# Experiment on data source 0, i.e. folder[0] == 'Amazon'.
sa = SamplingApproach(0)

In [115]:
# Load the Amazon text columns and their 'category' labels.
X_train,y_train,X_test,y_test = sa.openFile()

----- OPEN Amazon data ----- 



In [116]:
# TF-IDF vectors: vocabulary fitted on the train text, then applied to the test text.
originTrainFeatures,originTestFeatures,total_feature = sa.tfidfConvert(X_train,X_test)

----- Convert train data and test data to vector ----- 

----- Total # features: 5927 ----- 



In [117]:
# Baseline: ComplementNB on the non-resampled training vectors, 5000 best chi2 features.
sa.MNBC(originTrainFeatures,originTestFeatures,y_train,y_test,5000)

----- Select Best 5000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       143
           1       1.00      0.50      0.67         4
           2       0.81      0.51      0.63       181
           3       0.80      0.82      0.81       233
           4       0.73      0.87      0.79        46
           5       0.87      0.88      0.88       113
           6       0.73      0.81      0.77       208
           7       0.80      0.87      0.83       165
           8       0.78      0.74      0.76       260
           9       0.66      0.78      0.72        50
          10       0.85      0.91      0.88       109
          11       0.67      0.20      0.31        10
          12       0.86      0.98      0.91        44
          13       0.84      0.76      0.80        67

   micro avg       0.79      0.79      0.79      1633
   macro avg       0.80      0.75      0.76      1633
weighted avg   

0.7930189834660135

In [118]:
# Random oversampling of the training vectors only; the test vectors are left as they are.
X_resampled, y_resampled = sa.ros(originTrainFeatures,y_train)
sa.MNBC(X_resampled,originTestFeatures,y_resampled,y_test,5000)

----- Select Best 5000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       143
           1       0.13      1.00      0.24         4
           2       0.78      0.48      0.59       181
           3       0.79      0.81      0.80       233
           4       0.63      0.83      0.72        46
           5       0.81      0.85      0.83       113
           6       0.74      0.71      0.73       208
           7       0.81      0.82      0.82       165
           8       0.81      0.66      0.73       260
           9       0.57      0.86      0.68        50
          10       0.85      0.91      0.88       109
          11       0.29      0.80      0.42        10
          12       0.81      0.98      0.89        44
          13       0.68      0.79      0.73        67

   micro avg       0.76      0.76      0.76      1633
   macro avg       0.68      0.81      0.71      1633
weighted avg   

0.7562767911818739

In [119]:
# SMOTE (sampling_strategy='minority' inside sa.SMOTE) on the training vectors, then ComplementNB.
X_resampled, y_resampled = sa.SMOTE(originTrainFeatures,y_train)
sa.MNBC(X_resampled,originTestFeatures,y_resampled,y_test,5000)

----- Select Best 5000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       143
           1       0.08      1.00      0.14         4
           2       0.81      0.51      0.63       181
           3       0.80      0.80      0.80       233
           4       0.82      0.89      0.85        46
           5       0.88      0.88      0.88       113
           6       0.73      0.79      0.76       208
           7       0.82      0.84      0.83       165
           8       0.79      0.76      0.77       260
           9       0.73      0.70      0.71        50
          10       0.84      0.91      0.87       109
          11       0.50      0.10      0.17        10
          12       0.90      0.98      0.93        44
          13       0.83      0.73      0.78        67

   micro avg       0.78      0.78      0.78      1633
   macro avg       0.74      0.77      0.71      1633
weighted avg   

0.7783221065523577

In [120]:
# Baseline: class-weighted LinearSVC on the non-resampled training vectors, 3000 best chi2 features.
sa.SVC(originTrainFeatures,originTestFeatures,y_train,y_test,3000)

----- Select Best 3000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       143
           1       0.43      0.75      0.55         4
           2       0.76      0.65      0.70       181
           3       0.85      0.87      0.86       233
           4       0.79      0.91      0.85        46
           5       0.86      0.88      0.87       113
           6       0.75      0.81      0.78       208
           7       0.82      0.84      0.83       165
           8       0.78      0.76      0.77       260
           9       0.77      0.80      0.78        50
          10       0.93      0.94      0.94       109
          11       0.62      0.50      0.56        10
          12       0.93      0.98      0.96        44
          13       0.82      0.73      0.77        67

   micro avg       0.82      0.82      0.82      1633
   macro avg       0.79      0.81      0.79      1633
weighted avg    

0.8162890385793019

In [121]:
# Random oversampling of the training vectors, then LinearSVC.
X_resampled, y_resampled = sa.ros(originTrainFeatures,y_train)
sa.SVC(X_resampled,originTestFeatures,y_resampled,y_test,3000)

----- Select Best 3000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.87      0.88      0.87       143
           1       0.50      0.75      0.60         4
           2       0.74      0.66      0.70       181
           3       0.83      0.85      0.84       233
           4       0.84      0.89      0.86        46
           5       0.85      0.86      0.85       113
           6       0.76      0.80      0.78       208
           7       0.80      0.84      0.82       165
           8       0.76      0.76      0.76       260
           9       0.80      0.80      0.80        50
          10       0.90      0.92      0.91       109
          11       0.67      0.60      0.63        10
          12       0.95      0.93      0.94        44
          13       0.81      0.72      0.76        67

   micro avg       0.81      0.81      0.81      1633
   macro avg       0.79      0.80      0.80      1633
weighted avg    

0.8089406001224739

In [122]:
# SMOTE (sampling_strategy='minority' inside sa.SMOTE) on the training vectors, then LinearSVC.
X_resampled, y_resampled = sa.SMOTE(originTrainFeatures,y_train)
sa.SVC(X_resampled,originTestFeatures,y_resampled,y_test,3000)

----- Select Best 3000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       143
           1       0.43      0.75      0.55         4
           2       0.75      0.65      0.69       181
           3       0.85      0.86      0.85       233
           4       0.78      0.91      0.84        46
           5       0.86      0.89      0.87       113
           6       0.76      0.80      0.78       208
           7       0.82      0.83      0.82       165
           8       0.78      0.77      0.78       260
           9       0.78      0.80      0.79        50
          10       0.93      0.94      0.93       109
          11       0.67      0.60      0.63        10
          12       0.93      0.98      0.96        44
          13       0.82      0.73      0.77        67

   micro avg       0.82      0.82      0.82      1633
   macro avg       0.79      0.81      0.80      1633
weighted avg    

0.8150642988364972

In [123]:
# Experiment on data source 1, i.e. folder[1] == 'FlipKart'.
sa = SamplingApproach(1)

In [124]:
# Load the FlipKart text columns and their 'category' labels (overwrites the Amazon variables).
X_train,y_train,X_test,y_test = sa.openFile()

----- OPEN FlipKart data ----- 



In [125]:
# TF-IDF vectors: vocabulary fitted on the train text, then applied to the test text.
originTrainFeatures,originTestFeatures,total_feature = sa.tfidfConvert(X_train,X_test)

----- Convert train data and test data to vector ----- 

----- Total # features: 4742 ----- 



In [126]:
# Baseline: ComplementNB on the non-resampled training vectors, 4000 best chi2 features.
sa.MNBC(originTrainFeatures,originTestFeatures,y_train,y_test,4000)

----- Select Best 4000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       163
           1       1.00      0.59      0.74        82
           2       1.00      0.94      0.97        35
           3       0.98      0.92      0.95        95
           4       1.00      1.00      1.00        13
           5       0.97      1.00      0.98      1231
           6       0.97      0.89      0.93       105
           7       1.00      1.00      1.00       208
           8       1.00      0.91      0.95        11
           9       0.99      0.97      0.98       117
          10       0.99      0.99      0.99        84
          11       0.97      1.00      0.98       487
          12       1.00      0.96      0.98       107
          13       0.95      0.99      0.97       197
          14       0.93      0.50      0.65        28
          15       1.00      0.99      0.99        67
          16    

0.9706262104583603

In [127]:
# Random oversampling of the training vectors only, then ComplementNB.
X_resampled, y_resampled = sa.ros(originTrainFeatures,y_train)
sa.MNBC(X_resampled,originTestFeatures,y_resampled,y_test,4000)

----- Select Best 4000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       163
           1       0.54      0.96      0.70        82
           2       0.92      1.00      0.96        35
           3       0.96      0.98      0.97        95
           4       0.76      1.00      0.87        13
           5       1.00      0.92      0.96      1231
           6       0.96      0.90      0.93       105
           7       0.98      1.00      0.99       208
           8       1.00      1.00      1.00        11
           9       0.96      0.96      0.96       117
          10       0.94      0.99      0.97        84
          11       1.00      1.00      1.00       487
          12       0.93      0.98      0.95       107
          13       0.94      0.98      0.96       197
          14       0.67      0.64      0.65        28
          15       0.99      0.99      0.99        67
          16    

0.9528728211749515

In [128]:
# SMOTE (sampling_strategy='minority' inside sa.SMOTE) on the training vectors, then ComplementNB.
X_resampled, y_resampled = sa.SMOTE(originTrainFeatures,y_train)
sa.MNBC(X_resampled,originTestFeatures,y_resampled,y_test,4000)

----- Select Best 4000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       163
           1       1.00      0.51      0.68        82
           2       1.00      0.91      0.96        35
           3       0.99      0.85      0.92        95
           4       0.42      1.00      0.59        13
           5       0.96      1.00      0.98      1231
           6       0.98      0.89      0.93       105
           7       1.00      0.99      1.00       208
           8       1.00      0.91      0.95        11
           9       0.99      0.96      0.97       117
          10       0.99      0.99      0.99        84
          11       0.98      1.00      0.99       487
          12       1.00      0.96      0.98       107
          13       0.95      0.99      0.97       197
          14       0.93      0.50      0.65        28
          15       1.00      0.99      0.99        67
          16    

0.9651387992253067

In [129]:
# Baseline: class-weighted LinearSVC on the non-resampled training vectors, 4000 best chi2 features.
sa.SVC(originTrainFeatures,originTestFeatures,y_train,y_test,4000)

----- Select Best 4000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       163
           1       0.94      0.96      0.95        82
           2       1.00      0.97      0.99        35
           3       1.00      1.00      1.00        95
           4       1.00      1.00      1.00        13
           5       1.00      1.00      1.00      1231
           6       0.99      0.96      0.98       105
           7       1.00      1.00      1.00       208
           8       1.00      0.91      0.95        11
           9       0.99      1.00      1.00       117
          10       1.00      0.99      0.99        84
          11       1.00      1.00      1.00       487
          12       1.00      1.00      1.00       107
          13       0.98      0.99      0.99       197
          14       0.78      0.75      0.76        28
          15       1.00      1.00      1.00        67
          16     

0.9903163331181407

In [130]:
# Random oversampling of the training vectors, then LinearSVC.
X_resampled, y_resampled = sa.ros(originTrainFeatures,y_train)
sa.SVC(X_resampled,originTestFeatures,y_resampled,y_test,4000)

----- Select Best 4000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       163
           1       0.94      0.96      0.95        82
           2       1.00      0.97      0.99        35
           3       1.00      1.00      1.00        95
           4       1.00      1.00      1.00        13
           5       1.00      1.00      1.00      1231
           6       0.99      0.96      0.98       105
           7       1.00      1.00      1.00       208
           8       1.00      0.91      0.95        11
           9       0.98      1.00      0.99       117
          10       1.00      0.99      0.99        84
          11       1.00      1.00      1.00       487
          12       1.00      1.00      1.00       107
          13       0.98      0.99      0.99       197
          14       0.76      0.89      0.82        28
          15       1.00      1.00      1.00        67
          16     

0.9909619109102646

In [131]:
# SMOTE (sampling_strategy='minority' inside sa.SMOTE) on the training vectors, then LinearSVC.
X_resampled, y_resampled = sa.SMOTE(originTrainFeatures,y_train)
sa.SVC(X_resampled,originTestFeatures,y_resampled,y_test,4000)

----- Select Best 4000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       163
           1       0.94      0.96      0.95        82
           2       1.00      0.97      0.99        35
           3       1.00      1.00      1.00        95
           4       1.00      1.00      1.00        13
           5       1.00      1.00      1.00      1231
           6       0.99      0.96      0.98       105
           7       1.00      1.00      1.00       208
           8       1.00      0.91      0.95        11
           9       0.99      1.00      1.00       117
          10       1.00      0.99      0.99        84
          11       1.00      1.00      1.00       487
          12       1.00      1.00      1.00       107
          13       0.98      0.99      0.99       197
          14       0.78      0.75      0.76        28
          15       1.00      1.00      1.00        67
          16     

0.9903163331181407

In [132]:
# Experiment on data source 2, i.e. folder[2] == 'Combine'.
sa = SamplingApproach(2)

In [133]:
# Load the Combine text columns and their 'category' labels (overwrites the FlipKart variables).
X_train,y_train,X_test,y_test = sa.openFile()

----- OPEN Combine data ----- 



In [134]:
# TF-IDF vectors: vocabulary fitted on the train text, then applied to the test text.
originTrainFeatures,originTestFeatures,total_feature = sa.tfidfConvert(X_train,X_test)

----- Convert train data and test data to vector ----- 

----- Total # features: 9848 ----- 



In [135]:
# Baseline: ComplementNB on the non-resampled training vectors, 6000 best chi2 features.
sa.MNBC(originTrainFeatures,originTestFeatures,y_train,y_test,6000)

----- Select Best 6000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       191
           1       0.91      0.45      0.60        92
           2       0.96      0.48      0.64        94
           3       0.93      0.83      0.88       145
           4       0.94      0.98      0.96       346
           5       0.97      0.97      0.97      2536
           6       0.95      0.95      0.95       613
           7       0.91      0.97      0.94      1635
           8       0.97      0.73      0.84        52

   micro avg       0.95      0.95      0.95      5704
   macro avg       0.94      0.81      0.86      5704
weighted avg       0.95      0.95      0.94      5704



0.947054698457223

In [136]:
# Random oversampling of the training vectors only, then ComplementNB.
X_resampled, y_resampled = sa.ros(originTrainFeatures,y_train)
sa.MNBC(X_resampled,originTestFeatures,y_resampled,y_test,6000)

----- Select Best 6000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       191
           1       0.47      0.86      0.61        92
           2       0.36      0.88      0.51        94
           3       0.73      0.96      0.83       145
           4       0.88      0.98      0.93       346
           5       0.99      0.92      0.95      2536
           6       0.96      0.90      0.93       613
           7       0.97      0.87      0.92      1635
           8       0.41      0.98      0.58        52

   micro avg       0.91      0.91      0.91      5704
   macro avg       0.74      0.92      0.80      5704
weighted avg       0.94      0.91      0.92      5704



0.9079593267882188

In [137]:
# SMOTE (sampling_strategy='minority' inside sa.SMOTE) on the training vectors, then ComplementNB.
X_resampled, y_resampled = sa.SMOTE(originTrainFeatures,y_train)
sa.MNBC(X_resampled,originTestFeatures,y_resampled,y_test,6000)

----- Select Best 6000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       191
           1       0.87      0.45      0.59        92
           2       0.96      0.48      0.64        94
           3       0.95      0.75      0.84       145
           4       0.94      0.97      0.96       346
           5       0.96      0.98      0.97      2536
           6       0.96      0.93      0.94       613
           7       0.93      0.95      0.94      1635
           8       0.57      0.98      0.72        52

   micro avg       0.94      0.94      0.94      5704
   macro avg       0.90      0.82      0.83      5704
weighted avg       0.94      0.94      0.94      5704



0.9412692847124825

In [138]:
# Baseline: class-weighted LinearSVC on the non-resampled training vectors, 6000 best chi2 features.
sa.SVC(originTrainFeatures,originTestFeatures,y_train,y_test,6000)

----- Select Best 6000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       191
           1       0.82      0.76      0.79        92
           2       0.88      0.86      0.87        94
           3       0.97      0.96      0.96       145
           4       0.98      0.99      0.98       346
           5       0.99      0.99      0.99      2536
           6       0.98      0.98      0.98       613
           7       0.97      0.98      0.97      1635
           8       0.98      0.94      0.96        52

   micro avg       0.98      0.98      0.98      5704
   macro avg       0.95      0.94      0.94      5704
weighted avg       0.98      0.98      0.98      5704



0.978085553997195

In [139]:
# Random oversampling of the training vectors, then LinearSVC.
X_resampled, y_resampled = sa.ros(originTrainFeatures,y_train)
sa.SVC(X_resampled,originTestFeatures,y_resampled,y_test,6000)

----- Select Best 6000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       191
           1       0.82      0.74      0.78        92
           2       0.84      0.89      0.87        94
           3       0.96      0.96      0.96       145
           4       0.98      0.99      0.98       346
           5       0.99      0.99      0.99      2536
           6       0.97      0.98      0.97       613
           7       0.97      0.98      0.97      1635
           8       0.91      0.92      0.91        52

   micro avg       0.98      0.98      0.98      5704
   macro avg       0.94      0.94      0.94      5704
weighted avg       0.98      0.98      0.98      5704



0.9761570827489481

In [140]:
# SMOTE (sampling_strategy='minority' inside sa.SMOTE) on the training vectors, then LinearSVC.
X_resampled, y_resampled = sa.SMOTE(originTrainFeatures,y_train)
sa.SVC(X_resampled,originTestFeatures,y_resampled,y_test,6000)

----- Select Best 6000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       191
           1       0.83      0.75      0.79        92
           2       0.90      0.87      0.89        94
           3       0.97      0.96      0.96       145
           4       0.98      0.99      0.98       346
           5       0.99      0.99      0.99      2536
           6       0.97      0.98      0.98       613
           7       0.97      0.97      0.97      1635
           8       0.72      0.98      0.83        52

   micro avg       0.98      0.98      0.98      5704
   macro avg       0.92      0.94      0.93      5704
weighted avg       0.98      0.98      0.98      5704



0.9756311360448808

In [141]:
# Experiment on data source 3, i.e. folder[3] == 'Walmart'.
sa = SamplingApproach(3)

In [142]:
# Load the Walmart text columns and their 'category' labels (overwrites the Combine variables).
X_train,y_train,X_test,y_test = sa.openFile()

----- OPEN Walmart data ----- 



In [143]:
# TF-IDF vectors: vocabulary fitted on the train text, then applied to the test text.
originTrainFeatures,originTestFeatures,total_feature = sa.tfidfConvert(X_train,X_test)

----- Convert train data and test data to vector ----- 

----- Total # features: 24650 ----- 



In [144]:
# Baseline: ComplementNB on the non-resampled training vectors, 24000 best chi2 features.
sa.MNBC(originTrainFeatures,originTestFeatures,y_train,y_test,24000)

----- Select Best 24000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.46      0.79      0.58       114
           1       0.81      0.55      0.66       277
           2       0.86      0.97      0.91      2404
           3       0.88      0.88      0.88      2047
           4       0.72      0.87      0.79      1031
           5       0.87      0.93      0.90      5988
           6       0.91      0.58      0.71        36
           7       1.00      0.08      0.15        50
           8       0.79      0.43      0.56       573
           9       0.83      0.41      0.55       362
          10       0.76      0.34      0.47        38
          11       0.77      0.20      0.32       239
          12       0.88      0.96      0.92      4572
          13       0.80      0.62      0.70       129
          14       0.00      0.00      0.00        10
          15       0.80      0.87      0.83      1567
          16   

  'precision', 'predicted', average, warn_for)


0.8417713434106877

In [145]:
# Random oversampling of the training vectors only, then ComplementNB.
X_resampled, y_resampled = sa.ros(originTrainFeatures,y_train)
sa.MNBC(X_resampled,originTestFeatures,y_resampled,y_test,24000)

----- Select Best 24000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.20      0.92      0.33       114
           1       0.41      0.71      0.52       277
           2       0.93      0.95      0.94      2404
           3       0.92      0.73      0.81      2047
           4       0.83      0.69      0.76      1031
           5       0.96      0.64      0.77      5988
           6       0.24      0.75      0.36        36
           7       0.28      0.36      0.32        50
           8       0.56      0.62      0.59       573
           9       0.58      0.62      0.60       362
          10       0.11      0.84      0.19        38
          11       0.19      0.90      0.31       239
          12       0.96      0.71      0.82      4572
          13       0.42      0.90      0.57       129
          14       0.02      0.60      0.04        10
          15       0.89      0.73      0.80      1567
          16   

0.7179050457738982

In [146]:
# SMOTE (sampling_strategy='minority' inside sa.SMOTE) on the training vectors, then ComplementNB.
X_resampled, y_resampled = sa.SMOTE(originTrainFeatures,y_train)
sa.MNBC(X_resampled,originTestFeatures,y_resampled,y_test,24000)

----- Select Best 24000 features ----- 

----- CNBC fitting -----
              precision    recall  f1-score   support

           0       0.46      0.77      0.58       114
           1       0.81      0.44      0.57       277
           2       0.87      0.97      0.92      2404
           3       0.89      0.85      0.87      2047
           4       0.72      0.86      0.78      1031
           5       0.84      0.94      0.89      5988
           6       0.95      0.58      0.72        36
           7       1.00      0.08      0.15        50
           8       0.80      0.42      0.55       573
           9       0.86      0.38      0.52       362
          10       0.69      0.24      0.35        38
          11       0.83      0.16      0.27       239
          12       0.89      0.91      0.90      4572
          13       0.85      0.61      0.71       129
          14       0.01      0.50      0.02        10
          15       0.80      0.87      0.83      1567
          16   

  'precision', 'predicted', average, warn_for)


0.8232488822652757

In [147]:
# Baseline: class-weighted LinearSVC on the non-resampled training vectors, 24000 best chi2 features.
sa.SVC(originTrainFeatures,originTestFeatures,y_train,y_test,24000)

----- Select Best 24000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       114
           1       0.71      0.76      0.73       277
           2       0.97      0.98      0.97      2404
           3       0.90      0.95      0.92      2047
           4       0.86      0.89      0.88      1031
           5       0.96      0.91      0.93      5988
           6       0.80      0.89      0.84        36
           7       0.34      0.42      0.38        50
           8       0.71      0.74      0.72       573
           9       0.68      0.73      0.71       362
          10       0.79      0.71      0.75        38
          11       0.62      0.77      0.69       239
          12       0.96      0.95      0.95      4572
          13       0.82      0.88      0.85       129
          14       0.67      0.20      0.31        10
          15       0.90      0.87      0.89      1567
          16    



0.8930806898020013

In [148]:
# Random oversampling of the training vectors, then LinearSVC.
X_resampled, y_resampled = sa.ros(originTrainFeatures,y_train)
sa.SVC(X_resampled,originTestFeatures,y_resampled,y_test,24000)

----- Select Best 24000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       114
           1       0.65      0.75      0.69       277
           2       0.98      0.97      0.97      2404
           3       0.92      0.87      0.90      2047
           4       0.84      0.88      0.86      1031
           5       0.95      0.89      0.92      5988
           6       0.84      0.86      0.85        36
           7       0.13      0.58      0.21        50
           8       0.67      0.73      0.70       573
           9       0.65      0.70      0.68       362
          10       0.50      0.66      0.57        38
          11       0.51      0.83      0.63       239
          12       0.95      0.93      0.94      4572
          13       0.80      0.86      0.83       129
          14       0.50      0.30      0.37        10
          15       0.87      0.86      0.87      1567
          16    

0.870300191611667

In [149]:
# SMOTE (sampling_strategy='minority' inside sa.SMOTE) on the training vectors, then LinearSVC.
X_resampled, y_resampled = sa.SMOTE(originTrainFeatures,y_train)
sa.SVC(X_resampled,originTestFeatures,y_resampled,y_test,24000)

----- Select Best 24000 features ----- 

----- SVC fitting -----
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       114
           1       0.71      0.75      0.73       277
           2       0.97      0.97      0.97      2404
           3       0.90      0.95      0.92      2047
           4       0.86      0.89      0.88      1031
           5       0.96      0.91      0.93      5988
           6       0.80      0.89      0.84        36
           7       0.36      0.42      0.39        50
           8       0.71      0.73      0.72       573
           9       0.68      0.73      0.71       362
          10       0.79      0.71      0.75        38
          11       0.63      0.76      0.69       239
          12       0.96      0.95      0.95      4572
          13       0.82      0.87      0.85       129
          14       0.67      0.40      0.50        10
          15       0.90      0.88      0.89      1567
          16    



0.8936342346178412