In [1]:
import os
from math import log, ceil
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest,chi2
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.naive_bayes import ComplementNB
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier,OutputCodeClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [2]:
def plotDifferentNumFeatureInfluence(numlists,NBCscores,OVRSVC, OVOSVC):
    fig, ax = plt.subplots()
    ax.plot(numlists, NBCscores,color='tab:blue',label='CNBC')
    ax.plot(numlists, OVRSVC,color='tab:orange',label='OvRSVC')
    ax.plot(numlists ,OVOSVC,color='tab:red',label='OVOSVC')
    ax.set(xlabel='Number of classes', ylabel='Accuracy')
    ax.legend()
    title= 'Accuracy compare with different number of classes in different approach'
    ax.set_title(title)
    ax.grid()
    fig.savefig("Classnum.png")

In [3]:
# Choose folder
folder = ['Amazon','FlipKart','Combine','Walmart']
class OpenData:
    def __init__(self,num):
        self.num = num
    def openFile(self):
        num = self.num
        trainData = pd.read_csv(folder[num]+'/X_train.csv')
        trainLabel = pd.read_csv(folder[num]+'/y_train.csv')
        testData = pd.read_csv(folder[num]+'/X_test.csv')
        testLabel = pd.read_csv(folder[num]+'/y_test.csv')
        if(num==0 or num==2):
            # For Description has nan row
            df = pd.concat([trainData,trainLabel], axis = 1)
            df = df.dropna(subset=['X_train'])
            trainData = pd.DataFrame({'X_train':df.X_train})
            trainLabel = pd.DataFrame({'category':df.category,'subcategory':df.subcategory})
            df = pd.concat([testData,testLabel], axis = 1)
            df = df.dropna(subset=['X_test'])
            testData = pd.DataFrame({'X_test':df.X_test})
            testLabel = pd.DataFrame({'category':df.category,'subcategory':df.subcategory})
        return trainData,trainLabel,testData,testLabel
            

In [4]:
od = OpenData((3))
trainData,trainLabel,testData,testLabel = od.openFile()

In [5]:
df_train = pd.DataFrame({'X_train':trainData['X_train'],'y_train':trainLabel['category']})
df_test = pd.DataFrame({'X_test':testData['X_test'],'y_test':testLabel['category']})

In [6]:
df_train.y_train.value_counts(),df_test.y_test.value_counts()

(5     24152
 12    18128
 2      9376
 3      8362
 15     6098
 4      4362
 25     4002
 22     3460
 19     2665
 8      2235
 9      1575
 21     1118
 1       992
 11      968
 23      853
 24      820
 16      764
 18      700
 13      539
 0       518
 26      461
 17      456
 31      277
 29      275
 10      173
 7       162
 6       156
 20      127
 30       58
 27       46
 14       30
 28       30
 Name: y_train, dtype: int64, 5     5988
 12    4572
 2     2404
 3     2047
 15    1567
 4     1031
 25     999
 22     814
 19     669
 8      573
 9      362
 21     282
 1      277
 11     239
 24     222
 23     207
 18     193
 16     190
 26     152
 13     129
 17     123
 0      114
 29      70
 31      69
 7       50
 10      38
 6       36
 20      30
 27      13
 30      11
 14      10
 28       4
 Name: y_test, dtype: int64)

In [7]:
# Origin 
OuputNum = len(df_train.y_train.unique())
print("----- Origin Class Num " + str(OuputNum) + '\n')
df_train_nd = df_train[(df_train.y_train!=5) & (df_train.y_train!=28)& (df_train.y_train!=12) & (df_train.y_train!=14)]
df_test_nd = df_test[(df_test.y_test!=5) & (df_test.y_test!=28) & (df_test.y_test!=12) & (df_test.y_test!=14)]
OuputNumnd = len(df_train_nd.y_train.unique())
print("----- Second Class Num " + str(OuputNumnd) + '\n')
df_train_rd = df_train_nd[(df_train_nd.y_train!=30) & (df_train_nd.y_train!=27)& (df_train_nd.y_train!=2) & (df_train_nd.y_train!=3)]
df_test_rd = df_test_nd[(df_test_nd.y_test!=30) & (df_test_nd.y_test!=27) & (df_test_nd.y_test!=2) & (df_test_nd.y_test!=3)]
OuputNumrd = len(df_train_rd.y_train.unique())
print("----- Third Class Num " + str(OuputNumrd) + '\n')
df_train_or = df_train_rd[(df_train_rd.y_train!=20) & (df_train_rd.y_train!=6)& (df_train_rd.y_train!=15) & (df_train_rd.y_train!=4)]
df_test_or = df_test_rd[(df_test_rd.y_test!=20) & (df_test_rd.y_test!=6) & (df_test_rd.y_test!=15) & (df_test_rd.y_test!=4)]
OuputNumor = len(df_train_or.y_train.unique())
print("----- Fourth Class Num " + str(OuputNumor) + '\n')
df_train_fif = df_train_or[(df_train_or.y_train!=10) & (df_train_or.y_train!=7)& (df_train_or.y_train!=25) & (df_train_or.y_train!=22)]
df_test_fif = df_test_or[(df_test_or.y_test!=10) & (df_test_or.y_test!=7) & (df_test_or.y_test!=25) & (df_test_or.y_test!=22)]
OuputNumfif = len(df_train_fif.y_train.unique())
print("----- Fifth Class Num " + str(OuputNumfif) + '\n')
df_train_six = df_train_fif[(df_train_fif.y_train!=31) & (df_train_fif.y_train!=29)& (df_train_fif.y_train!=19) & (df_train_fif.y_train!=8)]
df_test_six = df_test_fif[(df_test_fif.y_test!=31) & (df_test_fif.y_test!=29) & (df_test_fif.y_test!=19) & (df_test_fif.y_test!=8)]
OuputNumsix = len(df_train_six.y_train.unique())
print("----- Sixth Class Num " + str(OuputNumsix) + '\n')
dfTrain = [df_train,df_train_nd,df_train_rd,df_train_or,df_train_fif,df_train_six]
dfTest = [df_test,df_test_nd,df_test_rd,df_test_or,df_test_fif,df_test_six]
OuputClassNum = [OuputNum,OuputNumnd,OuputNumrd,OuputNumor,OuputNumfif,OuputNumsix]



----- Origin Class Num 32

----- Second Class Num 28

----- Third Class Num 24

----- Fourth Class Num 20

----- Fifth Class Num 16

----- Sixth Class Num 12



In [8]:
class MachineLearningModel:
    def __init__(self, Train,Test):
        tfidfconverter = TfidfVectorizer(min_df=5, max_df=0.7)  
        tfidfconverter.fit(Train['X_train'])
        XTrain = tfidfconverter.transform(Train['X_train'])
        XTest = tfidfconverter.transform(Test['X_test'])
        print(XTrain.shape)
        print(XTest.shape)
        num = 8000
        selectBest = SelectKBest(chi2, k=num).fit(XTrain, Train['y_train'])
        select_feature = selectBest.transform(XTrain)
        test_features = selectBest.transform(XTest)
        self.X_train = select_feature
        self.X_test = test_features
        self.y_train = Train['y_train']
        self.y_test = Test['y_test']
        
    def CNBC(self):
        X_train = self.X_train
        y_train = self.y_train
        X_test = self.X_test
        y_test= self.y_test
        self.target = np.sort(y_train.unique())
        CNBC = ComplementNB()
        CNBC.fit(X_train,y_train)
        %time
        score = CNBC.score(X_test,y_test)
        return score  
    def OvRSVM(self):
        X_train = self.X_train
        y_train = self.y_train
        X_test = self.X_test
        y_test= self.y_test
        self.target = np.sort(y_train.unique())
        linearSVM = LinearSVC(random_state=42,tol=1e-5,class_weight="balanced")
        linearSVM.fit(X_train,y_train)
        %time
        score = linearSVM.score(X_test,y_test)
        return score
    def OvOSVM(self):
        X_train = self.X_train
        y_train = self.y_train
        X_test = self.X_test
        y_test= self.y_test
        self.target = np.sort(y_train.unique())
        OvOSVC = SVC(gamma='scale', decision_function_shape='ovo')
        OvOSVC.fit(X_train,y_train)
        %time
        score = OvOSVC.score(X_test,y_test)
        return score    

In [None]:
NBCscores = []
for ele in zip(dfTrain,dfTest):
    model = MachineLearningModel(ele[0],ele[1])
    score = model.CNBC()
    NBCscores.append(score)

(93938, 24650)
(23485, 24650)
CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.87 µs
(51598, 18897)
(12911, 18897)
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs
(33756, 15646)
(8436, 15646)
CPU times: user 58 µs, sys: 8 µs, total: 66 µs
Wall time: 12.9 µs
(23013, 12335)
(5772, 12335)
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.63 µs
(15216, 10446)
(3871, 10446)
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs
(9764, 8379)
(2490, 8379)
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [None]:
OvRSVCscores = []
for ele in zip(dfTrain,dfTest):
    model = MachineLearningModel(ele[0],ele[1])
    score = model.OvRSVM()
    OvRSVCscores.append(score)

(93938, 24650)
(23485, 24650)




CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs
(51598, 18897)
(12911, 18897)




CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
(33756, 15646)
(8436, 15646)
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
(23013, 12335)
(5772, 12335)
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
(15216, 10446)
(3871, 10446)
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs
(9764, 8379)
(2490, 8379)
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


In [None]:
OVOSVCscores = []
for ele in zip(dfTrain,dfTest):
    model = MachineLearningModel(ele[0],ele[1])
    score = model.OvOSVM()
    OVOSVCscores.append(score)

(93938, 24650)
(23485, 24650)


In [None]:
plotDifferentNumFeatureInfluence(OuputClassNum,NBCscores,OvRSVCscores, OVOSVCscores)