In [1]:
import os
from sklearn.feature_selection import SelectKBest,chi2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input,Dropout,GlobalMaxPooling1D,MaxPooling1D,LSTM
from keras.metrics import categorical_accuracy

Using TensorFlow backend.


In [2]:
# Choose folder
folder = ['Amazon','FlipKart','Combine','Walmart']
class OpenData:
    def __init__(self,num):
        self.num = num
    def openFile(self):
        num = self.num
        trainData = pd.read_csv(folder[num]+'/X_train.csv')
        trainLabel = pd.read_csv(folder[num]+'/y_train.csv')
        testData = pd.read_csv(folder[num]+'/X_test.csv')
        testLabel = pd.read_csv(folder[num]+'/y_test.csv')
        if(num==0 or num==2):
            # For Description has nan row
            df = pd.concat([trainData,trainLabel], axis = 1)
            df = df.dropna(subset=['X_train'])
            trainData = pd.DataFrame({'X_train':df.X_train})
            trainLabel = pd.DataFrame({'category':df.category,'subcategory':df.subcategory})
            df = pd.concat([testData,testLabel], axis = 1)
            df = df.dropna(subset=['X_test'])
            testData = pd.DataFrame({'X_test':df.X_test})
            testLabel = pd.DataFrame({'category':df.category,'subcategory':df.subcategory})
        return trainData,trainLabel,testData,testLabel

In [3]:
class DLFeatureSelect:
    def __init__(self,train,trainLabel, test, testLabel, name):
        self.title = name
        self.X_train = trainData['X_train']
        self.X_test =  testData['X_test']
        self.y_train = trainLabel['category']
        self.y_test = testLabel['category']
        self.ynd_train = trainLabel['subcategory']
        self.ynd_test = testLabel['subcategory']
        self.target = np.unique(trainLabel['category'])
        self.outputnum = len(np.unique(trainLabel['category']))
        self.labels = to_categorical(trainLabel['category'])
        self.subtarget = np.unique(trainLabel['subcategory'])
        self.suboutputnum = len(np.unique(trainLabel['subcategory']))
        self.sublabels = to_categorical(trainLabel['subcategory'])

    def wordtoSequence(self):
        tfidfconverter = TfidfVectorizer(min_df=5, max_df=0.7)
        tfidfconverter.fit_transform(self.X_train)
        vocab_size = len(tfidfconverter.get_feature_names())
        print("Total vocabulary size: " +str(vocab_size)+'\n')
        self.vocabSize = vocab_size
        # Tranform Text to sequences   
        print("---- Word to sequence ---- \n")
        tokenizer = Tokenizer(num_words=vocab_size) # Setup tokenizer
        tokenizer.fit_on_texts(self.X_train)
        sequences = tokenizer.texts_to_sequences(self.X_train)
        sequences_test = tokenizer.texts_to_sequences(self.X_test)
        word_index = tokenizer.word_index
        print("Total unique words : " +str(len(word_index))+'\n')
        self.wordIndex = word_index
        return sequences,sequences_test
    def openGloveEmbeddingMatrix(self,dim):
        embedding_dim = dim 
        self.embeddingDim = embedding_dim
        print("---- Use "+ str(dim) +" dimension word vector ---- \n")

        glove_dir = '../glove.6B' # This is the folder with the dataset
        embeddings_index = {} # We create a dictionary of word -> embedding
        with open(os.path.join(glove_dir, 'glove.6B.'+str(dim)+'d.txt')) as f:
            for line in f:
                values = line.split()
                word = values[0] # The first value is the word, the rest are the values of the embedding
                embedding = np.asarray(values[1:], dtype='float32') # Load embedding
                embeddings_index[word] = embedding # Add embedding to our embedding dictionary
        print('Found {:,} word vectors in GloVe.'.format(len(embeddings_index)))
        return embeddings_index
    def creatEmeddingMatrix(self,embeddings_index):
        word_index = self.wordIndex
        vocab_size = self.vocabSize
        embedding_dim = self.embeddingDim
        nb_words = min(vocab_size, len(word_index)) # How many words are there actually
        embedding_matrix = np.zeros((nb_words, embedding_dim))
        # The vectors need to be in the same position as their index. 
        # Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on
        # Loop over all words in the word index
        for word, i in word_index.items():
            # If we are above the amount of words we want to use we do nothing
            if i >= vocab_size: 
                continue
            # Get the embedding vector for the word
            embedding_vector = embeddings_index.get(word)
            # If there is an embedding vector, put it in the embedding matrix
            if embedding_vector is not None: 
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
    def selectBestFeature(self,X,y,X_test,num):
        select = SelectKBest(chi2, k=num)
        select.fit(X, y)
        X = select.transform(X)
        XTest=select.transform(X_test)
        return X, XTest
    def model_settings(self,length,embeddingMatrix,outputnum):
        vocab_size = self.vocabSize
        embedding_dim = self.embeddingDim
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=length, weights = [embedding_matrix], 
                                trainable = False))
        model.add(Conv1D(200,3,padding='valid',activation='relu',strides=1))        
        # we use max pooling:
        model.add(GlobalMaxPooling1D())
        # We add a vanilla hidden layer:
        model.add(Dense(250))
        model.add(Dropout(0.2))
        model.add(Dense(outputnum, activation='softmax'))
        model.summary()
        return model
    def model_settingsnd(self,length,embeddingMatrix,outputnum):
        vocab_size = self.vocabSize
        embedding_dim = self.embeddingDim
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=length, weights = [embedding_matrix], 
                                trainable = False))
        model.add(Conv1D(125,5,padding='valid',activation='relu',strides=1))        
        # we use max pooling:
        model.add(MaxPooling1D(3))
        model.add(Conv1D(125,5,padding='valid',activation='relu',strides=1))        
        # we use max pooling:
        model.add(MaxPooling1D(3))
        model.add(Conv1D(125,5,padding='valid',activation='relu',strides=1))        
        # we use max pooling:
        model.add(MaxPooling1D(3))
        model.add(Flatten())
        model.add(Dropout(0.2))
        model.add(Dense(outputnum, activation='softmax'))
        model.summary()
        return model
    def model_settingsrd(self,length,embeddingMatrix,outputnum):
        vocab_size = self.vocabSize
        embedding_dim = self.embeddingDim
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=length, weights = [embedding_matrix], 
                                trainable = False))
        model.add(Dropout(0.25))
        model.add(Conv1D(128,5,padding='valid',activation='relu',strides=1))        
        # we use max pooling:
        model.add(MaxPooling1D(4))
        model.add(LSTM(70))
#         model.add(Flatten())
        model.add(Dropout(0.2))
        model.add(Dense(outputnum, activation='softmax'))
        model.summary()
        return model
    def creatmodel(self,sequences,sequences_test,embeddingMatrix, num):
        outputnum = self.outputnum
        trainlengths = [len(ele) for ele in sequences]
        testlengths = [len(ele) for ele in sequences_test]
        max_length = max(max(trainlengths),max(testlengths))
        train = pad_sequences(sequences,maxlen= max_length)
        test = pad_sequences(sequences_test,maxlen = max_length)
        y_train = self.labels
        y_test = self.y_test
        batch_size = 100
        epochs = 10
        scores = []
        X, XTest = self.selectBestFeature(train,y_train,test,num)
        model = self.model_settings(num,embeddingMatrix,outputnum)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
        history = model.fit(X, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.2)
        loss, accuracy = model.evaluate(XTest, to_categorical(y_test))
        print("Accuracy: "+str(accuracy)+" Loss: "+str(loss)+"\n")
        return accuracy,loss
    
    
    def creatndmodel(self,sequences,sequences_test,embeddingMatrix,num):
        outputnum = self.outputnum
        trainlengths = [len(ele) for ele in sequences]
        testlengths = [len(ele) for ele in sequences_test]
        max_length = max(max(trainlengths),max(testlengths))
        train = pad_sequences(sequences,maxlen= max_length)
        test = pad_sequences(sequences_test,maxlen = max_length)
        y_train = self.labels
        y_test = self.y_test
        batch_size = 100
        epochs = 10
        scores = []
        X, XTest = self.selectBestFeature(train,y_train,test,num)
        model = self.model_settingsnd(num,embeddingMatrix,outputnum)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
        history = model.fit(X, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.2)
        loss, accuracy = model.evaluate(XTest, to_categorical(y_test))
        print("Accuracy: "+str(accuracy)+" Loss: "+str(loss)+"\n")
        return accuracy,loss
    def creatrdmodel(self,sequences,sequences_test,embeddingMatrix,num):
        outputnum = self.outputnum
        trainlengths = [len(ele) for ele in sequences]
        testlengths = [len(ele) for ele in sequences_test]
        max_length = max(max(trainlengths),max(testlengths))
        train = pad_sequences(sequences,maxlen= max_length)
        test = pad_sequences(sequences_test,maxlen = max_length)
        y_train = self.labels
        y_test = self.y_test
        batch_size = 100
        epochs = 10
        scores = []
        X, XTest = self.selectBestFeature(train,y_train,test,num)
        model = self.model_settingsrd(num,embeddingMatrix,outputnum)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
        history = model.fit(X, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.2)
        loss, accuracy = model.evaluate(XTest, to_categorical(y_test))
        print("Accuracy: "+str(accuracy)+" Loss: "+str(loss)+"\n")
        return accuracy,loss

In [4]:
openData = OpenData(0)
trainData,trainLabel,testData,testLabel = openData.openFile()


In [5]:
dlFS = DLFeatureSelect(trainData,trainLabel,testData,testLabel,folder[0])
sequences,sequences_test =dlFS.wordtoSequence()
embeddings_index = dlFS.openGloveEmbeddingMatrix(300)
embedding_matrix = dlFS.creatEmeddingMatrix(embeddings_index)

Total vocabulary size: 5927

---- Word to sequence ---- 

Total unique words : 24058

---- Use 300 dimension word vector ---- 

Found 400,000 word vectors in GloVe.


In [6]:
dlFS.creatmodel(sequences,sequences_test,embedding_matrix, 650)

W0726 13:00:01.013914 140022623635264 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0726 13:00:01.036250 140022623635264 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0726 13:00:01.039340 140022623635264 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0726 13:00:01.048329 140022623635264 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Pl

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 650, 300)          1778100   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 648, 200)          180200    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               50250     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 14)                3514      
Total params: 2,012,064
Trainable params: 233,964
Non-trainable params: 1,778,100
____________________________________________________________

(0.7587262706674831, 0.11300876692276748)

In [7]:
dlFS.creatndmodel(sequences,sequences_test,embedding_matrix, 650)

W0726 13:02:34.234203 140022623635264 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 650, 300)          1778100   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 646, 125)          187625    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 215, 125)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 211, 125)          78250     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 70, 125)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 66, 125)           78250     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 22, 125)           0         
__________

(0.6631965707287202, 0.16382945274439123)

In [8]:
dlFS.creatrdmodel(sequences,sequences_test,embedding_matrix, 650)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 650, 300)          1778100   
_________________________________________________________________
dropout_3 (Dropout)          (None, 650, 300)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 646, 128)          192128    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 161, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 70)                55720     
_________________________________________________________________
dropout_4 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 14)                994       
Total para

(0.6950398040416411, 0.11353956748488624)

In [9]:
openData = OpenData(1)
trainData,trainLabel,testData,testLabel = openData.openFile()

In [10]:
dlFS = DLFeatureSelect(trainData,trainLabel,testData,testLabel,folder[1])
sequences,sequences_test =dlFS.wordtoSequence()
embeddings_index = dlFS.openGloveEmbeddingMatrix(300)
embedding_matrix = dlFS.creatEmeddingMatrix(embeddings_index)


Total vocabulary size: 4742

---- Word to sequence ---- 

Total unique words : 16399

---- Use 300 dimension word vector ---- 

Found 400,000 word vectors in GloVe.


In [11]:
dlFS.creatmodel(sequences,sequences_test,embedding_matrix, 500)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 300)          1422600   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 498, 200)          180200    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               50250     
_________________________________________________________________
dropout_5 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 18)                4518      
Total params: 1,657,568
Trainable params: 234,968
Non-trainable params: 1,422,600
____________________________________________________________

(0.9893479664299548, 0.0055019580614669265)

In [12]:
dlFS.creatndmodel(sequences,sequences_test,embedding_matrix, 500)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 300)          1422600   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 496, 125)          187625    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 165, 125)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 161, 125)          78250     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 53, 125)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 49, 125)           78250     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 16, 125)           0         
__________

(0.9738540994189799, 0.01220400532760918)

In [14]:
dlFS.creatrdmodel(sequences,sequences_test,embedding_matrix, 500)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 300)          1422600   
_________________________________________________________________
dropout_7 (Dropout)          (None, 500, 300)          0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 496, 128)          192128    
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 124, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 70)                55720     
_________________________________________________________________
dropout_8 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 18)                1278      
Total para

(0.9838605551969012, 0.00636588723947077)

In [15]:
openData = OpenData(2)
trainData,trainLabel,testData,testLabel = openData.openFile()


In [16]:
dlFS = DLFeatureSelect(trainData,trainLabel,testData,testLabel,folder[2])
sequences,sequences_test =dlFS.wordtoSequence()
embeddings_index = dlFS.openGloveEmbeddingMatrix(300)
embedding_matrix = dlFS.creatEmeddingMatrix(embeddings_index)


Total vocabulary size: 9848

---- Word to sequence ---- 

Total unique words : 37793

---- Use 300 dimension word vector ---- 

Found 400,000 word vectors in GloVe.


In [17]:
dlFS.creatmodel(sequences,sequences_test,embedding_matrix,1100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 1100, 300)         2954400   
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 1098, 200)         180200    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 200)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 250)               50250     
_________________________________________________________________
dropout_9 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 9)                 2259      
Total params: 3,187,109
Trainable params: 232,709
Non-trainable params: 2,954,400
____________________________________________________________

(0.9735273492286115, 0.030606828285291193)

In [18]:
dlFS.creatndmodel(sequences,sequences_test,embedding_matrix,1100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 1100, 300)         2954400   
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 1096, 125)         187625    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 365, 125)          0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 361, 125)          78250     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 120, 125)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 116, 125)          78250     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 38, 125)           0         
__________

(0.760343618513324, 0.1273609535614496)

In [19]:
dlFS.creatrdmodel(sequences,sequences_test,embedding_matrix, 1100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1100, 300)         2954400   
_________________________________________________________________
dropout_11 (Dropout)         (None, 1100, 300)         0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 1096, 128)         192128    
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 274, 128)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 70)                55720     
_________________________________________________________________
dropout_12 (Dropout)         (None, 70)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 9)                 639       
Total para

(0.9773842917251052, 0.01965912789516843)

In [20]:
openData = OpenData(3)
trainData,trainLabel,testData,testLabel = openData.openFile()


In [21]:
dlFS = DLFeatureSelect(trainData,trainLabel,testData,testLabel,folder[3])
sequences,sequences_test =dlFS.wordtoSequence()
embeddings_index = dlFS.openGloveEmbeddingMatrix(300)
embedding_matrix = dlFS.creatEmeddingMatrix(embeddings_index)


Total vocabulary size: 24650

---- Word to sequence ---- 

Total unique words : 107100

---- Use 300 dimension word vector ---- 

Found 400,000 word vectors in GloVe.


In [22]:
dlFS.creatmodel(sequences,sequences_test,embedding_matrix,350)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 200)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_13 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 32)                8032      
Total params: 7,633,482
Trainable params: 238,482
Non-trainable params: 7,395,000
____________________________________________________________

(0.8581222056504993, 0.03641261019907957)

In [23]:
dlFS.creatndmodel(sequences,sequences_test,embedding_matrix,350)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 346, 125)          187625    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 115, 125)          0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 111, 125)          78250     
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 37, 125)           0         
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 33, 125)           78250     
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 11, 125)           0         
__________

(0.8364913775079779, 0.040383666406381837)

In [24]:
dlFS.creatrdmodel(sequences,sequences_test,embedding_matrix, 350)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
dropout_15 (Dropout)         (None, 350, 300)          0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 346, 128)          192128    
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 86, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 70)                55720     
_________________________________________________________________
dropout_16 (Dropout)         (None, 70)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 32)                2272      
Total para

(0.8436022993501562, 0.029303839993566966)