In [1]:
import os
from sklearn.feature_selection import SelectKBest,chi2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input,Dropout,GlobalMaxPooling1D,MaxPooling1D
from keras.metrics import categorical_accuracy

Using TensorFlow backend.


In [2]:
# Choose folder
folder = ['Amazon','FlipKart','Combine','Walmart']
class OpenData:
    def __init__(self,num):
        self.num = num
    def openFile(self):
        num = self.num
        trainData = pd.read_csv(folder[num]+'/X_train.csv')
        trainLabel = pd.read_csv(folder[num]+'/y_train.csv')
        testData = pd.read_csv(folder[num]+'/X_test.csv')
        testLabel = pd.read_csv(folder[num]+'/y_test.csv')
        if(num==0 or num==2):
            # For Description has nan row
            df = pd.concat([trainData,trainLabel], axis = 1)
            df = df.dropna(subset=['X_train'])
            trainData = pd.DataFrame({'X_train':df.X_train})
            trainLabel = pd.DataFrame({'category':df.category,'subcategory':df.subcategory})
            df = pd.concat([testData,testLabel], axis = 1)
            df = df.dropna(subset=['X_test'])
            testData = pd.DataFrame({'X_test':df.X_test})
            testLabel = pd.DataFrame({'category':df.category,'subcategory':df.subcategory})
        return trainData,trainLabel,testData,testLabel

In [3]:
class HierarchicalModel:
    def __init__(self,trainData, trainLabel ,testData, testLabel):
        # X features         
        self.X_train = trainData['X_train']
        self.X_test =  testData['X_test']
        # y labels 
        self.y_train = trainLabel['category']
        self.y_test = testLabel['category']
        self.ynd_train = trainLabel['subcategory']
        self.ynd_test = testLabel['subcategory']
        # y categorical labels
        self.labels = to_categorical(trainLabel['category'])
        self.sublabels = to_categorical(trainLabel['subcategory'])
        # targets 
        self.target = np.unique(trainLabel['category'])
        self.subtarget = np.unique(trainLabel['subcategory'])
        # Output numbers
        self.outputnum = len(np.unique(trainLabel['category']))
        self.suboutputnum = len(np.unique(trainLabel['subcategory']))
        # TrainList   
        train = pd.concat([trainData,trainLabel], axis = 1)

        self.trainList = train.groupby('category')
    def wordToSequence(self):
        parenttoChildFeature = {}
        parenttoChildSubcategory = {}
        uniqueCategory = self.target
        trainList = self.trainList
        Xtrain = self.X_train
        Xtest = self.X_test
        tfidfconverter = TfidfVectorizer(min_df=5, max_df=0.7)
        X = tfidfconverter.fit_transform(Xtrain)
        vocab_size = len(tfidfconverter.get_feature_names())
        print("----- Vocabulary size : "+ str(vocab_size)  +" ----- \n")
        tokenizer = Tokenizer(num_words=vocab_size) 
        tokenizer.fit_on_texts(Xtrain)
        print("----- Convert train and test data to sequences ----- \n")
        sequences = tokenizer.texts_to_sequences(Xtrain)
        sequences_test = tokenizer.texts_to_sequences(Xtest)
        
        trainlengths = [len(ele) for ele in sequences]
        testlengths = [len(ele) for ele in sequences_test]
        max_length = max(max(trainlengths),max(testlengths))
        
        word_index = tokenizer.word_index
        print("----- Total unique words : %d -----\n",len(word_index))
        print("----- Convert train data to vector in second level ----- \n")
        for ele in uniqueCategory:
            subcategoryData = trainList.get_group(ele)
            X_sub = tokenizer.texts_to_sequences(subcategoryData['X_train'])
            parenttoChildFeature[ele] = X_sub
            parenttoChildSubcategory[ele] = subcategoryData['subcategory']
        self.wordIndex = word_index
        self.vocabSize = vocab_size
        self.max_length = max_length
        return parenttoChildFeature,parenttoChildSubcategory, sequences, sequences_test
    
    def openGloveEmbeddingMatrix(self,dim):
        embedding_dim = dim 
        self.embeddingDim = embedding_dim
        print("---- Use "+ str(dim) +" dimension word vector ---- \n")

        glove_dir = '../glove.6B' # This is the folder with the dataset
        embeddings_index = {} # We create a dictionary of word -> embedding
        with open(os.path.join(glove_dir, 'glove.6B.'+str(dim)+'d.txt')) as f:
            for line in f:
                values = line.split()
                word = values[0] # The first value is the word, the rest are the values of the embedding
                embedding = np.asarray(values[1:], dtype='float32') # Load embedding
                embeddings_index[word] = embedding # Add embedding to our embedding dictionary
        print('Found {:,} word vectors in GloVe.'.format(len(embeddings_index)))
        return embeddings_index
    
    def creatEmeddingMatrix(self,embeddings_index):
        word_index = self.wordIndex
        vocab_size = self.vocabSize
        embedding_dim = self.embeddingDim
        nb_words = min(vocab_size, len(word_index)) # How many words are there actually
        embedding_matrix = np.zeros((nb_words, embedding_dim))
        # The vectors need to be in the same position as their index. 
        # Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on
        # Loop over all words in the word index
        for word, i in word_index.items():
            # If we are above the amount of words we want to use we do nothing
            if i >= vocab_size: 
                continue
            # Get the embedding vector for the word
            embedding_vector = embeddings_index.get(word)
            # If there is an embedding vector, put it in the embedding matrix
            if embedding_vector is not None: 
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
    def selectBestfeatureViaChi2(self,Xtrain, Ytrain, Xtest, num):
        selectBest = SelectKBest(chi2, k=num).fit(Xtrain, Ytrain)
        Xtrainbest = selectBest.transform(Xtrain)
        Xtestbest = selectBest.transform(Xtest)
        return Xtrainbest,Xtestbest
    def model_settings(self,length,embeddingMatrix,outputnum):
        vocab_size = self.vocabSize
        embedding_dim = self.embeddingDim
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=length, weights = [embedding_matrix], 
                                trainable = False))
        model.add(Conv1D(200,3,padding='valid',activation='relu',strides=1))        
        # we use max pooling:
        model.add(GlobalMaxPooling1D())
        # We add a vanilla hidden layer:
        model.add(Dense(250))
        model.add(Dropout(0.2))
        model.add(Dense(outputnum, activation='softmax'))
        model.summary()
        return model
    
    def subclassifiers(self,feature, subcategory, embeddingMatrix, num):
        classifiers = {}
        max_length= self.max_length
        for key, values in feature.items():
            numofUniqueSubcategory = len(np.unique(subcategory[key]))
            if (numofUniqueSubcategory>1):
                Xtrain = pad_sequences(values,maxlen= num)
                le = LabelEncoder()
                le.fit(subcategory[key])
                target = le.classes_
                labels = le.transform(subcategory[key])
                batch_size = 100
                epochs = 10
                model = self.model_settings(num,embeddingMatrix,numofUniqueSubcategory)
                model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
                history = model.fit(Xtrain,to_categorical(labels), batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.2)
                classifiers[key] = [model,target]
            else:
                classifiers[key] = np.unique(subcategory[key])[0]
        return classifiers
    def firstLevelModelTraining(self,sequences,sequences_test,embeddingMatrix,num):
        outputnum = self.outputnum
        trainlengths = [len(ele) for ele in sequences]
        testlengths = [len(ele) for ele in sequences_test]
        max_length = max(max(trainlengths),max(testlengths))
        train = pad_sequences(sequences,maxlen= max_length)
        test = pad_sequences(sequences_test,maxlen = max_length)
        y_train = self.labels
        y_test = self.y_test
        batch_size = 100
        epochs = 10
        scores = []
        X, XTest = self.selectBestfeatureViaChi2(train,y_train,test,num)
        model = self.model_settings(num,embeddingMatrix,outputnum)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
        history = model.fit(X, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.2)
        %time
        y_pred = model.predict(XTest)
        predict = []
        for ele in y_pred:
            predict.append(np.argmax(ele))
        return predict

    def PredictSecondLevel(self,classifiers,Ypred ,Xtest,num):
        predict = []
        index = 0            
        Ytest = self.ynd_test
        test = pad_sequences(Xtest,maxlen= num)
        %time
        for ele in Ypred:
            classifier =  classifiers[ele]
            if(type(classifier)!=np.int64):
                model =  classifier[0]
                target = classifier[1]
                y_pred = model.predict(test[index:index+1])
                predict.append(target[np.argmax(y_pred[0])])
            else:
                y_pred = classifier
                predict.append(y_pred)
            index = index + 1
        print(classification_report(Ytest, predict,labels=np.unique(Ytest)))
        return predict
    def FlatApproach(self, sequences,sequences_test,embeddingMatrix,num):
        outputnum = self.suboutputnum
        trainlengths = [len(ele) for ele in sequences]
        testlengths = [len(ele) for ele in sequences_test]
        max_length = max(max(trainlengths),max(testlengths))
        train = pad_sequences(sequences,maxlen= max_length)
        test = pad_sequences(sequences_test,maxlen = max_length)
        y_train = self.sublabels
        y_test = self.ynd_test
        batch_size = 100
        epochs = 10
        scores = []
        X, XTest = self.selectBestfeatureViaChi2(train,y_train,test,num)
        model = self.model_settings(num,embeddingMatrix,outputnum)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
        history = model.fit(X, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.2)
        %time
        y_pred = model.predict(XTest)
        predict = []
        for ele in y_pred:
            predict.append(np.argmax(ele))
        print(classification_report(y_test, predict,labels=np.unique(y_test)))

        return predict

In [4]:
openData = OpenData(0)
trainData,trainLabel,testData,testLabel = openData.openFile()

In [5]:
HCM = HierarchicalModel(trainData,trainLabel,testData,testLabel)


In [6]:
parenttoChildFeature,parenttoChildSubcategory, sequences, sequences_test = HCM.wordToSequence()

----- Vocabulary size : 5927 ----- 

----- Convert train and test data to sequences ----- 

----- Total unique words : %d -----
 24058
----- Convert train data to vector in second level ----- 



In [7]:
embeddings_index = HCM.openGloveEmbeddingMatrix(300)

---- Use 300 dimension word vector ---- 

Found 400,000 word vectors in GloVe.


In [8]:
embedding_matrix = HCM.creatEmeddingMatrix(embeddings_index)

In [9]:
classifiers= HCM.subclassifiers(parenttoChildFeature,parenttoChildSubcategory,embedding_matrix,650)

W0727 13:23:17.263332 139870887159616 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0727 13:23:17.286681 139870887159616 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0727 13:23:17.290698 139870887159616 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0727 13:23:17.305935 139870887159616 deprecation_wrapper.py:119] From /home/justin/classification/app/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Pl

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 650, 300)          1778100   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 648, 200)          180200    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               50250     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 1506      
Total params: 2,010,056
Trainable params: 231,956
Non-trainable params: 1,778,100
____________________________________________________________

Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 650, 300)          1778100   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 648, 200)          180200    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 200)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               50250     
_________________________________________________________________
dropout_4 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 502       
Total params: 2,009,052
Trainable params: 230,952
Non-trainable params: 1,778,100
_____________________________________

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 650, 300)          1778100   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 648, 200)          180200    
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 200)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_7 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 6)                 1506      
Total params: 2,010,056
Trainable params: 231,956
Non-trainable params: 1,778,100
_______________

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 650, 300)          1778100   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 648, 200)          180200    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 200)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_10 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 2)                 502       
Total params: 2,009,052
Trainable params: 230,952
Non-trainable params: 1,7

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
y_pred = HCM.firstLevelModelTraining(sequences,sequences_test,embedding_matrix,650)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 650, 300)          1778100   
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 648, 200)          180200    
_________________________________________________________________
global_max_pooling1d_13 (Glo (None, 200)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_13 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 14)                3514      
Total params: 2,012,064
Trainable params: 233,964
Non-trainable params: 1,778,100
____________________________________________________________

In [11]:
y_nd = HCM.PredictSecondLevel(classifiers,y_pred ,sequences_test,650)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 8.82 µs
              precision    recall  f1-score   support

           0       0.83      0.71      0.77         7
           1       0.47      0.22      0.30        37
           2       0.63      0.79      0.70        72
           3       1.00      0.70      0.82        10
           4       0.75      0.85      0.80        61
           5       0.76      0.67      0.71        39
           6       0.71      0.77      0.74        13
           7       0.32      0.57      0.41        35
           8       1.00      0.95      0.97        19
           9       0.50      0.43      0.46         7
          10       0.94      0.94      0.94        78
          11       0.47      0.23      0.31        30
          12       0.00      0.00      0.00         6
          13       0.50      0.69      0.58        13
          14       0.60      0.39      0.48        38
          15       0.33      0.17      0.22         6
          16 

  'precision', 'predicted', average, warn_for)


In [12]:
y_flat_nd = HCM.FlatApproach(sequences,sequences_test,embedding_matrix,650)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 650, 300)          1778100   
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 648, 200)          180200    
_________________________________________________________________
global_max_pooling1d_14 (Glo (None, 200)               0         
_________________________________________________________________
dense_27 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_14 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 48)                12048     
Total params: 2,020,598
Trainable params: 242,498
Non-trainable params: 1,778,100
____________________________________________________________

  'precision', 'predicted', average, warn_for)


In [13]:
openData = OpenData(1)
trainData,trainLabel,testData,testLabel = openData.openFile()

In [14]:
HCM = HierarchicalModel(trainData,trainLabel,testData,testLabel)


In [15]:
parenttoChildFeature,parenttoChildSubcategory, sequences, sequences_test = HCM.wordToSequence()

----- Vocabulary size : 4742 ----- 

----- Convert train and test data to sequences ----- 

----- Total unique words : %d -----
 16399
----- Convert train data to vector in second level ----- 



In [16]:
embeddings_index = HCM.openGloveEmbeddingMatrix(300)

---- Use 300 dimension word vector ---- 

Found 400,000 word vectors in GloVe.


In [17]:
embedding_matrix = HCM.creatEmeddingMatrix(embeddings_index)

In [18]:
classifiers= HCM.subclassifiers(parenttoChildFeature,parenttoChildSubcategory,embedding_matrix,500)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 500, 300)          1422600   
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 498, 200)          180200    
_________________________________________________________________
global_max_pooling1d_15 (Glo (None, 200)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_15 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 2)                 502       
Total params: 1,653,552
Trainable params: 230,952
Non-trainable params: 1,422,600
____________________________________________________________

Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 500, 300)          1422600   
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 498, 200)          180200    
_________________________________________________________________
global_max_pooling1d_18 (Glo (None, 200)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_18 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_36 (Dense)             (None, 3)                 753       
Total params: 1,653,803
Trainable params: 231,203
Non-trainable params: 1,422,600
_____________________________________

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 500, 300)          1422600   
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 498, 200)          180200    
_________________________________________________________________
global_max_pooling1d_21 (Glo (None, 200)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_21 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 4)                 1004      
Total params: 1,654,054
Trainable params: 231,454
Non-trainable params: 1,422,600
_______________

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 500, 300)          1422600   
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 498, 200)          180200    
_________________________________________________________________
global_max_pooling1d_24 (Glo (None, 200)               0         
_________________________________________________________________
dense_47 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_24 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_48 (Dense)             (None, 4)                 1004      
Total params: 1,654,054
Trainable params: 231,454
Non-trainable params: 1,4

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
y_pred = HCM.firstLevelModelTraining(sequences,sequences_test,embedding_matrix,500)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_27 (Embedding)     (None, 500, 300)          1422600   
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 498, 200)          180200    
_________________________________________________________________
global_max_pooling1d_27 (Glo (None, 200)               0         
_________________________________________________________________
dense_53 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_27 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_54 (Dense)             (None, 18)                4518      
Total params: 1,657,568
Trainable params: 234,968
Non-trainable params: 1,422,600
____________________________________________________________

In [20]:
y_nd = HCM.PredictSecondLevel(classifiers,y_pred ,sequences_test,500)

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 12.2 µs
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       0.97      0.99      0.98       152
           2       1.00      1.00      1.00        42
           3       1.00      1.00      1.00        35
           4       1.00      0.98      0.99       125
           5       1.00      1.00      1.00        14
           6       1.00      1.00      1.00        34
           7       1.00      1.00      1.00        12
           8       1.00      1.00      1.00        13
           9       0.89      0.73      0.80        11
          10       1.00      1.00      1.00        53
          11       0.00      0.00      0.00         5
          12       1.00      1.00      1.00        13
          13       1.00      1.00      1.00        29
          14       1.00      1.00      1.00        21
          15       1.00      1.00      1.00        15
          16 

  'precision', 'predicted', average, warn_for)


In [21]:
y_flat_nd = HCM.FlatApproach(sequences,sequences_test,embedding_matrix,500)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 500, 300)          1422600   
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 498, 200)          180200    
_________________________________________________________________
global_max_pooling1d_28 (Glo (None, 200)               0         
_________________________________________________________________
dense_55 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_28 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_56 (Dense)             (None, 41)                10291     
Total params: 1,663,341
Trainable params: 240,741
Non-trainable params: 1,422,600
____________________________________________________________

In [22]:
openData = OpenData(2)
trainData,trainLabel,testData,testLabel = openData.openFile()

In [23]:
HCM = HierarchicalModel(trainData,trainLabel,testData,testLabel)


In [24]:
parenttoChildFeature,parenttoChildSubcategory, sequences, sequences_test = HCM.wordToSequence()

----- Vocabulary size : 9848 ----- 

----- Convert train and test data to sequences ----- 

----- Total unique words : %d -----
 37793
----- Convert train data to vector in second level ----- 



In [25]:
embeddings_index = HCM.openGloveEmbeddingMatrix(300)

---- Use 300 dimension word vector ---- 

Found 400,000 word vectors in GloVe.


In [26]:
embedding_matrix = HCM.creatEmeddingMatrix(embeddings_index)

In [27]:
classifiers= HCM.subclassifiers(parenttoChildFeature,parenttoChildSubcategory,embedding_matrix,1100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, 1100, 300)         2954400   
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 1098, 200)         180200    
_________________________________________________________________
global_max_pooling1d_29 (Glo (None, 200)               0         
_________________________________________________________________
dense_57 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_29 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_58 (Dense)             (None, 2)                 502       
Total params: 3,185,352
Trainable params: 230,952
Non-trainable params: 2,954,400
____________________________________________________________

Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_32 (Embedding)     (None, 1100, 300)         2954400   
_________________________________________________________________
conv1d_32 (Conv1D)           (None, 1098, 200)         180200    
_________________________________________________________________
global_max_pooling1d_32 (Glo (None, 200)               0         
_________________________________________________________________
dense_63 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_32 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_64 (Dense)             (None, 5)                 1255      
Total params: 3,186,105
Trainable params: 231,705
Non-trainable params: 2,954,400
_____________________________________

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_35 (Embedding)     (None, 1100, 300)         2954400   
_________________________________________________________________
conv1d_35 (Conv1D)           (None, 1098, 200)         180200    
_________________________________________________________________
global_max_pooling1d_35 (Glo (None, 200)               0         
_________________________________________________________________
dense_69 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_35 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_70 (Dense)             (None, 7)                 1757      
Total params: 3,186,607
Trainable params: 232,207
Non-trainable params: 2,954,400
_______________

In [28]:
y_pred = HCM.firstLevelModelTraining(sequences,sequences_test,embedding_matrix,1100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_37 (Embedding)     (None, 1100, 300)         2954400   
_________________________________________________________________
conv1d_37 (Conv1D)           (None, 1098, 200)         180200    
_________________________________________________________________
global_max_pooling1d_37 (Glo (None, 200)               0         
_________________________________________________________________
dense_73 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_37 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_74 (Dense)             (None, 9)                 2259      
Total params: 3,187,109
Trainable params: 232,709
Non-trainable params: 2,954,400
____________________________________________________________

In [29]:
y_nd = HCM.PredictSecondLevel(classifiers,y_pred ,sequences_test,1100)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 10.3 µs
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       174
           1       1.00      0.53      0.69        17
           2       0.73      0.66      0.70        74
           3       1.00      0.94      0.97        18
           4       0.97      0.95      0.96        39
           5       0.67      0.40      0.50        10
           6       0.94      0.89      0.92        37
           7       1.00      0.38      0.55         8
           8       0.82      1.00      0.90        18
           9       0.97      1.00      0.99        77
          10       0.75      0.92      0.83        26
          11       0.50      0.60      0.55        15
          12       1.00      1.00      1.00         9
          13       0.73      0.73      0.73        11
          14       0.86      0.97      0.91        99
          15       0.84      0.99      0.91       229
          16 

In [30]:
y_flat_nd = HCM.FlatApproach(sequences,sequences_test,embedding_matrix,1100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_38 (Embedding)     (None, 1100, 300)         2954400   
_________________________________________________________________
conv1d_38 (Conv1D)           (None, 1098, 200)         180200    
_________________________________________________________________
global_max_pooling1d_38 (Glo (None, 200)               0         
_________________________________________________________________
dense_75 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_38 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_76 (Dense)             (None, 33)                8283      
Total params: 3,193,133
Trainable params: 238,733
Non-trainable params: 2,954,400
____________________________________________________________

In [31]:
openData = OpenData(3)
trainData,trainLabel,testData,testLabel = openData.openFile()

In [32]:
HCM = HierarchicalModel(trainData,trainLabel,testData,testLabel)


In [33]:
parenttoChildFeature,parenttoChildSubcategory, sequences, sequences_test = HCM.wordToSequence()

----- Vocabulary size : 24650 ----- 

----- Convert train and test data to sequences ----- 

----- Total unique words : %d -----
 107100
----- Convert train data to vector in second level ----- 



In [34]:
embeddings_index = HCM.openGloveEmbeddingMatrix(300)

---- Use 300 dimension word vector ---- 

Found 400,000 word vectors in GloVe.


In [35]:
embedding_matrix = HCM.creatEmeddingMatrix(embeddings_index)

In [36]:
classifiers= HCM.subclassifiers(parenttoChildFeature,parenttoChildSubcategory,embedding_matrix,350)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_39 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_39 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_39 (Glo (None, 200)               0         
_________________________________________________________________
dense_77 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_39 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_78 (Dense)             (None, 5)                 1255      
Total params: 7,626,705
Trainable params: 231,705
Non-trainable params: 7,395,000
____________________________________________________________

Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_42 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_42 (Glo (None, 200)               0         
_________________________________________________________________
dense_83 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_42 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_84 (Dense)             (None, 23)                5773      
Total params: 7,631,223
Trainable params: 236,223
Non-trainable params: 7,395,000
_____________________________________

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_45 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_45 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_45 (Glo (None, 200)               0         
_________________________________________________________________
dense_89 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_45 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_90 (Dense)             (None, 3)                 753       
Total params: 7,626,203
Trainable params: 231,203
Non-trainable params: 7,395,000
_______________

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_48 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_48 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_48 (Glo (None, 200)               0         
_________________________________________________________________
dense_95 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_48 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_96 (Dense)             (None, 12)                3012      
Total params: 7,628,462
Trainable params: 233,462
Non-trainable 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_51 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_51 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_51 (Glo (None, 200)               0         
_________________________________________________________________
dense_101 (Dense)            (None, 250)               50250     
_________________________________________________________________
dropout_51 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_102 (Dense)            (None, 13)                3263      
Total params: 7,628,713
Trainable params: 

Train on 4878 samples, validate on 1220 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_54 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_54 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_54 (Glo (None, 200)               0         
_________________________________________________________________
dense_107 (Dense)            (None, 250)               50250     
_________________________________________________________________
dropout_54 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_108 (Dense)            (None, 7)           

Train on 560 samples, validate on 140 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_57 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_57 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_57 (Glo (None, 200)               0         
_________________________________________________________________
dense_113 (Dense)            (None, 250)               50250     
_________________________________________________________________
dropout_57 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_114 (Dense)            (None, 13)            

Train on 894 samples, validate on 224 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_60 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_60 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_60 (Glo (None, 200)               0         
_________________________________________________________________
dense_119 (Dense)            (None, 250)               50250     
_________________________________________________________________
dropout_60 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_120 (Dense)            (None, 3)             

Train on 656 samples, validate on 164 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_63 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_63 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_63 (Glo (None, 200)               0         
_________________________________________________________________
dense_125 (Dense)            (None, 250)               50250     
_________________________________________________________________
dropout_63 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_126 (Dense)            (None, 10)            

Train on 221 samples, validate on 56 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
y_pred = HCM.firstLevelModelTraining(sequences,sequences_test,embedding_matrix,350)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_66 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_66 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_66 (Glo (None, 200)               0         
_________________________________________________________________
dense_131 (Dense)            (None, 250)               50250     
_________________________________________________________________
dropout_66 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_132 (Dense)            (None, 32)                8032      
Total params: 7,633,482
Trainable params: 238,482
Non-trainable params: 7,395,000
____________________________________________________________

In [38]:
y_nd = HCM.PredictSecondLevel(classifiers,y_pred ,sequences_test,350)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10 µs
              precision    recall  f1-score   support

           0       0.96      0.86      0.91       207
           1       0.79      0.50      0.61        30
           2       0.68      0.88      0.77       523
           3       0.83      0.56      0.67        18
           4       0.45      0.29      0.36        34
           5       0.00      0.00      0.00         9
           6       0.33      0.14      0.20        35
           7       0.50      0.17      0.25         6
           8       0.75      0.27      0.40        44
           9       0.71      0.27      0.39        44
          10       0.14      0.09      0.11        11
          11       0.38      0.21      0.27        14
          12       1.00      0.64      0.78        11
          13       1.00      0.33      0.50         6
          14       0.50      0.33      0.40        36
          15       0.96      0.66      0.78        38
          16       

  'precision', 'predicted', average, warn_for)


In [39]:
y_flat_nd = HCM.FlatApproach(sequences,sequences_test,embedding_matrix,350)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_67 (Embedding)     (None, 350, 300)          7395000   
_________________________________________________________________
conv1d_67 (Conv1D)           (None, 348, 200)          180200    
_________________________________________________________________
global_max_pooling1d_67 (Glo (None, 200)               0         
_________________________________________________________________
dense_133 (Dense)            (None, 250)               50250     
_________________________________________________________________
dropout_67 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_134 (Dense)            (None, 202)               50702     
Total params: 7,676,152
Trainable params: 281,152
Non-trainable params: 7,395,000
____________________________________________________________

  'precision', 'predicted', average, warn_for)
