In [26]:
import pandas as pd 
import numpy as np
import os 
import json
from math import ceil, floor

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.preprocessing import LabelEncoder,scale
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.utils.multiclass import unique_labels

from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 11})

import re
import nltk
from nltk.corpus import stopwords


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input, Dropout,BatchNormalization, GlobalMaxPooling1D,MaxPooling1D,LSTM
from keras.metrics import categorical_accuracy
from keras.callbacks import  EarlyStopping


In [27]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;-]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
Number_RE = re.compile('[*^0-9]')
Bad_underline = re.compile('[*_*]')
RemoveTag = re.compile('&lt;|br&gt;|b&gt;|ul&gt;|li&gt;')

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = RemoveTag.sub('',text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = Number_RE.sub(' ', text) # replace Number symbols by space in text
    text = Bad_underline.sub(' ', text) # replace Underline symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
RemoveLastSpace = re.compile(' $')

def clean_text_category(text):
    text = RemoveLastSpace.sub('',text)
    return text
    

In [28]:
def classNumberThreshold(arr):
    dropCategory = []

    for key,value in arr.items():
        if(value<=30):
            dropCategory.append(key)
    return dropCategory

In [29]:
df = pd.read_csv('../../example/amazon_co-ecommerce_sample.csv')

In [30]:
df = df[pd.notnull(df['amazon_category_and_sub_category'])]
df = df[pd.notnull(df['description'])]
df = df[pd.notnull(df['uniq_id'])]
category = []
subcategory = []
sub2category = []
for ele in df['amazon_category_and_sub_category'].apply(lambda x: str(x).split('>')):
    category.append(ele[0])
    if(len(ele)>2):
        subcategory.append(ele[1])
        sub2category.append(ele[2])
    elif(len(ele)>1):
        subcategory.append(ele[1])
        sub2category.append(np.NaN)
    else:
        subcategory.append(np.NaN)
        sub2category.append(np.NaN)

data= {'uniq_id':df['uniq_id'], 'product_name':df['product_name'],'category_main':category,'category_sub1':subcategory,'description':df['description']}
df = pd.DataFrame(data)
df['description'] = df['description'].apply(clean_text)
df['product_name'] = df['product_name'].apply(clean_text)


df.category_main = df['category_main'].apply(clean_text_category)
df.category_sub1 = df['category_sub1'].apply(clean_text_category)

df = df[pd.notnull(df['category_main'])]
df = df[pd.notnull(df['category_sub1'])]


In [31]:
dropCategoryCode  = classNumberThreshold(df.category_main.value_counts())
dropSubCategoryCode  = classNumberThreshold(df.category_sub1.value_counts())

for i in dropCategoryCode:
    df = df[df.category_main!=i]
    
for i in dropSubCategoryCode:
    df = df[df.category_sub1!=i]



In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8189 entries, 0 to 9998
Data columns (total 5 columns):
uniq_id          8189 non-null object
product_name     8189 non-null object
category_main    8189 non-null object
category_sub1    8189 non-null object
description      8189 non-null object
dtypes: object(5)
memory usage: 383.9+ KB


In [33]:
embedding_dim = 300 # We use 100 dimensional glove vectors
glove_dir = '../../glove.6B' # This is the folder with the dataset
embeddings_index = {} # We create a dictionary of word -> embedding
with open(os.path.join(glove_dir, 'glove.6B.300d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0] # The first value is the word, the rest are the values of the embedding
        embedding = np.asarray(values[1:], dtype='float32') # Load embedding
        embeddings_index[word] = embedding # Add embedding to our embedding dictionary
    print('Found {:,} word vectors in GloVe.'.format(len(embeddings_index)))

Found 400,000 word vectors in GloVe.


In [34]:
le = LabelEncoder()
le.fit(df.category_main)
target = le.classes_
labels = le.transform(df.category_main)

le.fit(df.category_sub1.apply(str))
subtarget = le.classes_
sublabels = le.transform(df['category_sub1'])

In [35]:
X_train, X_test, y_train_info, y_test_info = train_test_split(df.product_name, pd.DataFrame({'index':df.index, 'label':labels}), 
                                                    test_size=0.1, random_state = 27)

In [36]:
y_train = y_train_info.label
y_test = y_test_info.label


In [37]:
tfidfconverter = TfidfVectorizer(min_df=5, max_df=0.7)
X = tfidfconverter.fit_transform(df.product_name)
vocab_size = len(tfidfconverter.get_feature_names())


In [38]:
def tokenizer_and_pad_sequence (Xtrain,Xtest,vocab_size,embeddings_index):
    tokenizer = Tokenizer(num_words=vocab_size) # Setup tokenizer
    tokenizer.fit_on_texts(Xtrain)
    
    sequences = tokenizer.texts_to_sequences(Xtrain)
    sequences_test = tokenizer.texts_to_sequences(Xtest)
    
    trainlengths = [len(ele) for ele in sequences]
    testlengths = [len(ele) for ele in sequences_test]
    max_length = min(max(trainlengths),max(testlengths))
    
    
    
    word_index = tokenizer.word_index
    embedding_dim = 300
    nb_words = min(vocab_size, len(word_index)) # How many words are there actually
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    # The vectors need to be in the same position as their index. 
    # Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on
    # Loop over all words in the word index
    for word, i in word_index.items():
        # If we are above the amount of words we want to use we do nothing
        if i >= vocab_size: 
            continue
        # Get the embedding vector for the word
        embedding_vector = embeddings_index.get(word)
        # If there is an embedding vector, put it in the embedding matrix
        if embedding_vector is not None: 
            embedding_matrix[i] = embedding_vector
    return sequences,sequences_test,max_length, embedding_matrix

def model_settings(length,vocabSize,embeddingMatrix,outputnum):
        embedding_dim = 300
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=length, weights = [embedding_matrix], 
                                trainable = False))
        model.add(Conv1D(200,3,padding='valid',activation='relu',strides=1))        
        # we use max pooling:
        model.add(GlobalMaxPooling1D())
        model.add(BatchNormalization())
        # We add a vanilla hidden layer:
        model.add(Dense(250))
        model.add(Dropout(0.2))
        model.add(Dense(outputnum, activation='softmax'))
        model.summary()
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
        return model
def model_settings2(length,vocabSize,embeddingMatrix,outputnum):
        embedding_dim = 300
        model = Sequential()
        model.add(Embedding(vocabSize, embedding_dim, input_length=length, weights = [embeddingMatrix], 
                                trainable = False))
        model.add(Dropout(0.25))
        model.add(Conv1D(128,5,padding='valid',activation='relu',strides=1))        
        # we use max pooling:
        model.add(MaxPooling1D(4))
        model.add(BatchNormalization())
        model.add(LSTM(70))
        model.add(Dropout(0.2))
        model.add(Dense(outputnum, activation='softmax'))
        model.summary()
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[categorical_accuracy])
        return model

In [51]:
categoricalLabel = to_categorical(y_train)
categoricalTestLabel = to_categorical(y_test)

EPOCHS = 10
BATCH_SIZE = 100

#set early stopping criteria
pat = 5 #this is the number of epochs with no improvment after which the training will stop
early_stopping = EarlyStopping(monitor='val_loss', patience=pat, verbose=1)


sequences,sequences_test,max_length, embedding_matrix =  tokenizer_and_pad_sequence(X_train,X_test, vocab_size, embeddings_index)

Scores = []
Loss = []
for ele in range(5,50,5):

    train = pad_sequences(sequences,maxlen= ele)
    test = pad_sequences(sequences_test,maxlen = ele)

    model = model_settings(ele, vocab_size, embedding_matrix, len(np.unique(y_train)))
    # model2 = model_settings2(20, vocab_size, embedding_matrix, len(np.unique(y_train)))


    model.fit(train, categoricalLabel, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[early_stopping],  
                       verbose=1, validation_split=0.1)
    # model2.fit(train, categoricalLabel, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[early_stopping],  
    #                    verbose=1, validation_split=0.1)

    loss, score = model.evaluate(test, categoricalTestLabel, batch_size=BATCH_SIZE)
    Scores.append(score)
    Loss.append(loss)
# loss2, score2 = model2.evaluate(test, categoricalTestLabel, batch_size=BATCH_SIZE)








_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 5, 300)            597600    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 3, 200)            180200    
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 200)               0         
_________________________________________________________________
batch_normalization_7 (Batch (None, 200)               800       
_________________________________________________________________
dense_12 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_8 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 14)                3514      
Total para

Train on 6633 samples, validate on 737 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 00006: early stopping
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 25, 300)           597600    
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 23, 200)           180200    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 200)               0         
_________________________________________________________________
batch_normalization_11 (Batc (None, 200)               800       
_________________________________________________________________
dense_20 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_12 (Dropout)         (None, 250)               0         
_

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 00007: early stopping
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 40, 300)           597600    
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 38, 200)           180200    
_________________________________________________________________
global_max_pooling1d_13 (Glo (None, 200)               0         
_________________________________________________________________
batch_normalization_14 (Batc (None, 200)               800       
_________________________________________________________________
dense_26 (Dense)             (None, 250)               50250     
_________________________________________________________________
dropout_15 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_27 (Dense

In [56]:
results = list(zip(Loss,Scores))

In [68]:
Result = ""

for ele in results:
    result = "Model 1 Loss: {0:.2f} Score: {1:.2f} ".format(ele[0],ele[1])
    print(result)

Model 1 Loss: 0.14 Score: 0.75 
Model 1 Loss: 0.11 Score: 0.80 
Model 1 Loss: 0.09 Score: 0.83 
Model 1 Loss: 0.10 Score: 0.80 
Model 1 Loss: 0.10 Score: 0.83 
Model 1 Loss: 0.10 Score: 0.83 
Model 1 Loss: 0.10 Score: 0.81 
Model 1 Loss: 0.10 Score: 0.83 
Model 1 Loss: 0.10 Score: 0.83 


In [71]:
tfidfconverter = TfidfVectorizer(min_df=5, max_df=0.7)
X = tfidfconverter.fit_transform(df.product_name)
featureNames = tfidfconverter.get_feature_names()

In [72]:
X_train, X_test, y_train_info, y_test_info = train_test_split(X, pd.DataFrame({'index':df.index, 'label':labels}), 
                                                    test_size=0.1, random_state = 27)

In [74]:
BestSize  = [ele for ele in range(50, len(featureNames), 100) ]
SVCModels = []
SVCScores = []
SelectModels  = []
for ele in range(50, len(featureNames), 100):
    print("--- Best "+ str(ele) + " features \n")
    selectBest = SelectKBest(chi2, k= ele)
    model = LinearSVC(random_state=42,class_weight="balanced")
    K_best_linearsvc = Pipeline([('SelectBest', selectBest), ('linearSVC', model)])
    K_best_linearsvc.fit(X_train,y_train)
    score = K_best_linearsvc.score(X_test,y_test)
    SVCModels.append(K_best_linearsvc)
    SVCScores.append(score)

--- Best 50 features 

--- Best 150 features 

--- Best 250 features 

--- Best 350 features 

--- Best 450 features 

--- Best 550 features 

--- Best 650 features 

--- Best 750 features 

--- Best 850 features 

--- Best 950 features 

--- Best 1050 features 

--- Best 1150 features 

--- Best 1250 features 

--- Best 1350 features 

--- Best 1450 features 

--- Best 1550 features 

--- Best 1650 features 

--- Best 1750 features 

--- Best 1850 features 

--- Best 1950 features 



In [79]:
for ele in SVCScores:
    Result = "Model 1 Score: {0:.2f} ".format(ele)
    print(Result)

Model 1 Score: 0.57 
Model 1 Score: 0.70 
Model 1 Score: 0.74 
Model 1 Score: 0.77 
Model 1 Score: 0.79 
Model 1 Score: 0.80 
Model 1 Score: 0.79 
Model 1 Score: 0.80 
Model 1 Score: 0.81 
Model 1 Score: 0.82 
Model 1 Score: 0.83 
Model 1 Score: 0.83 
Model 1 Score: 0.83 
Model 1 Score: 0.83 
Model 1 Score: 0.83 
Model 1 Score: 0.84 
Model 1 Score: 0.84 
Model 1 Score: 0.84 
Model 1 Score: 0.84 
Model 1 Score: 0.84 
