## Importing most libraries used and initial data evaluation

In [2]:
import numpy as np
import pandas as pd
import nltk
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Load the product descriptions together with their three category levels
df = pd.read_csv('product-cat-dataset.csv')

# describe() lists count/unique/top/freq per column; a count lower than the others implies nulls
print(df.describe())

# Show the first rows to get a visual feel for what is in the data
df.head(20)

                                              Description   Level_1 Level_2  \
count                                               10637     10649   10649   
unique                                               9677        15      39   
top     glory gorg col fing complet outfit express moo...  B092BA29   2D5A3   
freq                                                   24       900     797   

       Level_3  
count    10649  
unique      43  
top       28A7  
freq       797  


Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,D06E
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,98CF
5,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
6,mit warm protect real stay dainty littl hand p...,09BF5150,C7E19,D06E
7,fal back cozy bas toughskin inf toddl girl mic...,2CEC27F1,ADAD6,98CF
8,ev smal lumberjack nee cozy look cool newborn ...,2CEC27F1,ADAD6,98CF
9,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,98CF


### Null values found and dropped

In [4]:
# Count missing descriptions (12 in this dataset)
df['Description'].isna().sum()

12

In [5]:
# Rows with nulls are dropped rather than imputed: free-text descriptions are hard to impute,
# and 12 rows out of ~10,000 will have little impact
df = df.dropna()

# Confirm that no nulls remain in any column
df.isna().sum()

Description    0
Level_1        0
Level_2        0
Level_3        0
dtype: int64

### Dropped categories with n<10

In [6]:
for level in ['Level_1', 'Level_2', 'Level_3']:
    level_counts = df[level].value_counts()               # number of rows per category code
    rare_codes = level_counts.index[level_counts < 10]    # codes that occur fewer than 10 times
    df = df.loc[~df[level].isin(rare_codes)]              # keep only rows whose code is not rare

# Renumber the rows 0..n-1; the previous row labels are kept in a new 'index' column
df = df.reset_index()
df #df is 10 rows shorter now

Unnamed: 0,index,Description,Level_1,Level_2,Level_3
0,0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,D06E
1,1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
2,2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,98CF
...,...,...,...,...,...
10622,10644,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,A0E2
10623,10645,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,A0E2
10624,10646,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
10625,10647,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,A0E2


## String processing function definition

Next step is to create a function that can apply some pre-processing to strings to clean and prep them for use in the following tf-idf step.

In [7]:
from nltk.stem import PorterStemmer
from nltk import ngrams
import string

def string_processor(text, n = 1):
    """Clean a string and return its stemmed n-grams.

    Steps: lower-case the text, delete every character in ``string.punctuation``,
    split on single spaces, Porter-stem each word, then join every run of ``n``
    consecutive stems with a space.

    Parameters
    ----------
    text : str
        Raw text to process.
    n : int, default 1
        Number of consecutive words per n-gram.

    Returns
    -------
    list of str
        One string per n-gram, in order of appearance.
    """
    # 1. lower-case and strip all punctuation in a single pass
    text = text.lower().translate(str.maketrans("", "", string.punctuation))

    # 2. stem every word; the stemmer is built once instead of once per word.
    #    Splitting on " " (not arbitrary whitespace) is deliberate: callers size n
    #    from len(text.split(" ")), so the token count must match that split.
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in text.split(" ")]

    # 3. n-gram tokenisation: each n-gram tuple is joined back into one string
    return [" ".join(gram) for gram in ngrams(stems, n)]

In [8]:
string_processor("Does or doesn't the processing function appear to work?", n = 3)
# Smoke test: with n = 3 the call should return lower-cased, punctuation-free, stemmed trigrams

['doe or doesnt',
 'or doesnt the',
 'doesnt the process',
 'the process function',
 'process function appear',
 'function appear to',
 'appear to work']

## Creating TF-IDF feature matrix to train and test the models

To train machine learning models that automatically classify the text descriptions, the descriptions must first be converted to a numerical format. 
This task specifies TF-IDF (term frequency × inverse document frequency) as that format: each description becomes a numerical vector, and these vectors are the features used to train the classification models.

First, the string processing and stemming are applied to each description to build a list of documents; this list is then passed to scikit-learn's `TfidfVectorizer`, which produces the feature matrix.

In [9]:
# Setting n to the number of space-separated words makes string_processor return the whole
# cleaned description as a single n-gram, i.e. one document per description for the tf-idf step
documents = [
    doc
    for description in df['Description']
    for doc in string_processor(description, n=len(description.split(" ")))
]

documents[:3]

['gerb cap help keep littl on head cov warm day long design coordin gerb layet item gre item gift giv',
 'newborn inf toddl boy hoodi jacket oshkosh b gosh versatil addit everyday ca wardrob mad comfi cotton blend zip front jacket includ fleec lin help keep warm weath day rib knit trim en snug fit',
 'tut ballet anym leap foxi fash ruffl tul toddl girl skirt tobi adorn satin bow pul waist cascad rippl puffi ruffl dant beat']

In [10]:
vec = TfidfVectorizer() 
# This is equivalent to CountVectorizer followed by TfidfTransformer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# NOTE(review): the vectorizer is fitted on every document here, before the train/test split
# made later in the notebook, so the vocabulary and IDF weights also reflect the test rows.
# Consider fitting on the training documents only and transforming the test documents.
fit = vec.fit_transform(documents)
# toarray() converts the sparse result into a dense array (one row per document) before
# wrapping it in a DataFrame
tfidf_matrix = pd.DataFrame(fit.toarray())
tfidf_matrix.head()

#Here is the feature matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16213,16214,16215,16216,16217,16218,16219,16220,16221,16222
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Multinomial Naive Bayes classification of each level

The classification method I chose was Multinomial Naive Bayes.  
It is a common method for text classification, and suitable for the purposes of this task.  
For information on this model see here: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

### Defining training and testing datasets

In [11]:
X = tfidf_matrix.copy() # the TF-IDF vectors above are the numerical model input (features)
y = df[['Level_1', 'Level_2', 'Level_3']] # targets: the three category levels of each description
# Split into 70% train / 30% test. random_state pins the shuffle so that a Restart & Run All
# reproduces the same split and therefore the same accuracy figures.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
# Give all four splits a fresh 0..n-1 index; later cells use these row labels to line up
# features, targets and predictions during model creation and usage
for split in (X_train, X_test, y_train, y_test):
    split.reset_index(drop=True, inplace=True)


### Training the 3 levels of models

In [13]:
# Train the level 1 model on the full training set and save it via pickle
mnb = MultinomialNB()
model1 = mnb.fit(X_train, y_train['Level_1'])

# Use a context manager so the file is flushed and closed as soon as the dump finishes.
# The model is reloaded from this file at prediction time.
with open('model1.pkl', 'wb') as model_file:
    pickle.dump(model1, model_file)

In [14]:
# The process is similar for the level 2 and level 3 models, so it is defined once as a function and applied to each level to avoid repetition

def create_models(lvl = 2):
    """Train and pickle one Level_<lvl> classifier per Level_<lvl-1> category.

    For every unique category in the parent level of the global ``y_train``, the
    matching rows of the global ``X_train`` / ``y_train`` are selected, a
    MultinomialNB model is fitted to predict ``Level_<lvl>`` on that subset, and
    the model is written to ``model<lvl>_<category>.pkl`` in the working directory.

    Parameters
    ----------
    lvl : int, default 2
        Level to predict; the parent level is ``lvl - 1``.

    Returns
    -------
    None
        The models are only written to disk.
    """
    level = 'Level_{}'.format(lvl)
    leveldwn = 'Level_{}'.format(lvl - 1)

    # for each unique value in the previous level...
    for cat in y_train[leveldwn].unique():

        # Positional row mask for this parent category. X_train and y_train come from the
        # same split, so their rows line up; masking avoids copying the full training
        # frames for every category.
        in_cat = (y_train[leveldwn] == cat).to_numpy()

        # A fresh estimator per category, so each saved model is an independent object
        # and earlier fitted models are not overwritten in memory.
        model = MultinomialNB().fit(X_train[in_cat], y_train.loc[in_cat, level])

        # pickle this model, named after the level and parent category; the context
        # manager closes the file handle after each dump
        filestr = "model{}_{}.pkl".format(lvl, cat)
        with open(filestr, 'wb') as model_file:
            pickle.dump(model, model_file)

# One pass per child level: level 2 models are keyed by Level_1 category, level 3 by Level_2
for lvl in (2, 3):
    create_models(lvl)

## Make predictions and create a dataset from them

In [15]:
# Reload the level 1 model from disk (this file was written earlier in this notebook;
# never unpickle files from an untrusted source)
with open('model1.pkl', 'rb') as mod:
    model1 = pickle.load(mod)

# Results table holding one prediction column per level, used later for accuracy assessment
results = pd.DataFrame(columns = ['Level_1_pred', 'Level_2_pred', 'Level_3_pred'])

# Level 1 is predicted for every test row; levels 2 and 3 are filled in afterwards
results['Level_1_pred'] = model1.predict(X_test)

results

Unnamed: 0,Level_1_pred,Level_2_pred,Level_3_pred
0,69286F45,,
1,09BF5150,,
2,2CEC27F1,,
3,D410C91A,,
4,B092BA29,,
...,...,...,...
3184,35E04739,,
3185,AAC8EE56,,
3186,90A8B052,,
3187,09BF5150,,


In [17]:
def get_results(lvl = 2):
    """Fill the ``Level_<lvl>_pred`` column of the global ``results`` table.

    For each category predicted at the parent level (``Level_<lvl-1>_pred``), the
    matching rows of the global ``X_test`` are scored with the model stored in
    ``model<lvl>_<category>.pkl`` and the predictions are written back to the same
    rows of ``results``.

    Parameters
    ----------
    lvl : int, default 2
        Level to predict; the parent level's predictions must already be in ``results``.

    Returns
    -------
    None
        ``results`` is modified in place.

    Raises
    ------
    FileNotFoundError
        If no pickled model exists for a predicted parent category.
    """
    level_pred = 'Level_{}_pred'.format(lvl)
    leveldwn_pred = 'Level_{}_pred'.format(lvl - 1)

    # Rows without a parent prediction (NaN) have no model to select, so they are skipped
    for result in results[leveldwn_pred].dropna().unique():

        # positional mask of the rows whose parent-level prediction is the current category
        mask = (results[leveldwn_pred] == result).to_numpy()

        # load the model trained for this level and parent category; these files are written
        # earlier in this notebook (never unpickle files from an untrusted source).
        # The with-block closes the file, so no explicit close() is needed.
        modstr = 'model{}_{}.pkl'.format(lvl, result)
        with open(modstr, 'rb') as mod2:
            model = pickle.load(mod2)

        # predict on the matching subset of the test data and write the predictions
        # back to the same rows (results and X_test share the same row order)
        results.loc[mask, level_pred] = model.predict(X_test[mask])

# Level 2 must be filled first: the level 3 pass selects its models from Level_2_pred
for lvl in (2, 3):
    get_results(lvl)

In [18]:
results

Unnamed: 0,Level_1_pred,Level_2_pred,Level_3_pred
0,69286F45,2D5A3,28A7
1,09BF5150,C7E19,D06E
2,2CEC27F1,BAE8A,2ABA
3,D410C91A,ACD06,33D1
4,B092BA29,5A8AB,AA6B
...,...,...,...
3184,35E04739,B2DB4,21DA
3185,AAC8EE56,9B69F,80C4
3186,90A8B052,C719A,A0E2
3187,09BF5150,C7E19,D06E


## Computing model accuracy per level

In [19]:
for level in ['Level_1', 'Level_2', 'Level_3']:
    # row-wise comparison of the true class with the predicted class for this level
    is_match = y_test[level] == results[str(level) + "_pred"]

    # share of correct predictions as a percentage, rounded to 2 decimal places
    accuracy = round((is_match.sum() / len(is_match)) * 100, 2)

    print("Accuracy for " + str(level) + " model = " + str(accuracy) + "%")

Accuracy for Level_1 model = 85.39%
Accuracy for Level_2 model = 71.9%
Accuracy for Level_3 model = 71.56%
