### Import Libraries

In [88]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance_matrix
from scipy.spatial.distance import pdist, squareform

import nltk
from nltk.stem import PorterStemmer

import re
import numpy as np
import pandas as pd
import pickle


### Load and explore the data (4 marks)

In [89]:
data = pd.read_csv('product-category-dataset-improved.csv')
df = pd.DataFrame(data)
df.describe()

# 15 level_1 classes # 36 level_2 classes #94 level_3 classes

Unnamed: 0,Description,Level_1,Level_2,Level_3
count,10627,10639,10639,10639
unique,9668,15,36,94
top,glory gorg col fing complet outfit express moo...,B092BA29,2D5A3,28A7
freq,24,900,797,332


In [90]:
df.head()

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918


### Deal with Missing Data (4 marks)

In [91]:
# Check if data has missing values in the Description column

missing_descriptions_indices = df[df['Description'].isnull()].index.tolist()
print('There are', len(missing_descriptions_indices), 'missing indices')
missing_descriptions_indices

There are 12 missing indices


[1063, 3434, 3458, 7754, 7788, 7796, 7808, 7859, 7936, 7962, 7988, 8004]

In [92]:
# Remove missing descriptions rows from dataframe
df = df[df['Description'].notna()]
df.shape
#shape is 10627 rows which is 12 less than original 10639 so know we have dropped the correct amount of rows from the dataframe

(10627, 4)

## Create subset of data to workwith as dataset too large

In [93]:
df = df.sample(n = 8500)
df.reset_index(inplace=True, drop=True)
# check shape to make sure correct transformation
print(df.shape)
df.head()

(8500, 4)


Unnamed: 0,Description,Level_1,Level_2,Level_3
0,comfort polo shirt nee disappear chil hit air ...,4513C920,F4055,5B02
1,item feat long sleev hood ful zip left intern ...,57164AC1,7B638,2C26
2,rich col shim effect dram ey catch look includ...,3E1E0D78,9D9EE,818C
3,5 oz 15ml nail pol,D410C91A,ACD06,33D1
4,st catherin bologn med gold fil st catherin bo...,96F95EEC,36080,C563


### Drop Classes where the number of instances is < 10 (4 marks)

In [94]:
# Apply to Level_1 
print(df.Level_1.value_counts())
print('Number of Unique Level 1 Categories: ', df.Level_1.nunique())
# No classes have less than 10 instances

35E04739    722
AAC8EE56    721
B092BA29    710
2CEC27F1    709
57164AC1    701
09BF5150    642
EFEF723B    640
69286F45    631
3E1E0D78    467
4C3D8686    463
96F95EEC    458
4513C920    452
014303D1    404
90A8B052    395
D410C91A    385
Name: Level_1, dtype: int64
Number of Unique Level 1 Categories:  15


There are 15 level 1 classes all of which have more than 10 instances. 
No classes will be dropped from level 1.

In [95]:
# Apply to Level_2

# create mask based on value counts
mask_2 = df.Level_2.value_counts()
# apply mask to dataset
df = df[df['Level_2'].isin(mask_2.index[mask_2>9])]
print('Number of Unique Level 2 Categories: ', df.Level_2.nunique())

#confirm no classes left have fewer than 10 instances
df.Level_2.value_counts()

Number of Unique Level 2 Categories:  36


2D5A3    631
ACD06    385
C719A    378
9D9EE    374
BAE8A    368
B2DB4    365
9B69F    364
5A8AB    360
C7E19    359
914A1    357
390F1    357
CB803    357
74974    356
94728    353
375FE    350
ADAD6    341
7B638    337
A04D3    315
F4055    292
7AED7    224
02FA0    215
77F62    180
36080    143
223B2    107
E6162     93
E69F5     88
5E038     85
31FED     72
D5531     68
262E7     57
F824F     57
915D4     33
6C6B1     26
AF6B9     25
08960     17
0864A     11
Name: Level_2, dtype: int64

All remaining level 2 classes have more than 10 instances.

In [96]:
# Apply to Level_3

#create mask 
mask_3 = df.Level_3.value_counts()
#apply mask
df = df[df['Level_3'].isin(mask_3.index[mask_3>9])]

print('Number of Unique Level 3 Categories: ', df.Level_3.nunique())

# check value counts all above 10 instances
df.Level_3.value_counts()

Number of Unique Level 3 Categories:  94


28A7    258
2CFE    185
BB6B    180
AA6B    175
5912    173
       ... 
74C9     21
98A8     19
1000     17
D55B     16
96B8     11
Name: Level_3, Length: 94, dtype: int64

All remaining level 3 classes have more than 10 instances.

### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [97]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import string
import re
snowball_stemmer = SnowballStemmer(language='english')
porter_stemmer = PorterStemmer()

nltk.download('punkt')


def scrub_words(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[_]','', text)
    return text

def process_text(text, n = 3):
    # 1. Convert text to lower case and remove all punctuation
    scrubbed_text =scrub_words(text)                
    
    # 2. Tokenize words
    token_words = word_tokenize(scrubbed_text)      

    #3. Apply stemming
    stem_words = [snowball_stemmer.stem(w) for w in token_words] 
    
    # 4. Apply Ngram Tokenisation
    n_grams = ngrams(stem_words, n)                 
    return [' '.join(grams) for grams in n_grams]

def process_text2(text):
    # 1. Convert text to lower case and remove all punctuation
    scrubbed_text =scrub_words(text)                
    
    # 2. Tokenize words
    token_words = word_tokenize(scrubbed_text)      

    #3. Apply stemming
    stem_words = [snowball_stemmer.stem(w) for w in token_words] 
    
    # 4. Apply Ngram Tokenisation
    return ' '.join(stem_words)

    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hugho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [98]:
# Here is an example function call
process_text("Here we're testing the process_text function, results are as follows:")
process_text2("Here we're testing the process_text function, results are as follows:")

'here were test the processtext function result are as follow'

## Now let's apply TF-IDF to extract features from plain text (10 marks)

### Here you apply the process_text function to the Description column of the data
### Then you pass the results to the bag of words tranformer
### See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [99]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from itertools import chain

## Use TFIDF Vectorizer which combines the process of using Count Vectorizer followed and TFIDF Transformer into one Process

In [100]:
#
vectorizer = TfidfVectorizer(ngram_range=(3,3), preprocessor=process_text2, max_features=15000)

X = vectorizer.fit_transform(df['Description'].values)

tfidf_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())


In [101]:
print(tfidf_df.shape)
tfidf_df.head()

(8500, 15000)


Unnamed: 0,00 ct tgw,000 first print,000 photo 10,01number pag 192bind,01number pag bind,02 05number pag,03 13number pag,03 15number pag,04 01number pag,04 27number pag,...,zip pocket two,zip sid pocket,zip stash pocket,zip two front,ziploc brand dispo,zircon ston ring,zon afric also,zon ant col,zon babi us,zon provid gre
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.136403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Check to see if they are the same

In [102]:
# tfidf_df_one = pd.DataFrame(first_descrip_vector.T.todense(), index=feature_names, columns=['tfidf'])
# # print(tfidf_df_one.sort_values(by=['tfidf'],ascending=False))

# tfidf_df2_one = pd.DataFrame(X2[0].T.todense(), index=feature_names, columns=['tfidf'])
# # print(tfidf_df2_one.sort_values(by=['tfidf'],ascending=False))


# if ((tfidf_df_one.values == tfidf_df2_one.values).all()):
#     print('Arrays are equal showing tfidf vectorizer returns same result')

### Now we can use `.transform` on our Bag-of-Words (bow) transformed object and transform the entire DataFrame of text file contents. Let's go ahead and check out how the bag-of-words counts for the entire corpus in a large, sparse matrix:

After that you pass the result of the previous step to sklearn's TfidfTransformer  
which will convert them into a feature matrix  
See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

The resulting matrix is in sparse format, we can transform it into dense  
Code prepared for you so you can see what results look like

This is an example result, the matrix will contain lots of zero values, that is expected.  
Some values will be non-zero

    

## Now the Data is Ready for Classifier Usage

### Split Data into Train and Test sets (4 marks)


In [103]:
# combine dfs before creating test and train datasets

tfidf_df.reset_index(inplace=True, drop=True)
df.reset_index(inplace=True, drop=True)

data = pd.concat([df, tfidf_df], axis=1)

In [104]:
train, test = train_test_split(data, test_size=0.2, random_state=1811)

In [105]:
X_train = train.iloc[:, 4:]
y_train = train.iloc[:, 0: 4]
X_test = test.iloc[:, 4:]
y_test = test.iloc[:, 0: 4]

In [106]:
# Reset index in each dataframe 

X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [107]:
# Take classes as separate columns 

class1_train = y_train['Level_1'].astype(str)
class1_test = y_test['Level_1'].astype(str)

class2_train = y_train['Level_2'].astype(str)
class2_test = y_test['Level_2'].astype(str)

class3_train = y_train['Level_3'].astype(str)
class3_test = y_test['Level_3'].astype(str)


## Model training for the three levels (8 marks)

naive bayes class

In [109]:
# Create and save model for level 1
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('accuracy: ', score)

accuracy:  0.6994117647058824


In [110]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('Accuracy: ', score)

Accuracy:  0.7041176470588235


In [111]:
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier().fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('Accuracy: ', score)

with open('level1.pk', 'wb') as cls:
    pickle.dump(classifier, cls)

Accuracy:  0.6717647058823529


The MultinomialNB Model classifier had the greatest accuarcy of the three classifiers tested so using that going forward for model training of class 2 and 3.

## Create and save models for level 2

In [121]:
# in this space you will be fitting models for the level 2 data 
#so you will want to split the test data into sub data for each classification e.g.

#get unique level 1 categories
level1cats = class1_train.unique()

lvl1_cat_indexes = []
lvl1_cat_indexes_test = []
lvl1_unique = []

for index, cat in enumerate(level1cats): 
    print('Level 1 Category: ', cat)
    #get indexes for train data and test data
    a = list(class1_train[class1_train == cat].index)
    b = list(class1_test[class1_test == cat].index)
    lvl1_cat_indexes.append(a)
    lvl1_cat_indexes_test.append(b)

    # some class 2 data only has one unique value for a particular level 1 category so can't create a model
    if class2_train.loc[a].nunique() == 1:
        unique_val = class2_train.loc[a].unique()[0]
        print('ERROR: ONLY ONE UNIQUE VALUE:', unique_val, ' SO SKIP MODEL CREATION \n')
        #put values into array for prediction later
        lvl1_unique.append([cat, unique_val])
        continue
    #create model with train data for unique level 1 category
    classifier = MultinomialNB()
    classifier.fit(X_train.loc[a], class2_train[a])

    score = classifier.score(X_test.loc[b], class2_test[b])
    print('\n Accuracy score for LVL 1 CAT: ', cat, ' SCORE: ', score)

    #save model
    model_name = 'level2_' + cat + '.pk'
    with open(model_name, 'wb') as cls: 
        pickle.dump(classifier, cls)
    # limit = 5
    # if index == limit: 
    #     break

# cat 69286F45 unique val 2D5A3
# D410C91A => ACD06
lvl1_unique

Level 1 Category:  EFEF723B

 Accuracy score for LVL 1 CAT:  EFEF723B  SCORE:  0.852112676056338
Level 1 Category:  90A8B052

 Accuracy score for LVL 1 CAT:  90A8B052  SCORE:  1.0
Level 1 Category:  69286F45
ERROR: ONLY ONE UNIQUE VALUE: 2D5A3  SO SKIP MODEL CREATION 

Level 1 Category:  96F95EEC

 Accuracy score for LVL 1 CAT:  96F95EEC  SCORE:  0.8888888888888888
Level 1 Category:  4C3D8686

 Accuracy score for LVL 1 CAT:  4C3D8686  SCORE:  0.7835051546391752
Level 1 Category:  4513C920

 Accuracy score for LVL 1 CAT:  4513C920  SCORE:  0.7333333333333333
Level 1 Category:  35E04739

 Accuracy score for LVL 1 CAT:  35E04739  SCORE:  0.8223684210526315
Level 1 Category:  09BF5150

 Accuracy score for LVL 1 CAT:  09BF5150  SCORE:  0.664
Level 1 Category:  57164AC1

 Accuracy score for LVL 1 CAT:  57164AC1  SCORE:  0.7248322147651006
Level 1 Category:  AAC8EE56

 Accuracy score for LVL 1 CAT:  AAC8EE56  SCORE:  0.7894736842105263
Level 1 Category:  014303D1

 Accuracy score for LVL 1 CA

[['69286F45', '2D5A3'], ['D410C91A', 'ACD06']]

## Create and save models for level 3

In [122]:
# get unique level 2 categories
level2cats = class2_train.unique()

lvl2_cat_indexes = []
lvl2_cat_indexes_test = []
lvl2_unique = []
for index, cat in enumerate(level2cats): 
    print('Level 2 Category: ', cat)
    #get indexes for train data and test data
    a = list(class2_train[class2_train == cat].index)
    b = list(class2_test[class2_test == cat].index)

    lvl2_cat_indexes.append(a)
    lvl2_cat_indexes_test.append(b)

    # some class 2 data only has one unique value for a particular level 1 category so can't create a model
    if class3_train.loc[a].nunique() == 1:
        unique_val = class3_train.loc[a].unique()[0]
        print('ERROR: ONLY ONE UNIQUE VALUE:', unique_val, ' SO SKIP MODEL CREATION \n')
        #put values into array for prediction later
        lvl2_unique.append([cat, unique_val])
        continue
    #create model with train data for unique level 2 category
    classifier = MultinomialNB()
    classifier.fit(X_train.loc[a], class3_train[a])
    score = classifier.score(X_test.loc[b], class3_test[b])
    print('\n Accuracy score for LVL 2 CAT: ', cat, ' SCORE: ', score)

    #save model
    model_name = 'level3_' + cat + '.pk'
    with open(model_name, 'wb') as cls: 
        pickle.dump(classifier, cls)
    # limit = 2
    # if index == limit: 
    #     break

len(lvl2_unique), len(level2cats)

Level 2 Category:  02FA0

 Accuracy score for LVL 2 CAT:  02FA0  SCORE:  0.35294117647058826
Level 2 Category:  C719A

 Accuracy score for LVL 2 CAT:  C719A  SCORE:  0.43548387096774194
Level 2 Category:  2D5A3

 Accuracy score for LVL 2 CAT:  2D5A3  SCORE:  0.4140625
Level 2 Category:  A04D3

 Accuracy score for LVL 2 CAT:  A04D3  SCORE:  0.3611111111111111
Level 2 Category:  74974

 Accuracy score for LVL 2 CAT:  74974  SCORE:  0.4084507042253521
Level 2 Category:  D5531
ERROR: ONLY ONE UNIQUE VALUE: 6253  SO SKIP MODEL CREATION 

Level 2 Category:  31FED

 Accuracy score for LVL 2 CAT:  31FED  SCORE:  0.75
Level 2 Category:  F4055

 Accuracy score for LVL 2 CAT:  F4055  SCORE:  0.22807017543859648
Level 2 Category:  390F1

 Accuracy score for LVL 2 CAT:  390F1  SCORE:  0.4230769230769231
Level 2 Category:  C7E19

 Accuracy score for LVL 2 CAT:  C7E19  SCORE:  0.33766233766233766
Level 2 Category:  94728

 Accuracy score for LVL 2 CAT:  94728  SCORE:  0.4473684210526316
Level 2 Categ

(11, 36)

## Predict the test set (8 marks)

In [123]:
# Creating an empty Dataframe with column names only
results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## Here we reload the saved models and use them to predict the levels
# load model for level 1 (done for you)
with open('level1.pk', 'rb') as nb:
    model = pickle.load(nb)

## loop through the test data, predict level 1
level1_pred = model.predict(X_test)
results['Level1_Pred'] = level1_pred
results.head()


Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,35E04739,,
1,014303D1,,
2,35E04739,,
3,AAC8EE56,,
4,4513C920,,


In [124]:
# Predict Level 2

# for each category in level 1 predictions => use that plus the model for that category to predict level 2

flat_lvl1_unique = [element for sublist in lvl1_unique for element in sublist]

for index, cat in enumerate(level1cats): 
    print('Level 1 Category: ', cat)
    # get indexes
    a = list(results[results['Level1_Pred']== cat].index)
    # if cat is in the arrayof lvl1_unique => set predicted values to its pair
    if cat in flat_lvl1_unique:
        index = flat_lvl1_unique.index(cat)
        predicted = flat_lvl1_unique[index+1]
        print('Unique Category - no model')
        results['Level2_Pred'].loc[a] =  predicted
        continue
    # get model
    model_name = 'level2_' + cat + '.pk'
    with open(model_name, 'rb') as nb:
        model = pickle.load(nb)
    results['Level2_Pred'].loc[a] =  model.predict(X_test.loc[a])



Level 1 Category:  EFEF723B
Level 1 Category:  90A8B052
Level 1 Category:  69286F45
Unique Category - no model
Level 1 Category:  96F95EEC
Level 1 Category:  4C3D8686
Level 1 Category:  4513C920
Level 1 Category:  35E04739
Level 1 Category:  09BF5150
Level 1 Category:  57164AC1
Level 1 Category:  AAC8EE56
Level 1 Category:  014303D1
Level 1 Category:  2CEC27F1
Level 1 Category:  3E1E0D78
Level 1 Category:  B092BA29
Level 1 Category:  D410C91A
Unique Category - no model


In [125]:
# Predict Level 3

# for each category in level 1 predictions => use that plus the model for that category to predict level 2

flat_lvl2_unique = [element for sublist in lvl2_unique for element in sublist]

for index, cat in enumerate(level2cats): 
    print('Level 2 Category: ', cat)
    # get indexes
    a = list(results[results['Level2_Pred']== cat].index)
    # if category is in the arraykof lvl1_unique => set predicted values to its pair
    if cat in flat_lvl2_unique:
        index = flat_lvl2_unique.index(cat)
        predicted = flat_lvl2_unique[index+1]
        print('Unique Category - no model')
        results['Level3_Pred'].loc[a] =  predicted
        continue
    # get model
    model_name = 'level3_' + cat + '.pk'
    with open(model_name, 'rb') as nb:
        model = pickle.load(nb)
    results['Level3_Pred'].loc[a] =  model.predict(X_test.loc[a])

Level 2 Category:  02FA0
Level 2 Category:  C719A
Level 2 Category:  2D5A3
Level 2 Category:  A04D3
Level 2 Category:  74974
Level 2 Category:  D5531
Unique Category - no model
Level 2 Category:  31FED
Level 2 Category:  F4055
Level 2 Category:  390F1
Level 2 Category:  C7E19
Level 2 Category:  94728
Level 2 Category:  7B638
Level 2 Category:  CB803
Level 2 Category:  914A1
Level 2 Category:  7AED7
Level 2 Category:  BAE8A
Level 2 Category:  9D9EE
Level 2 Category:  ADAD6
Level 2 Category:  375FE
Level 2 Category:  5E038
Unique Category - no model
Level 2 Category:  77F62
Level 2 Category:  9B69F
Level 2 Category:  B2DB4
Level 2 Category:  F824F
Unique Category - no model
Level 2 Category:  ACD06
Level 2 Category:  5A8AB
Level 2 Category:  36080
Level 2 Category:  AF6B9
Unique Category - no model
Level 2 Category:  E69F5
Unique Category - no model
Level 2 Category:  262E7
Unique Category - no model
Level 2 Category:  E6162
Unique Category - no model
Level 2 Category:  6C6B1
Unique Cate

In [126]:
results

Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,35E04739,B2DB4,21DA
1,014303D1,7AED7,BBA5
2,35E04739,390F1,6856
3,AAC8EE56,914A1,D97D
4,4513C920,F4055,5B02
...,...,...,...
1695,AAC8EE56,914A1,D97D
1696,4C3D8686,223B2,8FEF
1697,09BF5150,C7E19,D06E
1698,09BF5150,C7E19,D06E


## Compute Accuracy on each level (4 marks)
Now you have the predictions for each level (in the test data), and you also have the actual levels, you can compute the accurcay

In [127]:
# Level 1 accuracy
print('LEVEL 1 ACCURACY: ', accuracy_score(y_test['Level_1'], level1_pred))


LEVEL 1 ACCURACY:  0.6717647058823529


In [128]:
# Level 2 accuracy
print('LEVEL 2 ACCURACY: ', accuracy_score(y_test['Level_2'], results['Level2_Pred']))


LEVEL 2 ACCURACY:  0.6052941176470589


In [129]:
# Level 3 accuracy
print('LEVEL 3 ACCURACY: ', accuracy_score(y_test['Level_3'], results['Level3_Pred']))

LEVEL 3 ACCURACY:  0.26294117647058823


## Well done!