# Course Work - Statistics and Statistics Mining
## Patrick O'Neill




### Import Libraries

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance_matrix
from scipy.spatial.distance import pdist, squareform

import nltk
from nltk.stem import PorterStemmer

import re
import numpy as np
import pandas as pd
import pickle


### Load and explore the data (4 marks)

In [2]:
data = pd.read_csv('product-category-dataset-improved.csv')
df = pd.DataFrame(data)
df.describe()

# 15 level_1 classes # 36 level_2 classes #94 level_3 classes

Unnamed: 0,Description,Level_1,Level_2,Level_3
count,10627,10639,10639,10639
unique,9668,15,36,94
top,glory gorg col fing complet outfit express moo...,B092BA29,2D5A3,28A7
freq,24,900,797,332


In [3]:
df.head()

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918


### Deal with Missing Data (4 marks)

In [4]:
# Check if data has missing values in the Description column

missing_descriptions_indices = df[df['Description'].isnull()].index.tolist()

print('There are', len(missing_descriptions_indices), 'missing indices')


There are 12 missing indices


In [5]:
# Remove missing descriptions rows from dataframe
df = df[df['Description'].notna()]
df.shape


(10627, 4)

Shape is 10627 rows which is 12 less than original 10639 so know we have dropped the correct amount of rows from the dataframe

## Create subset of data to workwith as dataset too large

In [6]:
# take sample of 8500 of total count to reduce computational load for the classifiers

df = df.sample(n = 8500)
df.reset_index(inplace=True, drop=True)

# check shape to make sure correct transformation
print(df.shape)
df.head()

(8500, 4)


Unnamed: 0,Description,Level_1,Level_2,Level_3
0,sweet park bask sunshin princess picn girl whi...,AAC8EE56,914A1,F72B
1,styl jean comfy pj soft gen stretchable cut br...,EFEF723B,CB803,2C15
2,styl littl girl sur ad 17 piec toddl girl hair...,09BF5150,C7E19,FDCF
3,vert seam sculpt suppl leath scub jacket acc t...,57164AC1,94728,5912
4,find man ped target com revlon nail enamel del...,D410C91A,ACD06,33D1


### Drop Classes where the number of instances is < 10 (4 marks)

In [7]:
# Apply to Level_1 
print(df.Level_1.value_counts())
print('Number of Unique Level 1 Categories: ', df.Level_1.nunique())
# No classes have less than 10 instances

B092BA29    713
57164AC1    704
35E04739    702
AAC8EE56    698
2CEC27F1    674
09BF5150    646
EFEF723B    645
69286F45    632
96F95EEC    482
4C3D8686    480
3E1E0D78    453
4513C920    449
D410C91A    415
014303D1    406
90A8B052    401
Name: Level_1, dtype: int64
Number of Unique Level 1 Categories:  15


There are 15 level 1 classes all of which have more than 10 instances. 
No classes will be dropped from level 1.

In [8]:
# Apply to Level_2

# create mask based on value counts
mask_2 = df.Level_2.value_counts()
# apply mask to dataset
df = df[df['Level_2'].isin(mask_2.index[mask_2>9])]
print('Number of Unique Level 2 Categories: ', df.Level_2.nunique())

#confirm no classes left have fewer than 10 instances
df.Level_2.value_counts()

Number of Unique Level 2 Categories:  36


2D5A3    632
ACD06    415
C719A    381
74974    378
9D9EE    371
B2DB4    365
9B69F    362
5A8AB    359
375FE    354
94728    353
CB803    353
C7E19    350
BAE8A    349
A04D3    341
7B638    338
390F1    337
914A1    336
ADAD6    325
F4055    290
7AED7    219
02FA0    219
77F62    187
36080    141
223B2    102
E69F5     91
5E038     89
E6162     82
D5531     73
31FED     68
F824F     55
262E7     52
915D4     39
AF6B9     33
6C6B1     28
08960     20
0864A     13
Name: Level_2, dtype: int64

All remaining level 2 classes have more than 10 instances.

In [9]:
# Apply to Level_3

#create mask 
mask_3 = df.Level_3.value_counts()
#apply mask
df = df[df['Level_3'].isin(mask_3.index[mask_3>9])]

print('Number of Unique Level 3 Categories: ', df.Level_3.nunique())

# check value counts all above 10 instances
df.Level_3.value_counts()

Number of Unique Level 3 Categories:  94


28A7    251
2CFE    185
33D1    184
BB6B    182
62E8    181
       ... 
98A8     25
1000     20
74C9     16
D55B     16
96B8     13
Name: Level_3, Length: 94, dtype: int64

All remaining level 3 classes have more than 10 instances.

### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [11]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import string
import re
snowball_stemmer = SnowballStemmer(language='english')
porter_stemmer = PorterStemmer()

nltk.download('punkt')


def scrub_words(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[_]','', text)
    return text

def process_text(text):
    # 1. Convert text to lower case and remove all punctuation
    scrubbed_text =scrub_words(text)                
    
    # 2. Tokenize words
    token_words = word_tokenize(scrubbed_text)      

    #3. Apply stemming
    stem_words = [snowball_stemmer.stem(w) for w in token_words] 
    
    # 4. Apply Ngram Tokenisation
    return ' '.join(stem_words)

    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hugho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Here is an example function call

process_text("Here we're testing the process_text function, results are as follows:")

'here were test the processtext function result are as follow'

## Apply TF-IDF to extract features from plain text (10 marks)


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from itertools import chain

## Use TFIDF Vectorizer which combines the process of using Count Vectorizer followed and TFIDF Transformer into one Process

In [14]:
vectorizer = TfidfVectorizer(ngram_range=(3,3), preprocessor=process_text, max_features=15000)

X = vectorizer.fit_transform(df['Description'].values)

tfidf_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())


In [15]:
print(tfidf_df.shape)
tfidf_df.head()

(8500, 15000)


Unnamed: 0,00 ct tgw,000 first print,000 photo 10,01number pag 192bind,01number pag bind,03 01number pag,03 15number pag,03 20number pag,04 01number pag,04 03number pag,...,zip princess seam,zip pullov part,zip sid pocket,zip slid comfort,zip stash pocket,zip two front,zon afric also,zon ant col,zon babi us,zon provid gre
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Now the Data is Ready for Classifier Usage

### Split Data into Train and Test sets (4 marks)


In [16]:
# combine dfs before creating test and train datasets

tfidf_df.reset_index(inplace=True, drop=True)
df.reset_index(inplace=True, drop=True)

data = pd.concat([df, tfidf_df], axis=1)

In [17]:
train, test = train_test_split(data, test_size=0.2, random_state=1811)

In [18]:
X_train = train.iloc[:, 4:]
y_train = train.iloc[:, 0: 4]
X_test = test.iloc[:, 4:]
y_test = test.iloc[:, 0: 4]

In [19]:
# Reset index in each dataframe 

X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [20]:
# Take classes as separate columns 

class1_train = y_train['Level_1'].astype(str)
class1_test = y_test['Level_1'].astype(str)

class2_train = y_train['Level_2'].astype(str)
class2_test = y_test['Level_2'].astype(str)

class3_train = y_train['Level_3'].astype(str)
class3_test = y_test['Level_3'].astype(str)


## Model training for the three levels (8 marks)


In [21]:
# Create and save model for level 1
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('accuracy: ', score)

accuracy:  0.6894117647058824


In [22]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('Accuracy: ', score)

Accuracy:  0.691764705882353


In [23]:
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier().fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('Accuracy: ', score)

with open('level1.pk', 'wb') as cls:
    pickle.dump(classifier, cls)

Accuracy:  0.6823529411764706


The MultinomialNB Model classifier had the greatest accuarcy of the three classifiers tested so using that going forward for model training of class 2 and 3.

## Create and save models for level 2

In [24]:
# in this space you will be fitting models for the level 2 data 
#so you will want to split the test data into sub data for each classification e.g.

#get unique level 1 categories
level1cats = class1_train.unique()

lvl1_cat_indexes = []
lvl1_cat_indexes_test = []
lvl1_unique = []

for index, cat in enumerate(level1cats): 
    print('Level 1 Category: ', cat)
    #get indexes for train data and test data
    a = list(class1_train[class1_train == cat].index)
    b = list(class1_test[class1_test == cat].index)
    lvl1_cat_indexes.append(a)
    lvl1_cat_indexes_test.append(b)

    # some class 2 data only has one unique value for a particular level 1 category so can't create a model
    if class2_train.loc[a].nunique() == 1:
        unique_val = class2_train.loc[a].unique()[0]
        print('ERROR: ONLY ONE UNIQUE VALUE:', unique_val, ' SO SKIP MODEL CREATION \n')
        #put values into array for prediction later
        lvl1_unique.append([cat, unique_val])
        continue
    #create model with train data for unique level 1 category
    classifier = MultinomialNB()
    classifier.fit(X_train.loc[a], class2_train[a])

    score = classifier.score(X_test.loc[b], class2_test[b])
    print('\n Accuracy score for LVL 1 CAT: ', cat, ' SCORE: ', score)

    #save model
    model_name = 'level2_' + cat + '.pk'
    with open(model_name, 'wb') as cls: 
        pickle.dump(classifier, cls)


Level 1 Category:  90A8B052

 Accuracy score for LVL 1 CAT:  90A8B052  SCORE:  1.0
Level 1 Category:  AAC8EE56

 Accuracy score for LVL 1 CAT:  AAC8EE56  SCORE:  0.7727272727272727
Level 1 Category:  96F95EEC

 Accuracy score for LVL 1 CAT:  96F95EEC  SCORE:  0.7674418604651163
Level 1 Category:  09BF5150

 Accuracy score for LVL 1 CAT:  09BF5150  SCORE:  0.6
Level 1 Category:  35E04739

 Accuracy score for LVL 1 CAT:  35E04739  SCORE:  0.7354838709677419
Level 1 Category:  B092BA29

 Accuracy score for LVL 1 CAT:  B092BA29  SCORE:  0.6159420289855072
Level 1 Category:  2CEC27F1

 Accuracy score for LVL 1 CAT:  2CEC27F1  SCORE:  0.9296875
Level 1 Category:  EFEF723B

 Accuracy score for LVL 1 CAT:  EFEF723B  SCORE:  0.7364341085271318
Level 1 Category:  4513C920

 Accuracy score for LVL 1 CAT:  4513C920  SCORE:  0.803921568627451
Level 1 Category:  69286F45
ERROR: ONLY ONE UNIQUE VALUE: 2D5A3  SO SKIP MODEL CREATION 

Level 1 Category:  D410C91A
ERROR: ONLY ONE UNIQUE VALUE: ACD06  SO 

## Create and save models for level 3

In [25]:
# get unique level 2 categories
level2cats = class2_train.unique()

lvl2_cat_indexes = []
lvl2_cat_indexes_test = []
lvl2_unique = []
for index, cat in enumerate(level2cats): 
    print('Level 2 Category: ', cat)
    #get indexes for train data and test data
    a = list(class2_train[class2_train == cat].index)
    b = list(class2_test[class2_test == cat].index)

    lvl2_cat_indexes.append(a)
    lvl2_cat_indexes_test.append(b)

    # some class 2 data only has one unique value for a particular level 1 category so can't create a model
    if class3_train.loc[a].nunique() == 1:
        unique_val = class3_train.loc[a].unique()[0]
        print('ERROR: ONLY ONE UNIQUE VALUE:', unique_val, ' SO SKIP MODEL CREATION \n')
        #put values into array for prediction later
        lvl2_unique.append([cat, unique_val])
        continue
    #create model with train data for unique level 2 category
    classifier = MultinomialNB()
    classifier.fit(X_train.loc[a], class3_train[a])
    score = classifier.score(X_test.loc[b], class3_test[b])
    print('\n Accuracy score for LVL 2 CAT: ', cat, ' SCORE: ', score)

    #save model
    model_name = 'level3_' + cat + '.pk'
    with open(model_name, 'wb') as cls: 
        pickle.dump(classifier, cls)


Level 2 Category:  C719A

 Accuracy score for LVL 2 CAT:  C719A  SCORE:  0.36764705882352944
Level 2 Category:  9B69F

 Accuracy score for LVL 2 CAT:  9B69F  SCORE:  0.4098360655737705
Level 2 Category:  A04D3

 Accuracy score for LVL 2 CAT:  A04D3  SCORE:  0.3225806451612903
Level 2 Category:  C7E19

 Accuracy score for LVL 2 CAT:  C7E19  SCORE:  0.4444444444444444
Level 2 Category:  B2DB4

 Accuracy score for LVL 2 CAT:  B2DB4  SCORE:  0.34210526315789475
Level 2 Category:  5A8AB

 Accuracy score for LVL 2 CAT:  5A8AB  SCORE:  0.44285714285714284
Level 2 Category:  BAE8A

 Accuracy score for LVL 2 CAT:  BAE8A  SCORE:  0.35135135135135137
Level 2 Category:  D5531
ERROR: ONLY ONE UNIQUE VALUE: 6253  SO SKIP MODEL CREATION 

Level 2 Category:  F4055

 Accuracy score for LVL 2 CAT:  F4055  SCORE:  0.28378378378378377
Level 2 Category:  2D5A3

 Accuracy score for LVL 2 CAT:  2D5A3  SCORE:  0.32116788321167883
Level 2 Category:  CB803

 Accuracy score for LVL 2 CAT:  CB803  SCORE:  0.35937

## Predict the test set (8 marks)

## Predict Level 1

In [26]:
# Creating an empty Dataframe with column names only
results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## Here we reload the saved models and use them to predict the levels
# load model for level 1 (done for you)
with open('level1.pk', 'rb') as nb:
    model = pickle.load(nb)

## loop through the test data, predict level 1
level1_pred = model.predict(X_test)
results['Level1_Pred'] = level1_pred
results.head()


Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,3E1E0D78,,
1,3E1E0D78,,
2,3E1E0D78,,
3,3E1E0D78,,
4,57164AC1,,


## Predict Level 2

In [27]:
# for each category in level 1 predictions => use that plus the model for that category to predict level 2

flat_lvl1_unique = [element for sublist in lvl1_unique for element in sublist]

for index, cat in enumerate(level1cats): 
    print('Level 1 Category: ', cat)
    # get indexes
    a = list(results[results['Level1_Pred']== cat].index)
    # if cat is in the arrayof lvl1_unique => set predicted values to its pair
    if cat in flat_lvl1_unique:
        index = flat_lvl1_unique.index(cat)
        predicted = flat_lvl1_unique[index+1]
        print('Unique Category - no model')
        results['Level2_Pred'].loc[a] =  predicted
        continue
    # get model
    model_name = 'level2_' + cat + '.pk'
    with open(model_name, 'rb') as nb:
        model = pickle.load(nb)
    results['Level2_Pred'].loc[a] =  model.predict(X_test.loc[a])



Level 1 Category:  90A8B052
Level 1 Category:  AAC8EE56
Level 1 Category:  96F95EEC
Level 1 Category:  09BF5150
Level 1 Category:  35E04739
Level 1 Category:  B092BA29
Level 1 Category:  2CEC27F1
Level 1 Category:  EFEF723B
Level 1 Category:  4513C920
Level 1 Category:  69286F45
Unique Category - no model
Level 1 Category:  D410C91A
Unique Category - no model
Level 1 Category:  57164AC1
Level 1 Category:  014303D1
Level 1 Category:  4C3D8686
Level 1 Category:  3E1E0D78


## Predict Level 3

In [28]:
# for each category in level 1 predictions => use that plus the model for that category to predict level 2

flat_lvl2_unique = [element for sublist in lvl2_unique for element in sublist]

for index, cat in enumerate(level2cats): 
    print('Level 2 Category: ', cat)
    # get indexes
    a = list(results[results['Level2_Pred']== cat].index)
    # if category is in the arraykof lvl1_unique => set predicted values to its pair
    if cat in flat_lvl2_unique:
        index = flat_lvl2_unique.index(cat)
        predicted = flat_lvl2_unique[index+1]
        print('Unique Category - no model')
        results['Level3_Pred'].loc[a] =  predicted
        continue
    # get model
    model_name = 'level3_' + cat + '.pk'
    with open(model_name, 'rb') as nb:
        model = pickle.load(nb)
    results['Level3_Pred'].loc[a] =  model.predict(X_test.loc[a])

Level 2 Category:  C719A
Level 2 Category:  9B69F
Level 2 Category:  A04D3
Level 2 Category:  C7E19
Level 2 Category:  B2DB4
Level 2 Category:  5A8AB
Level 2 Category:  BAE8A
Level 2 Category:  D5531
Unique Category - no model
Level 2 Category:  F4055
Level 2 Category:  2D5A3
Level 2 Category:  CB803
Level 2 Category:  ACD06
Level 2 Category:  7B638
Level 2 Category:  AF6B9
Unique Category - no model
Level 2 Category:  7AED7
Level 2 Category:  ADAD6
Level 2 Category:  E69F5
Unique Category - no model
Level 2 Category:  77F62
Level 2 Category:  F824F
Unique Category - no model
Level 2 Category:  390F1
Level 2 Category:  31FED
Level 2 Category:  223B2
Level 2 Category:  375FE
Level 2 Category:  08960
Unique Category - no model
Level 2 Category:  914A1
Level 2 Category:  36080
Level 2 Category:  9D9EE
Level 2 Category:  94728
Level 2 Category:  74974
Level 2 Category:  915D4
Unique Category - no model
Level 2 Category:  02FA0
Level 2 Category:  E6162
Unique Category - no model
Level 2 Cat

In [29]:
results

Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,3E1E0D78,9D9EE,05A0
1,3E1E0D78,9D9EE,05A0
2,3E1E0D78,9D9EE,05A0
3,3E1E0D78,9D9EE,05A0
4,57164AC1,7B638,2C26
...,...,...,...
1695,3E1E0D78,9D9EE,05A0
1696,3E1E0D78,9D9EE,05A0
1697,3E1E0D78,9D9EE,05A0
1698,4C3D8686,74974,DAEA


## Compute Accuracy on each level (4 marks)
Now you have the predictions for each level (in the test data), and you also have the actual levels, you can compute the accurcay

In [30]:
# Level 1 accuracy
print('LEVEL 1 ACCURACY: ', accuracy_score(y_test['Level_1'], level1_pred))


LEVEL 1 ACCURACY:  0.6823529411764706


In [31]:
# Level 2 accuracy
print('LEVEL 2 ACCURACY: ', accuracy_score(y_test['Level_2'], results['Level2_Pred']))


LEVEL 2 ACCURACY:  0.6064705882352941


In [32]:
# Level 3 accuracy
print('LEVEL 3 ACCURACY: ', accuracy_score(y_test['Level_3'], results['Level3_Pred']))

LEVEL 3 ACCURACY:  0.23294117647058823


## Well done!