### Import Libraries

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance_matrix
from scipy.spatial.distance import pdist, squareform

import nltk
from nltk.stem import PorterStemmer

import re
import numpy as np
import pandas as pd
import pickle


### Load and explore the data (4 marks)

In [2]:
data = pd.read_csv('product-cat-dataset.csv')
df = pd.DataFrame(data)
df.describe()

# 15 level_1 classes # 39 level_2 classes # 43 level_3 classes

Unnamed: 0,Description,Level_1,Level_2,Level_3
count,10637,10649,10649,10649
unique,9677,15,39,43
top,glory gorg col fing complet outfit express moo...,B092BA29,2D5A3,28A7
freq,24,900,797,797


In [3]:
df.head()

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,D06E
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,98CF


### Deal with Missing Data (4 marks)

In [4]:
# Check if data has missing values in the Description column
# missing = df['Description'].isnull().index.tolist()

missing_descriptions_indices = df[df['Description'].isnull()].index.tolist()
missing_descriptions_indices
#12 rows missing descriptions

[1063, 3435, 3459, 7763, 7797, 7805, 7817, 7868, 7945, 7971, 7997, 8013]

In [5]:
# Remove missing descriptions rows from dataframe
# df.dropna()
df = df[df['Description'].notna()]
df.shape
#shape is 10637 rows which is 12 less than original 10649 so know we have dropped the correct amount of rows from the dataframe

(10637, 4)

## Create subset of data to workwith as dataset too large

In [6]:
df = df.sample(n = 8000)
df.reset_index(inplace=True, drop=True)
# check shape to make sure correct transformation
print(df.shape)
df.head()

(8000, 4)


Unnamed: 0,Description,Level_1,Level_2,Level_3
0,wrap fash warm frosty air ar zeroxpos versatil...,57164AC1,94728,5912
1,exud feminin wear covington wom sleeveless blo...,2CEC27F1,BAE8A,2ABA
2,prefect smokey look on conveny cas easy beauty...,3E1E0D78,9D9EE,05A0
3,mak diap chang enjoy baby comfort warm wip sto...,4C3D8686,74974,62E8
4,pres recip us fruit veget protein food soy pro...,B092BA29,375FE,1F61


### Drop Classes where the number of instances is < 10 (4 marks)

In [7]:
# Apply to Level_1 
print(df.Level_1.value_counts())
print('Number of Unique Level 1 Categories: ', df.Level_1.nunique())
# 15 total categories for Level 1 - no classes with less than 10 instances

AAC8EE56    686
B092BA29    667
35E04739    663
2CEC27F1    661
57164AC1    657
EFEF723B    596
69286F45    591
09BF5150    588
4C3D8686    455
3E1E0D78    450
96F95EEC    429
4513C920    422
D410C91A    383
90A8B052    379
014303D1    373
Name: Level_1, dtype: int64
Number of Unique Level 1 Categories:  15


In [8]:
# Apply to Level_2

# create mask based on value counts
mask_2 = df.Level_2.value_counts()
# apply mask to dataset
df = df[df['Level_2'].isin(mask_2.index[mask_2>9])]
print('Number of Unique Level 2 Categories: ', df.Level_2.nunique())

df.Level_2.value_counts()

Number of Unique Level 2 Categories:  36


2D5A3    591
ACD06    383
C719A    360
9D9EE    357
914A1    348
74974    344
BAE8A    341
5A8AB    339
9B69F    338
B2DB4    338
94728    329
375FE    328
CB803    325
390F1    321
ADAD6    320
C7E19    319
7B638    314
A04D3    295
F4055    274
7AED7    211
02FA0    208
77F62    162
36080    134
223B2    111
E6162     93
5E038     86
E69F5     81
31FED     67
D5531     63
F824F     50
262E7     46
915D4     32
AF6B9     29
6C6B1     26
08960     19
0864A     13
Name: Level_2, dtype: int64

In [9]:
# Apply to Level_3

#create mask 
mask_3 = df.Level_3.value_counts()
#apply mask
df = df[df['Level_3'].isin(mask_3.index[mask_3>9])]

print('Number of Unique Level 3 Categories: ', df.Level_3.nunique())

# check value counts all above 10 instances
df.Level_3.value_counts()

Number of Unique Level 3 Categories:  38


28A7    591
33D1    383
A0E2    360
05A0    357
D97D    348
62E8    344
2ABA    341
AA6B    339
21DA    338
80C4    338
1F61    328
5912    328
627D    325
6856    321
98CF    320
D06E    319
0F8B    314
C5B4    295
6539    211
078B    208
5AE1    162
1F75    149
C563    134
7C00    125
F213    111
2E14     93
6BE5     86
DDD5     81
6253     63
7288     50
29B3     46
3DD3     45
A2FA     32
A104     29
3AAD     26
215F     22
1000     19
96B8     13
Name: Level_3, dtype: int64

### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [10]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import string
import re
snowball_stemmer = SnowballStemmer(language='english')
porter_stemmer = PorterStemmer()

nltk.download('punkt')

# def my_preprocessor(text):
#     text=text.lower() #lowercase text (done be default if don't use a custom preprocessor)
#     text=re.sub("\\W"," ",text) # remove special chars
#     text=re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ",text) # normalize certain words
    
#     # stem words
#     words=re.split("\\s+",text)
#     stemmed_words=[porter_stemmer.stem(word=word) for word in words]
#     return ' '.join(stemmed_words)

# def scrub_words(text):
#     """Basic cleaning of texts."""
    
#     # remove html markup
#     text=re.sub("(<.*?>)","",text)
    
#     #remove non-ascii and digits
#     text=re.sub("(\\W|\\d)"," ",text)
    
#     #remove whitespace
#     text=text.strip()
#     return text

def scrub_words(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[_]','', text)
    return text

def process_text(text, n = 3):
    # 1. Convert text to lower case and remove all punctuation
    scrubbed_text =scrub_words(text)                
    
    # 2. Tokenize words
    token_words = word_tokenize(scrubbed_text)      

    #3. Apply stemming
    stem_words = [snowball_stemmer.stem(w) for w in token_words] 
    
    # 4. Apply Ngram Tokenisation
    n_grams = ngrams(stem_words, n)                 
    return [' '.join(grams) for grams in n_grams]

def process_text2(text):
    # 1. Convert text to lower case and remove all punctuation
    scrubbed_text =scrub_words(text)                
    
    # 2. Tokenize words
    token_words = word_tokenize(scrubbed_text)      

    #3. Apply stemming
    stem_words = [snowball_stemmer.stem(w) for w in token_words] 
    
    # 4. Apply Ngram Tokenisation
    return ' '.join(stem_words)

    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hugho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Here is an example function call
process_text("Here we're testing the process_text function, results are as follows:")
process_text2("Here we're testing the process_text function, results are as follows:")

'here were test the processtext function result are as follow'

### Now let's apply TF-IDF to extract features from plain text (10 marks)
### Might take a while...
### Here you apply the process_text function to the Description column of the data
### Then you pass the results to the bag of words tranformer
### See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [12]:

# df['bow'] = df['Description'].apply(lambda x: process_text(x, 3))
# df.head()

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from itertools import chain

## Use Count Vectorizer to get Bag of Words with ngram of 3 words 

In [14]:

# descriptions = df['Description'].values

# cv = CountVectorizer(preprocessor=process_text2, ngram_range=(3,3), max_features=15000)
# X = cv.fit_transform(descriptions)
# count_vector = cv.transform(descriptions)

In [16]:
# X.shape # => 10627, 10000 
#10,627 documenst and vocab of 10,000
# count_vector.shape


## Calculate TFIDF Scores and add to DF

In [17]:
# tfidf_transformer = TfidfTransformer()  
# tfidf_transformer.fit(X)

# df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(), columns=['idf_weights'])
# df_idf.sort_values(by=['idf_weights'], ascending=False)

# #compute tfidf scores
# tf_idf_vector = tfidf_transformer.transform(count_vector)
# feature_names = cv.get_feature_names()

# #get tfidf vector for fist doc
# first_descrip_vector = tf_idf_vector[0]

# #print the scores for first description
# tfidf_df_one = pd.DataFrame(first_descrip_vector.T.todense(), index=feature_names, columns=['tfidf'])
# tfidf_df_one.sort_values(by=['tfidf'],ascending=False)

# #print scores for all descriptions
# tfidf_df = pd.DataFrame(tf_idf_vector.todense(), columns=feature_names)
# tfidf_df.head()
# tfidf_df.shape
# #10,000 vocabs columns / 10627 descriptions rows


## Use TFIDF Vectorizer to do same thing as combining Count Vectorizer and TFIDF Transformer

In [18]:
vectorizer = TfidfVectorizer(ngram_range=(3,3), preprocessor=process_text2, max_features=15000)
X = vectorizer.fit_transform(df['Description'].values)

tfidf_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())


In [19]:
print(tfidf_df.shape)
tfidf_df.head()

(7994, 15000)


Unnamed: 0,00 ct tgw,000 first print,000 photo 10,01number pag 192bind,01number pag bind,02 05number pag,03 01number pag,04 01number pag,05 01number pag,056 car bril,...,zip pullov part,zip real fur,zip sid pocket,zip stash pocket,zip two front,ziploc brand dispo,zon afric also,zon ant col,zon babi us,zon provid gre
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Check to see if they are the same

In [20]:
# tfidf_df_one = pd.DataFrame(first_descrip_vector.T.todense(), index=feature_names, columns=['tfidf'])
# # print(tfidf_df_one.sort_values(by=['tfidf'],ascending=False))

# tfidf_df2_one = pd.DataFrame(X2[0].T.todense(), index=feature_names, columns=['tfidf'])
# # print(tfidf_df2_one.sort_values(by=['tfidf'],ascending=False))


# if ((tfidf_df_one.values == tfidf_df2_one.values).all()):
#     print('Arrays are equal showing tfidf vectorizer returns same result')

### Now we can use `.transform` on our Bag-of-Words (bow) transformed object and transform the entire DataFrame of text file contents. Let's go ahead and check out how the bag-of-words counts for the entire corpus in a large, sparse matrix:

After that you pass the result of the previous step to sklearn's TfidfTransformer  
which will convert them into a feature matrix  
See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

The resulting matrix is in sparse format, we can transform it into dense  
Code prepared for you so you can see what results look like

This is an example result, the matrix will contain lots of zero values, that is expected.  
Some values will be non-zero

    

## Now the Data is Ready for Classifier Usage

### Split Data into Train and Test sets (4 marks)


In [21]:
# combine dfs before creating test and train datasets
## REMOVE THESE COMMENTS BEFORE SUBMISSION
# === not sure if actually need to do this ===
#probably using same random_state paramater would result in same rows being used but whatever

tfidf_df.reset_index(inplace=True, drop=True)
df.reset_index(inplace=True, drop=True)

data = pd.concat([df, tfidf_df], axis=1)



In [22]:
train, test = train_test_split(data, test_size=0.2, random_state=1811)

In [23]:
X_train = train.iloc[:, 4:]
y_train = train.iloc[:, 0: 4]
X_test = test.iloc[:, 4:]
y_test = test.iloc[:, 0: 4]

In [24]:
# You might need to reset index in each dataframe (depends on you how you do things)
# done for you to make it clearer
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [25]:
# You might need to take classes as separate columns (depends on you how you do things)
from sklearn.preprocessing import MinMaxScaler

class1_train = y_train['Level_1'].astype(str)
class1_test = y_test['Level_1'].astype(str)

class2_train = y_train['Level_2'].astype(str)
class2_test = y_test['Level_2'].astype(str)

class3_train = y_train['Level_3'].astype(str)
class3_test = y_test['Level_3'].astype(str)

# min_max_scaler = MinMaxScaler()

# X_train_min_max = min_max_scaler.fit_transform(X_train)
# X_test_min_max = min_max_scaler.fit_transform(X_test)

# y_train[y_train['Level_1'].isnull()].index.tolist()
# 
# missing_descriptions_indices = df[df['Description'].isnull()].index.tolist()

(3988,)

## Model training for the three levels (8 marks)

naive bayes class

In [26]:
# nb = GuassianNB()
# nb.fit(X, class1)
# with open('level1.pk') as l1clf:
#   pickle.dump(nb, l1clf)

In [27]:
# Create and save model for level 1
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('accuracy: ', score)

KeyboardInterrupt: 

In [38]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('Accuracy: ', score)

Accuracy:  0.6735459662288931


In [39]:
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier().fit(X_train, class1_train)
score = classifier.score(X_test, class1_test)
print('Accuracy: ', score)

with open('level1.pk', 'wb') as cls:
    pickle.dump(classifier, cls)

Accuracy:  0.724202626641651


The Linear Model classifier had the greatest accuarcy of the three classifiers tested so using that going forward for model training of class 2 and 3.

## Create and save models for level 2

In [40]:
# in this space you will be fitting models for the level 2 data 
#so you will want to split the test data into sub data for each classification e.g.

#get unique level 1 categories
level1cats = class1_train.unique()

lvl1_cat_indexes = []
lvl1_cat_indexes_test = []
lvl1_unique = []

for index, cat in enumerate(level1cats): 
    print('Level 1 Category: ', cat)
    #get indexes for train data and test data
    a = list(class1_train[class1_train == cat].index)
    b = list(class1_test[class1_test == cat].index)
    lvl1_cat_indexes.append(a)
    lvl1_cat_indexes_test.append(b)

    # some class 2 data only has one unique value for a particular level 1 category so can't create a model
    if class2_train.loc[a].nunique() == 1:
        unique_val = class2_train.loc[a].unique()[0]
        print('ERROR: ONLY ONE UNIQUE VALUE:', unique_val, ' SO SKIP MODEL CREATION \n')
        #put values into array for prediction later
        lvl1_unique.append([cat, unique_val])
        continue
    #create model with train data for unique level 1 category
    classifier = SGDClassifier()
    classifier.fit(X_train.loc[a], class2_train[a])

    score = classifier.score(X_test.loc[b], class2_test[b])
    print('\n Accuracy score for LVL 1 CAT: ', cat, ' SCORE: ', score)

    #save model
    model_name = 'level2_' + cat + '.pk'
    with open(model_name, 'wb') as cls: 
        pickle.dump(classifier, cls)
    # limit = 5
    # if index == limit: 
    #     break

# cat 69286F45 unique val 2D5A3
# D410C91A => ACD06
lvl1_unique

Level 1 Category:  3E1E0D78

 Accuracy score for LVL 1 CAT:  3E1E0D78  SCORE:  0.85
Level 1 Category:  2CEC27F1

 Accuracy score for LVL 1 CAT:  2CEC27F1  SCORE:  0.905982905982906
Level 1 Category:  D410C91A
ERROR: ONLY ONE UNIQUE VALUE: ACD06  SO SKIP MODEL CREATION 

Level 1 Category:  014303D1

 Accuracy score for LVL 1 CAT:  014303D1  SCORE:  0.8831168831168831
Level 1 Category:  AAC8EE56

 Accuracy score for LVL 1 CAT:  AAC8EE56  SCORE:  0.8142857142857143
Level 1 Category:  4513C920

 Accuracy score for LVL 1 CAT:  4513C920  SCORE:  0.9191919191919192
Level 1 Category:  B092BA29

 Accuracy score for LVL 1 CAT:  B092BA29  SCORE:  0.55
Level 1 Category:  96F95EEC

 Accuracy score for LVL 1 CAT:  96F95EEC  SCORE:  0.8041237113402062
Level 1 Category:  09BF5150

 Accuracy score for LVL 1 CAT:  09BF5150  SCORE:  0.6666666666666666
Level 1 Category:  35E04739

 Accuracy score for LVL 1 CAT:  35E04739  SCORE:  0.8
Level 1 Category:  57164AC1

 Accuracy score for LVL 1 CAT:  57164AC1  S

[['D410C91A', 'ACD06'], ['69286F45', '2D5A3']]

## Create and save models for level 3

In [41]:
# get unique level 2 categories
level2cats = class2_train.unique()

lvl2_cat_indexes = []
lvl2_cat_indexes_test = []
lvl2_unique = []
for index, cat in enumerate(level2cats): 
    print('Level 2 Category: ', cat)
    #get indexes for train data and test data
    a = list(class2_train[class2_train == cat].index)
    b = list(class2_test[class2_test == cat].index)

    lvl2_cat_indexes.append(a)
    lvl2_cat_indexes_test.append(b)

    # some class 2 data only has one unique value for a particular level 1 category so can't create a model
    if class3_train.loc[a].nunique() == 1:
        unique_val = class3_train.loc[a].unique()[0]
        print('ERROR: ONLY ONE UNIQUE VALUE:', unique_val, ' SO SKIP MODEL CREATION \n')
        #put values into array for prediction later
        lvl2_unique.append([cat, unique_val])
        continue
    #create model with train data for unique level 2 category
    classifier = SGDClassifier()
    classifier.fit(X_train.loc[a], class3_train[a])
    score = classifier.score(X_test.loc[b], class3_test[b])
    print('\n Accuracy score for LVL 2 CAT: ', cat, ' SCORE: ', score)

    #save model
    model_name = 'level3_' + cat + '.pk'
    with open(model_name, 'wb') as cls: 
        pickle.dump(classifier, cls)
    # limit = 2
    # if index == limit: 
    #     break

len(lvl2_unique), len(level2cats)

Level 2 Category:  9D9EE
ERROR: ONLY ONE UNIQUE VALUE: 05A0  SO SKIP MODEL CREATION 

Level 2 Category:  ADAD6
ERROR: ONLY ONE UNIQUE VALUE: 98CF  SO SKIP MODEL CREATION 

Level 2 Category:  ACD06
ERROR: ONLY ONE UNIQUE VALUE: 33D1  SO SKIP MODEL CREATION 

Level 2 Category:  7AED7
ERROR: ONLY ONE UNIQUE VALUE: 6539  SO SKIP MODEL CREATION 

Level 2 Category:  914A1
ERROR: ONLY ONE UNIQUE VALUE: D97D  SO SKIP MODEL CREATION 

Level 2 Category:  BAE8A
ERROR: ONLY ONE UNIQUE VALUE: 2ABA  SO SKIP MODEL CREATION 

Level 2 Category:  31FED

 Accuracy score for LVL 2 CAT:  31FED  SCORE:  0.7692307692307693
Level 2 Category:  5A8AB
ERROR: ONLY ONE UNIQUE VALUE: AA6B  SO SKIP MODEL CREATION 

Level 2 Category:  375FE
ERROR: ONLY ONE UNIQUE VALUE: 1F61  SO SKIP MODEL CREATION 

Level 2 Category:  36080
ERROR: ONLY ONE UNIQUE VALUE: C563  SO SKIP MODEL CREATION 

Level 2 Category:  F824F
ERROR: ONLY ONE UNIQUE VALUE: 7288  SO SKIP MODEL CREATION 

Level 2 Category:  B2DB4
ERROR: ONLY ONE UNIQUE 

(34, 36)

## Predict the test set (8 marks)

In [42]:
# Creating an empty Dataframe with column names only (depends on you how you do things)
results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## Here we reload the saved models and use them to predict the levels
# load model for level 1 (done for you)
with open('level1.pk', 'rb') as nb:
    model = pickle.load(nb)

## loop through the test data, predict level 1, then based on that predict level 2
## and based on level 2 predict level 3 (you need to load saved models accordingly)
level1_pred = model.predict(X_test)
results['Level1_Pred'] = level1_pred
results.head()


Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,90A8B052,,
1,B092BA29,,
2,EFEF723B,,
3,96F95EEC,,
4,3E1E0D78,,


In [43]:
# for each category in level 1 predictions => use that plus the model for that category to predict level 2

flat_lvl1_unique = [element for sublist in lvl1_unique for element in sublist]
print(flat_lvl1_unique)
for index, cat in enumerate(level1cats): 
    print('Level 1 Category: ', cat)
    # get indexes
    a = list(results[results['Level1_Pred']== cat].index)
    # if cat is in the arrayof lvl1_unique => set predicted values to its pair
    if cat in flat_lvl1_unique:
        index = flat_lvl1_unique.index(cat)
        predicted = flat_lvl1_unique[index+1]
        print('Unique Category - no model')
        results['Level2_Pred'].loc[a] =  predicted
        continue
    # get model
    model_name = 'level2_' + cat + '.pk'
    with open(model_name, 'rb') as nb:
        model = pickle.load(nb)
    results['Level2_Pred'].loc[a] =  model.predict(X_test.loc[a])
    # print(model.predict(X_test.loc[a]))
    # limit = 2
    # if index == limit: 
    #     break



['D410C91A', 'ACD06', '69286F45', '2D5A3']
Level 1 Category:  3E1E0D78
Level 1 Category:  2CEC27F1
Level 1 Category:  D410C91A
Unique Category - no model
Level 1 Category:  014303D1
Level 1 Category:  AAC8EE56
Level 1 Category:  4513C920
Level 1 Category:  B092BA29
Level 1 Category:  96F95EEC
Level 1 Category:  09BF5150
Level 1 Category:  35E04739
Level 1 Category:  57164AC1
Level 1 Category:  EFEF723B
Level 1 Category:  4C3D8686
Level 1 Category:  69286F45
Unique Category - no model
Level 1 Category:  90A8B052


In [44]:
# for each category in level 1 predictions => use that plus the model for that category to predict level 2

flat_lvl2_unique = [element for sublist in lvl2_unique for element in sublist]
# print(flat_lvl1_unique)
for index, cat in enumerate(level2cats): 
    print('Level 2 Category: ', cat)
    # get indexes
    a = list(results[results['Level2_Pred']== cat].index)
    # if cat is in the arrayof lvl1_unique => set predicted values to its pair
    if cat in flat_lvl2_unique:
        index = flat_lvl2_unique.index(cat)
        predicted = flat_lvl2_unique[index+1]
        print('Unique Category - no model')
        results['Level3_Pred'].loc[a] =  predicted
        continue
    # get model
    model_name = 'level3_' + cat + '.pk'
    with open(model_name, 'rb') as nb:
        model = pickle.load(nb)
    results['Level3_Pred'].loc[a] =  model.predict(X_test.loc[a])
    # print(model.predict(X_test.loc[a]))
    # limit = 2
    # if index == limit: 
    #     break

Level 2 Category:  9D9EE
Unique Category - no model
Level 2 Category:  ADAD6
Unique Category - no model
Level 2 Category:  ACD06
Unique Category - no model
Level 2 Category:  7AED7
Unique Category - no model
Level 2 Category:  914A1
Unique Category - no model
Level 2 Category:  BAE8A
Unique Category - no model
Level 2 Category:  31FED
Level 2 Category:  5A8AB
Unique Category - no model
Level 2 Category:  375FE
Unique Category - no model
Level 2 Category:  36080
Unique Category - no model
Level 2 Category:  F824F
Unique Category - no model
Level 2 Category:  B2DB4
Unique Category - no model
Level 2 Category:  C7E19
Unique Category - no model
Level 2 Category:  7B638
Unique Category - no model
Level 2 Category:  02FA0
Unique Category - no model
Level 2 Category:  74974
Unique Category - no model
Level 2 Category:  77F62
Unique Category - no model
Level 2 Category:  9B69F
Unique Category - no model
Level 2 Category:  94728
Unique Category - no model
Level 2 Category:  F4055
Level 2 Catego

In [359]:
# results['Level3_Pred'].value_counts()


In [45]:
results

Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,90A8B052,08960,1000
1,B092BA29,5A8AB,AA6B
2,EFEF723B,CB803,627D
3,96F95EEC,A04D3,C5B4
4,3E1E0D78,E6162,2E14
...,...,...,...
1594,014303D1,7AED7,6539
1595,57164AC1,7B638,0F8B
1596,D410C91A,ACD06,33D1
1597,D410C91A,ACD06,33D1


In [27]:
## After you add the predictions to the results dataframe
## they should look like this
results

Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,2CEC27F1,BAE8A,2ABA
1,2CEC27F1,BAE8A,2ABA
2,09BF5150,C7E19,D06E
3,4513C920,F4055,7C00
4,4513C920,F4055,7C00
...,...,...,...
2121,09BF5150,262E7,29B3
2122,69286F45,2D5A3,28A7
2123,3E1E0D78,9D9EE,05A0
2124,57164AC1,7B638,0F8B


## Compute Accuracy on each level (4 marks)
Now you have the predictions for each level (in the test data), and you also have the actual levels, you can compute the accurcay

In [46]:
# Level 1 accuracy
print('LEVEL 1 ACCURACY: ', accuracy_score(y_test['Level_1'], level1_pred))


LEVEL 1 ACCURACY:  0.724202626641651


In [47]:
# Level 2 accuracy
print('LEVEL 2 ACCURACY: ', accuracy_score(y_test['Level_2'], results['Level2_Pred']))


LEVEL 2 ACCURACY:  0.6216385240775485


In [48]:
# Level 3 accuracy
print('LEVEL 3 ACCURACY: ', accuracy_score(y_test['Level_3'], results['Level3_Pred']))

LEVEL 3 ACCURACY:  0.6128830519074422


## Well done!