In [1]:
import string
import pickle
from IPython.display import display

# data manipulation & vizualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, StackingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, \
f1_score, recall_score, roc_auc_score, precision_score, make_scorer
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [2]:
# reading the corpus of cleaned data and asigning each column to the new dataframe

with open('corpus.txt', 'rt', encoding='utf-8') as file:
    description = list(file.readlines())
    
with open('final-categories.txt', 'rt') as file:
    primary_category = list(file.readlines())
    
df = pd.DataFrame()
df["description"] = description 
df["primary_category"] = primary_category
df["description"] = df["description"].str.replace('\n', '') 
df["primary_category"] = df["primary_category"].str.replace('\n', '')

df.head(3)

Unnamed: 0,description,primary_category
0,alisha solid woman cycling short cotton lycra ...,Clothing
1,fabhomedecor fabric double sofa bed finish col...,Furniture
2,belly sandal wedge heel casuals belly price ma...,Footwear


In [3]:
# total no of words in desciption column 

df["description"].str.split().str.len().sum()

738577

In [4]:
# splitting the X, y into X_train, X_test, y_train & y_test

X = np.array(df["description"])
y = np.array(df["primary_category"])

# le = LabelEncoder()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=73)
# y_train = le.fit_transform(y_train)
# y_test = le.transform(y_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(15732,)
(15732,)
(3934,)
(3934,)


In [5]:
# returns X_train_features and X_test_features after applying CountVectorizer on both 

def CountVec(X_train, X_test):
    count_vec = CountVectorizer()
    X_train_features = count_vec.fit_transform(X_train)
    X_test_features = count_vec.transform(X_test)
    return (X_train_features, X_test_features)

In [6]:
# returns X_train_features and X_test_features after applying TFIDFVectorizer on both 

def TFIDF(X_train, X_test):
    tfidf_vec = TfidfVectorizer()
    X_train_features = tfidf_vec.fit_transform(X_train)
    X_test_features = tfidf_vec.transform(X_test)
    return (X_train_features, X_test_features)

In [7]:
# returns a pipeline of CountVectorizer, followed by TFIDF Vectorizer and then followed by the ML model 

def CountVec_TFIDF_Pipeline(model):
    return Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', model)])

In [12]:
# Multinomial Naive Bayes Classifier with Count Vectorizer

X_train_features, X_test_features = CountVec(X_train, X_test)
clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha = 0.01)),
                     ])
clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=0.01))])

In [13]:
y_pred = clf.predict(X_test)

y_pred

array(['Jewellery', 'Computers', 'Beauty and Personal Care', ...,
       'Tools & Hardware', 'Jewellery', 'Home Decor & Festive Needs'],
      dtype='<U33')

In [14]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.9616166751398069
                                   precision    recall  f1-score   support

                       Automotive       0.95      0.97      0.96       193
                        Baby Care       0.97      0.80      0.88       121
            Bags, Wallets & Belts       0.89      0.91      0.90        43
         Beauty and Personal Care       0.97      0.93      0.95       135
            Cameras & Accessories       1.00      0.80      0.89        15
                         Clothing       0.97      1.00      0.98      1218
                        Computers       0.87      0.89      0.88       126
                          Eyewear       1.00      1.00      1.00         1
                         Footwear       1.00      0.94      0.97       259
                        Furniture       0.97      1.00      0.99        39
                           Gaming       0.50      0.70      0.58        10
Health & Personal Care Appliances       1.00      0.71      0.83         7
     

In [55]:
clf = MultinomialNB(alpha = 1)
X_features = CountVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.9202691860395225


In [62]:
y_train_pred = clf.predict(X_train_features)

y_train_pred

array(['Beauty and Personal Care', 'Pens & Stationery', 'Clothing', ...,
       'Home Decor & Festive Needs', 'Watches', 'Jewellery'], dtype='<U33')

In [63]:
print(accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))

0.9876048817696415
                                   precision    recall  f1-score   support

                       Automotive       1.00      0.99      0.99       819
                        Baby Care       0.87      0.93      0.90       362
            Bags, Wallets & Belts       0.97      1.00      0.98       222
         Beauty and Personal Care       1.00      0.99      1.00       575
            Cameras & Accessories       1.00      1.00      1.00        67
                         Clothing       1.00      0.99      0.99      4979
                        Computers       0.99      0.97      0.98       452
                          Eyewear       1.00      1.00      1.00         9
                         Footwear       1.00      1.00      1.00       968
                        Furniture       0.96      1.00      0.98       141
                           Gaming       0.83      1.00      0.91        25
Health & Personal Care Appliances       1.00      1.00      1.00        36
     

In [31]:
# Multinomial Naive Bayes Algorithm with Tfidf Vectorizer

X_train_features, X_test_features = TFIDF(X_train, X_test)
clf = MultinomialNB(alpha = 0.001)
clf.fit(X_train_features, y_train)

MultinomialNB(alpha=0.001)

In [32]:
y_pred = clf.predict(X_test_features)

y_pred

array(['Jewellery', 'Computers', 'Beauty and Personal Care', ...,
       'Tools & Hardware', 'Jewellery', 'Home Decor & Festive Needs'],
      dtype='<U33')

In [33]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

                                   precision    recall  f1-score   support

                       Automotive       0.95      0.96      0.95       193
                        Baby Care       0.98      0.80      0.88       121
            Bags, Wallets & Belts       0.89      0.91      0.90        43
         Beauty and Personal Care       0.96      0.93      0.95       135
            Cameras & Accessories       1.00      0.87      0.93        15
                         Clothing       0.96      1.00      0.98      1218
                        Computers       0.88      0.88      0.88       126
                          Eyewear       1.00      1.00      1.00         1
                         Footwear       1.00      0.91      0.95       259
                        Furniture       1.00      1.00      1.00        39
                           Gaming       0.57      0.80      0.67        10
Health & Personal Care Appliances       1.00      0.86      0.92         7
                   Home 

In [42]:
y_train_pred = clf.predict(X_train_features)

y_train_pred

array(['Beauty and Personal Care', 'Pens & Stationery', 'Clothing', ...,
       'Home Decor & Festive Needs', 'Watches', 'Jewellery'], dtype='<U33')

In [43]:
print(classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))

                                   precision    recall  f1-score   support

                       Automotive       1.00      0.99      0.99       819
                        Baby Care       0.90      0.95      0.92       362
            Bags, Wallets & Belts       0.98      1.00      0.99       222
         Beauty and Personal Care       1.00      1.00      1.00       575
            Cameras & Accessories       1.00      1.00      1.00        67
                         Clothing       1.00      0.99      0.99      4979
                        Computers       0.99      0.97      0.98       452
                          Eyewear       1.00      1.00      1.00         9
                         Footwear       1.00      1.00      1.00       968
                        Furniture       0.97      1.00      0.98       141
                           Gaming       0.86      1.00      0.93        25
Health & Personal Care Appliances       1.00      1.00      1.00        36
                   Home 

In [44]:
# GradientBoostingClassifier Algorithm 

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', GradientBoostingClassifier(n_estimators=100)),
                     ])

text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', GradientBoostingClassifier())])

In [45]:
y_pred = text_clf.predict(X_test)

y_pred

array(['Jewellery', 'Computers', 'Beauty and Personal Care', ...,
       'Tools & Hardware', 'Jewellery', 'Home Decor & Festive Needs'],
      dtype=object)

In [46]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

                                   precision    recall  f1-score   support

                       Automotive       0.97      0.98      0.98       193
                        Baby Care       0.95      0.85      0.90       121
            Bags, Wallets & Belts       0.98      0.93      0.95        43
         Beauty and Personal Care       0.96      0.93      0.95       135
            Cameras & Accessories       1.00      0.93      0.97        15
                         Clothing       0.96      0.99      0.98      1218
                        Computers       0.97      0.84      0.90       126
                          Eyewear       0.33      1.00      0.50         1
                         Footwear       1.00      0.98      0.99       259
                        Furniture       0.97      0.97      0.97        39
                           Gaming       0.67      0.80      0.73        10
Health & Personal Care Appliances       0.83      0.71      0.77         7
                   Home 

In [47]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier()),
                     ])

text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', KNeighborsClassifier())])

In [48]:
y_pred = text_clf.predict(X_test)

y_pred

array(['Jewellery', 'Computers', 'Computers', ..., 'Tools & Hardware',
       'Jewellery', 'Home Decor & Festive Needs'], dtype=object)

In [49]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

                                   precision    recall  f1-score   support

                       Automotive       1.00      0.85      0.92       193
                        Baby Care       0.94      0.72      0.81       121
            Bags, Wallets & Belts       0.94      0.72      0.82        43
         Beauty and Personal Care       1.00      0.61      0.76       135
            Cameras & Accessories       0.93      0.87      0.90        15
                         Clothing       0.99      0.79      0.88      1218
                        Computers       0.12      0.97      0.21       126
                          Eyewear       1.00      1.00      1.00         1
                         Footwear       0.99      0.68      0.81       259
                        Furniture       1.00      0.90      0.95        39
                           Gaming       0.50      0.50      0.50        10
Health & Personal Care Appliances       1.00      0.57      0.73         7
                   Home 

  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
count_vec = CountVectorizer()

X_train_features = count_vec.fit_transform(X_train)

X_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [51]:
X_test_features = count_vec.transform(X_test)

X_test_features

<3934x13595 sparse matrix of type '<class 'numpy.int64'>'
	with 82664 stored elements in Compressed Sparse Row format>

In [52]:
clf = BernoulliNB()
clf.fit(X_train_features, y_train)

BernoulliNB()

In [53]:
y_pred = text_clf.predict(X_test)

y_pred

array(['Jewellery', 'Computers', 'Computers', ..., 'Tools & Hardware',
       'Jewellery', 'Home Decor & Festive Needs'], dtype=object)

In [54]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


                                   precision    recall  f1-score   support

                       Automotive       1.00      0.85      0.92       193
                        Baby Care       0.94      0.72      0.81       121
            Bags, Wallets & Belts       0.94      0.72      0.82        43
         Beauty and Personal Care       1.00      0.61      0.76       135
            Cameras & Accessories       0.93      0.87      0.90        15
                         Clothing       0.99      0.79      0.88      1218
                        Computers       0.12      0.97      0.21       126
                          Eyewear       1.00      1.00      1.00         1
                         Footwear       0.99      0.68      0.81       259
                        Furniture       1.00      0.90      0.95        39
                           Gaming       0.50      0.50      0.50        10
Health & Personal Care Appliances       1.00      0.57      0.73         7
                   Home 

In [20]:
clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=5)),
                     ])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Computers' ... 'Tools & Hardware' 'Jewellery'
 'Home Decor & Festive Needs']
0.7534316217590239


In [21]:
y_train_pred = clf.predict(X_train)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.782418001525553


In [22]:
clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier()),
                     ])
scores = cross_validate(clf, X, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.58827413057891


In [29]:
# SVM Classifier with Count Vectorizer

from sklearn import svm

X_train_features, X_test_features = TFIDF(X_train, X_test)
clf = svm.SVC()
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9659379766141332


In [30]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9951690821256038


In [28]:
clf = svm.SVC()
X_features = CountVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.8636738449869064
