In [1]:
import string
import pickle
from IPython.display import display

# data manipulation & vizualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn imports
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, StackingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, \
f1_score, recall_score, roc_auc_score, precision_score, make_scorer
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [2]:
# reading the corpus of cleaned data and asigning each column to the new dataframe

with open('clean-dataset/corpus.txt', 'rt', encoding='utf-8') as file:
    description = list(file.readlines())
    
with open('clean-dataset/final-categories.txt', 'rt') as file:
    primary_category = list(file.readlines())
    
df = pd.DataFrame()
df["description"] = description 
df["primary_category"] = primary_category
df["description"] = df["description"].str.replace('\n', '') 
df["primary_category"] = df["primary_category"].str.replace('\n', '')

df.head(3)

Unnamed: 0,description,primary_category
0,alisha solid woman cycling short cotton lycra ...,Clothing
1,fabhomedecor fabric double sofa bed finish col...,Furniture
2,belly sandal wedge heel casuals belly price ma...,Footwear


In [None]:
# total no of words in desciption column 

df["description"].str.split().str.len().sum()

In [4]:
# splitting the X, y into X_train, X_test, y_train & y_test

X = np.array(df["description"])
y = np.array(df["primary_category"])

# le = LabelEncoder()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=73)
# y_train = le.fit_transform(y_train)
# y_test = le.transform(y_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(15732,)
(15732,)
(3934,)
(3934,)


# Feature Extraction Functions

* Count Vectorizer
* TF-IDF Vectorizer
* Pipeline of Count Vectorizer followed by TF-IDF Vectorizer followed by the model

In [5]:
# returns X_train_features and X_test_features after applying CountVectorizer on both 

def CountVec(X_train, X_test):
    count_vec = CountVectorizer()
    X_train_features = count_vec.fit_transform(X_train)
    X_test_features = count_vec.transform(X_test)
    return (X_train_features, X_test_features)

In [6]:
# returns X_train_features and X_test_features after applying TFIDFVectorizer on both 

def TFIDF(X_train, X_test):
    tfidf_vec = TfidfVectorizer()
    X_train_features = tfidf_vec.fit_transform(X_train)
    X_test_features = tfidf_vec.transform(X_test)
    return (X_train_features, X_test_features)

In [7]:
# returns a pipeline of CountVectorizer, followed by TFIDF Vectorizer and then followed by the ML model 

def CountVec_TFIDF_Pipeline(model):
    return Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', model)])

# Multinomial Naive Bayes

In [24]:
# Multinomial Naive Bayes Classifier with Count Vectorizer

X_train_features, X_test_features = CountVec(X_train, X_test)
clf = MultinomialNB(alpha = 0.01)
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

MultinomialNB(alpha=0.01)

In [27]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

array(['Beauty and Personal Care', 'Pens & Stationery', 'Clothing', ...,
       'Home Decor & Festive Needs', 'Watches', 'Jewellery'], dtype='<U33')

In [29]:
clf = MultinomialNB(alpha = 0.01)
X_features = TfidfVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.9202181920839543


In [13]:
# Multinomial Naive Bayes Algorithm with Tfidf Vectorizer

X_train_features, X_test_features = TFIDF(X_train, X_test)
clf = MultinomialNB(alpha = 0.001)
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

MultinomialNB(alpha=0.001)

In [17]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

array(['Beauty and Personal Care', 'Pens & Stationery', 'Clothing', ...,
       'Home Decor & Festive Needs', 'Watches', 'Jewellery'], dtype='<U33')

In [23]:
clf = MultinomialNB(alpha = 0.01)
X_features = TfidfVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.9202181920839543


# Support Vector Machines (SVM)

In [None]:
# SVM Classifier with Count Vectorizer

X_train_features, X_test_features = CountVec(X_train, X_test)
clf = svm.SVC()
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [None]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

In [None]:
clf = svm.SVC()
X_features = CountVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

In [None]:
# SVM Classifier with Count Vectorizer

X_train_features, X_test_features = TFIDF(X_train, X_test)
clf = svm.SVC()
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [None]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

In [None]:
clf = svm.SVC()
X_features = TfidfVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

# Random Forest Classifier

In [None]:
# Random Forest Classifier with Count Vectorizer

X_train_features, X_test_features = CountVec(X_train, X_test)
clf = RandomForestClassifier()
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [None]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

In [None]:
clf = RandomForestClassifier()
X_features = CountVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

In [None]:
# Random Forest Classifier with TF-IDF Vectorizer

from sklearn import svm

X_train_features, X_test_features = TFIDF(X_train, X_test)
clf = RandomForestClassifier()
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [None]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

In [None]:
clf = RandomForestClassifier()
X_features = TfidfVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

# KNeighbours Classifier

In [None]:
clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=5)),
                     ])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [None]:
y_train_pred = clf.predict(X_train)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

In [None]:
clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier()),
                     ])
scores = cross_validate(clf, X, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

# Gradient Boosting Classifier

In [33]:
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', GradientBoostingClassifier(n_estimators=100)),
                ])
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9572953736654805


In [35]:
y_train_pred = clf.predict(X_train)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9987287058225274


In [36]:
clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', GradientBoostingClassifier(n_estimators=100)),
                     ])
scores = cross_validate(clf, X, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.9080142721029715
