In [7]:
import string
import pickle
import joblib
from IPython.display import display

# data manipulation & visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn imports
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, StackingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, \
f1_score, recall_score, roc_auc_score, precision_score, make_scorer
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [8]:
# Load the cleaned corpus and its labels into a two-column dataframe.
# The two files are parallel: line i of the corpus is labelled by line i of
# the categories file, so both are read the same way and must be equally long.

# Both files are decoded explicitly as UTF-8 so the result does not depend on
# the platform's default encoding, and the trailing newline is stripped while
# reading instead of in a later pandas pass.
with open('clean-dataset/corpus.txt', 'rt', encoding='utf-8') as file:
    description = [line.rstrip('\n') for line in file]

with open('clean-dataset/final-categories.txt', 'rt', encoding='utf-8') as file:
    primary_category = [line.rstrip('\n') for line in file]

# Fail early with a readable message if the files have drifted out of sync.
if len(description) != len(primary_category):
    raise ValueError(
        "corpus.txt has {} lines but final-categories.txt has {}; "
        "the two files must be aligned line by line".format(
            len(description), len(primary_category)
        )
    )

df = pd.DataFrame({
    "description": description,
    "primary_category": primary_category,
})

df.head(3)

Unnamed: 0,description,primary_category
0,alisha solid woman cycling short cotton lycra ...,Clothing
1,fabhomedecor fabric double sofa bed finish col...,Furniture
2,belly sandal wedge heel casuals belly price ma...,Footwear


In [10]:
# Build the feature/label arrays and hold out 20% of the rows for testing.
# The split is seeded (random_state=73) so it is the same on every run.
# Labels stay as the raw category strings; no LabelEncoder step is applied.

X, y = (np.array(df[column]) for column in ("description", "primary_category"))

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=73,
)

# Shapes in the order: X_train, y_train, X_test, y_test
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(15732,)
(15732,)
(3934,)
(3934,)


# XGBoost

In [19]:
# Bag-of-words counts feeding an XGBoost classifier, both with default
# settings. The TF-IDF re-weighting step is intentionally left disabled.
# NOTE(review): y_train holds category strings; confirm the installed xgboost
# version accepts non-integer class labels.
clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    # ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
])
clf.fit(X_train, y_train)

# Persist the fitted pipeline. To reuse it instead of retraining:
#   clf = joblib.load('trained-models/xgboost.pkl')
joblib.dump(clf, 'trained-models/xgboost.pkl')

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)
print(y_pred)

# Accuracy, per-class report, then confusion matrix, in that order.
for metric in (accuracy_score, classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.974580579562786
                                   precision    recall  f1-score   support

                       Automotive       0.96      0.97      0.97       193
                        Baby Care       0.96      0.90      0.93       121
            Bags, Wallets & Belts       0.98      0.95      0.96        43
         Beauty and Personal Care       0.96      0.96      0.96       135
            Cameras & Accessories       1.00      0.80      0.89        15
                         Clothing       0.99      1.00      1.00      1218
                        Computers       0.92      0.96      0.94       126
                          Eyewear       1.00      1.00      1.00         1
                         Footwear       1.00      1.00      1.00       259
                        Furniture       1.00      1.00      1.00        39
                           Gaming     

In [20]:
# Accuracy on the training split itself, for comparison with the held-out
# accuracy printed by the previous cell.
y_train_pred = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

print(train_accuracy)
# The per-class report and confusion matrix for the training split can be
# printed the same way:
#   print(classification_report(y_train, y_train_pred))
#   print(confusion_matrix(y_train, y_train_pred))

0.9977116704805492


In [None]:
# 5-fold cross-validation of the same CountVectorizer -> XGBoost pipeline on
# the full dataset (X, y); prints the mean of the per-fold test scores.
# NOTE(review): this rebinds `clf`, replacing the pipeline fitted in the
# earlier cell with a freshly constructed one.
clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    # ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
])

scores = cross_validate(
    clf, X, y,
    cv=5,
    return_train_score=False,
)
fold_scores = scores["test_score"]
print(np.mean(fold_scores))



