**Importing necessary libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score
import nltk
nltk.download("stopwords")
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Reading the dataset**

In [None]:
# Path to the raw CSV (looks like a Colab Google Drive mount -- adjust for other environments).
DATASET_PATH = "/content/drive/MyDrive/ecommerce.csv"

# header=None: the first line of the file is data, so pandas numbers the columns 0, 1, ...
df = pd.read_csv(DATASET_PATH, header=None)
df.head(n=1)

Unnamed: 0,0,1
0,Household,Paper Plane Design Framed Wall Hanging Motivat...


**Renaming columns**

In [None]:
# Replace the positional column labels with descriptive names:
# column 0 holds the category label, column 1 the product text.
column_names = {0: 'class', 1: 'text'}
df = df.rename(columns=column_names)
df.head(n=1)

Unnamed: 0,class,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...


**Looking for missing values**

In [None]:
# Number of missing entries in each column.
df.isna().sum()

class    0
text     1
dtype: int64

**Dropping the only missing value found**

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

**Checking for and removing duplicates**

In [None]:
# Show the frame's shape before and after removing fully duplicated rows.
shape_before = df.shape
print(shape_before)

df = df.drop_duplicates()

shape_after = df.shape
print(shape_after)

(50424, 2)
(27802, 2)


**It seems there were a lot of duplicates: 22,622 of the 50,424 rows were removed.**

**Checking for class distribution**

In [None]:
# Share of rows belonging to each category (normalize=True returns fractions, not counts).
class_shares = df['class'].value_counts(normalize=True)
class_shares

Household                 0.379973
Books                     0.225020
Clothing & Accessories    0.204086
Electronics               0.190922
Name: class, dtype: float64

**The classes are not perfectly balanced (Household makes up about 38% of the rows, the other three about 19–23% each), but the imbalance is mild.**

**Encoding the classes**

In [None]:
# Integer id assigned to each category label.
CLASS_TO_ID = {'Household': 0, 'Books': 1, "Clothing & Accessories": 2, "Electronics": 3}

# Map the labels to ids and make the integer dtype explicit. The original relied
# on replace() silently downcasting the object column to integers, which pandas
# has deprecated. With an explicit cast, a label missing from CLASS_TO_ID raises
# an error here instead of leaving a string in the target column.
# Re-running this cell is safe: already-encoded ids are left untouched.
df = df.replace({"class": CLASS_TO_ID}).astype({"class": "int64"})

**Initializing porter stemmer**

In [None]:
# Single Porter stemmer instance, reused for every token by the stemming function below.
port_stem = PorterStemmer()

**Creating a function for stemming the text**

In [None]:
# Build the stop-word lookup once. The original rebuilt the stop-word list for
# every single token and searched it linearly; a frozenset gives constant-time
# membership tests with the same result.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalize a piece of text into a space-separated string of word stems.

    Steps, in order:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop English stop words.
      4. Reduce each remaining token to its Porter stem.

    Args:
        content: The raw text. Must be a string; ``re.sub`` raises ``TypeError``
            for other types (e.g. a missing value).

    Returns:
        The stemmed tokens joined by single spaces (an empty string if no
        token survives the filtering).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

**Applying stemming on the column and saving the results in another column**

In [None]:
# Run every product text through stemming() and keep the result in a new
# 'content' column, leaving the original 'text' column untouched.
stemmed_text = df['text'].map(stemming)
df = df.assign(content=stemmed_text)

**Separating independent and dependent features**

In [None]:
# x: the stemmed text of every row; y: the encoded category label of every row.
x, y = df['content'].to_numpy(), df['class'].to_numpy()

**Initializing TFIDF vectorizer and applying it on the independent feature**

In [None]:
# Fit TF-IDF on the training rows only, so that the vocabulary and IDF weights
# are not influenced by the held-out test documents. The original fitted on
# every row before splitting, which leaks test-set statistics into the features.
#
# The split arguments below must stay identical to the train_test_split call in
# the next cell: a stratified split with the same labels, size and seed selects
# the same rows, so the rows used for fitting here are exactly the future x_train.
train_idx, _test_idx = train_test_split(
    np.arange(len(y)), test_size=0.2, stratify=y, random_state=2, shuffle=True
)

vectorizer = TfidfVectorizer()
vectorizer.fit(x[train_idx])

# Transform all rows with the training-fitted vectorizer; the next cell splits
# the resulting matrix into train and test parts.
x = vectorizer.transform(x)

**Making train and test splits for model training**

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
    shuffle=True,
)

**Model training and results evaluation**

In [None]:
# Train an XGBoost classifier (80 boosting rounds) on the training features and
# predict labels for both the training and the test rows.
xgb = XGBClassifier(n_estimators=80)
xgb.fit(x_train, y_train)
y_pred_train = xgb.predict(x_train)
y_pred_test = xgb.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but the original passed the arguments swapped; the order is now
# consistent with every other metric call in this cell.
training_data_accuracy_xgb = accuracy_score(y_train, y_pred_train)
print("Accuracy On Training Data:", training_data_accuracy_xgb)

test_data_accuracy_xgb = accuracy_score(y_test, y_pred_test)
print("Accuracy On Test Data:", test_data_accuracy_xgb)

# Count matches/mismatches with the vectorized numpy sum instead of the builtin
# sum(), which walks the array one Python object at a time.
print("Correct predictions Training Data:", np.sum(y_train == y_pred_train))
print("Correct predictions On Test Data:", np.sum(y_test == y_pred_test))

print("Incorrect predictions On Training Data:", np.sum(y_train != y_pred_train))
print("Incorrect predictions On Test Data:", np.sum(y_test != y_pred_test))

# Macro averaging: each metric is computed per class and then averaged with
# equal weight, so the larger Household class does not dominate the score.
print("F1 Score On Training Data:", f1_score(y_train, y_pred_train, average='macro'))
print("F1 Score On Test Data:", f1_score(y_test, y_pred_test, average='macro'))

print('Precision On Training Data: %.3f' % precision_score(y_train, y_pred_train, average='macro'))
print('Precision On Test Data: %.3f' % precision_score(y_test, y_pred_test, average='macro'))

print('Recall On Training Data: %.3f' % recall_score(y_train, y_pred_train, average='macro'))
print('Recall On Test Data: %.3f' % recall_score(y_test, y_pred_test, average='macro'))

Accuracy On Training Data: 0.9667730767501461
Accuracy On Test Data: 0.935623089372415
Correct predictions Training Data: 21502
Correct predictions On Test Data: 5203
Incorrect predictions On Training Data: 739
Incorrect predictions On Test Data: 358
F1 Score On Training Data: 0.9668610271023168
F1 Score On Test Data: 0.9356760604154817
Precision On Training Data: 0.969
Precision On Test Data: 0.941
Recall On Training Data: 0.965
Recall On Test Data: 0.931


**Confusion matrices of train and test data**

In [None]:
# Confusion matrices for both splits: rows are the true classes, columns the
# predicted classes, in label order 0..3.
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

# The first label previously read "Test Data" although it prints the training matrix.
print("Confusion matrix Of Training Data:\n", cm_train)
print("Confusion matrix Of Test Data:\n", cm_test)

Confusion matrix Of Test Data:
 [[8226  127   42   56]
 [ 117 4843   20   25]
 [  39   28 4466    6]
 [ 197   77    5 3967]]
Confusion matrix Of Test Data:
 [[2012   53   21   27]
 [  66 1159   11   15]
 [  25   12 1095    3]
 [  92   22   11  937]]


**Taking user input and making predictions**

In [None]:
# Display names for the integer class ids used when the labels were encoded
# (0 = Household, 1 = Books, 2 = Clothing & Accessories, 3 = Electronics).
CATEGORY_NAMES = {
    0: 'household',
    1: 'books',
    2: 'clothing & accessories',
    3: 'electronics',
}

user_input = input("Please enter the text: ")

# Put the text through the same preprocessing as the training data: stem it,
# then vectorize it with the already-fitted TF-IDF vectorizer.
user_input_series = pd.Series([user_input], name='text')
user_input_df = user_input_series.to_frame()
user_input_df['content'] = user_input_df['text'].apply(stemming)

input_x = user_input_df['content'].values
input_x = vectorizer.transform(input_x)

prediction = xgb.predict(input_x)

# predict() returns one label per input row. Take the single label out
# explicitly instead of comparing the whole array against scalars in an
# if/elif chain, and report an unexpected label rather than printing nothing.
predicted_label = int(prediction[0])
category = CATEGORY_NAMES.get(predicted_label)
if category is None:
    print('Unexpected class label predicted: %d' % predicted_label)
else:
    print('This text corresponds to the %s category.' % category)

Please enter the text: Penguin Essentials My Family and Other Animals Review A bewitching book (Sunday Times)Durrell has an uncanny knack of discovering human as well as animal eccentricities (Sunday Telegraph) About the Author Gerald Durrell was born in Jamshedpur, India, in 1925. He returned to England in 1928 before settling on the island of Corfu with his family. In 1945 he joined the staff of Whipsnade Park as a student keeper, and in 1947 he led his first animal-collecting expedition to the Cameroons. He later undertook numerous further expeditions, visiting Paraguay, Argentina, Sierra Leone, Mexico, Mauritius, Assam and Madagascar. His first television programme, Two in the BushÂ¸ which documented his travels to New Zealand, Australia and Malaya was made in 1962; he went on to make seventy programmes about his trips around the world. In 1959 he founded the Jersey Zoological Park, and in 1964 he founded the Jersey Wildlife Preservation Trust. He was awarded the OBE in 1982. Encou