In [1]:
import pandas as pd
import spacy

In [2]:
# Read the e-commerce dataset from a CSV in the working directory and preview
# the first rows; the preview shows a Label column and a Text column.
df = pd.read_csv('ecommerceDataset.csv')
df.head()

Unnamed: 0,Label,Text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(50425, 2)

In [4]:
# Show the rows whose Text value is missing.
df.loc[df["Text"].isna()]

Unnamed: 0,Label,Text
28323,Books,
28326,Books,
28329,Books,
39330,Clothing & Accessories,


In [5]:
# Remove the rows whose Text is missing. `dropna` returns a new frame that is
# bound back to `df`, so nothing is mutated in place and the original index
# labels of the surviving rows are kept.
df = df.dropna(subset=["Text"])

In [6]:
# Rows per category, followed by the frame's shape, after the rows with
# missing Text were dropped.
print(df.Label.value_counts())
df.shape

Household                 19313
Books                     11817
Electronics               10621
Clothing & Accessories     8670
Name: Label, dtype: int64


(50421, 2)

In [9]:
# Map each category name to an integer id. `unique()` is evaluated once and
# yields the labels in order of first appearance, so ids follow that order.
lab = df.Label.unique()
d = {label: code for code, label in enumerate(lab)}
d

{'Household': 0, 'Books': 1, 'Clothing & Accessories': 2, 'Electronics': 3}

In [10]:
# Attach the integer class id for every row by looking its Label up in `d`.
df["encoded"] = df["Label"].map(d)

# Spot-check one randomly chosen row.
df.sample(n=1)

Unnamed: 0,Label,Text,encoded
32411,Clothing & Accessories,Littly Front Open Kids Thermal Top & Pyjama Se...,2


In [11]:
# Load the spaCy pipeline once at notebook level; preprocess() below reuses
# this single `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` reduced to its lemmas, joined by single spaces.

    The text is run through the notebook-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; every remaining
    token contributes its lemma, in document order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [12]:
# Clean every product text. preprocess() runs the spaCy pipeline once per
# row, so this cell's cost grows with the number of rows.
df["preprocessed"] = df["Text"].map(preprocess)

# Spot-check one randomly chosen row.
df.sample(n=1)

Unnamed: 0,Label,Text,encoded,preprocessed
18155,Household,3D PATTERN DECORATIVE WALLPAPER LATEST DESIGN ...,0,3d PATTERN DECORATIVE WALLPAPER LATEST DESIGN ...


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the encoded label keeps
# the class proportions the same in both splits, and the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["preprocessed"],
    df["encoded"],
    test_size=0.2,
    random_state=12,
    stratify=df["encoded"],
)

In [14]:
# Per-class row counts in the training and test targets, to check that the
# stratified split kept the class balance.
y_train.value_counts(), y_test.value_counts()

(0    15450
 1     9453
 3     8497
 2     6936
 Name: encoded, dtype: int64,
 0    3863
 1    2364
 3    2124
 2    1734
 Name: encoded, dtype: int64)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# TF-IDF features feeding a k-nearest-neighbours classifier, both left at
# their default settings. Step names are the keys used by
# `clf.named_steps[...]` and by `<step>__<param>` overrides, so they are
# spelled out in full.
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', KNeighborsClassifier()),
])

# Train on the training split, then predict the held-out test split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# The report's row labels are the integer ids stored in the `encoded` column.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96      3863
           1       0.95      0.95      0.95      2364
           2       0.97      0.97      0.97      1734
           3       0.97      0.93      0.95      2124

    accuracy                           0.96     10085
   macro avg       0.96      0.95      0.96     10085
weighted avg       0.96      0.96      0.96     10085



In [16]:
# Peek at the first five preprocessed texts in the test split.
X_test.head(5)

39338    STUDIO SHRINGAAR WOMEN golden POLY RAW silk SK...
31036    5 Club Morning elevate life author ROBIN SHARM...
39295    jwf Women Cotton Skirts Multicolour_Free Size ...
34039    Creature Pu Leather Wallet man multiple Card S...
34083                  Allen Solly Men Cotton Handkerchief
Name: preprocessed, dtype: object

In [20]:
# True class ids of the first ten test rows, for a side-by-side comparison
# with the first ten predictions.
pd.array(y_test.iloc[:10])

<PandasArray>
[2, 1, 2, 2, 2, 3, 0, 0, 1, 3]
Length: 10, dtype: int64

In [21]:
# Predicted class ids for the first ten test rows.
y_pred[:10]

array([2, 1, 2, 2, 2, 3, 0, 0, 1, 3], dtype=int64)