In [52]:
import pandas as pd
import fasttext
from sklearn.model_selection import train_test_split
import re

In [53]:
df = pd.read_csv("../datasets/ecommerceDataset.csv", header=None, names=['category', 'description'])
df.head()

Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [54]:
df.category.value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: category, dtype: int64

In [55]:
print(df.isna().any())
df = df.dropna()
print(df.isna().any())

category       False
description     True
dtype: bool
category       False
description    False
dtype: bool


In [56]:
df['category'].replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)
df.category.unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [57]:
def preprocess_category(text):
    text = text.replace(', ', ' __label__')
    return f"__label__{text}"

In [58]:
df['category'] = df['category'].map(preprocess_category)

In [59]:
def preprocess_description(text):
    text = re.sub(r'[^\w\s\']', '', text)
    return text.strip().lower()

In [60]:
df['description'] = df['description'].map(preprocess_description)

In [61]:
df.head()

Unnamed: 0,category,description
0,__label__Household,paper plane design framed wall hanging motivat...
1,__label__Household,saf 'floral' framed painting wood 30 inch x 10...
2,__label__Household,saf 'uv textured modern art print framed' pain...
3,__label__Household,saf flower print framed painting synthetic 135...
4,__label__Household,incredible gifts india wooden happy birthday u...


In [62]:
train, test = train_test_split(df, test_size=0.2)

In [63]:
train.to_csv("comm.train", columns=['category', 'description'], sep='\t', index=False, header=False)
train.to_csv("comm.test", columns=['category', 'description'], sep='\t', index=False, header=False)

In [64]:
model = fasttext.train_supervised(input="datasets/comm.train")
model.test("datasets/comm.test")

Read 4M words
Number of words:  114421
Number of labels: 4
Progress: 100.0% words/sec/thread: 2092675 lr:  0.000000 avg.loss:  0.190736 ETA:   0h 0m 0s


(40339, 0.9824735367758248, 0.9824735367758248)

In [70]:
model.predict("integral calculus for beginners about the author an editorial team of highly skilled professionals at arihant works hand in glove to ensure that the students receive the best and accurate content through our books from inception till the book comes out from print the whole team comprising of authors editors proofreaders and various other involved in shaping the book put in their best efforts knowledge and experience to produce the rigorous content the students receive keeping in mind the specific requirements of the students and various examinations the carefully designed exam oriented and exam ready content comes out only after intensive research and analysis the experts have adopted whole new style of presenting the content which is easily understandable leaving behind the old traditional methods which once used to be the most effective they have been developing the latest content  updates as per the needs and requirements of the students making our books a hallmark for quality and reliability for the past 15 years")

(('__label__Books',), array([0.9304269]))

In [69]:
model.get_nearest_neighbors("computer")

[(0.9967069625854492, 'cd'),
 (0.9931968450546265, 'mount'),
 (0.9917581677436829, 'games'),
 (0.9912920594215393, 'reader'),
 (0.9900073409080505, 'general'),
 (0.9894997477531433, 'capture'),
 (0.9887290000915527, 'electronics'),
 (0.9886495471000671, 'engine'),
 (0.9883531332015991, 'software'),
 (0.9881577491760254, 'notebook')]