In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: seven short sentences used to demonstrate TF-IDF weighting.
# NOTE(review): "annoucing" in the Apple sentence is spelled differently from
# "announcing" in the other sentences, so it ends up as its own vocabulary
# entry — confirm whether that is intentional.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is annoucing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
    ]

In [None]:
# Learn the vocabulary and IDF weights from the corpus and vectorize it in a
# single call (same result as fit() followed by transform() on the same data).
v = TfidfVectorizer()
transform_output = v.fit_transform(corpus)

In [None]:
# Show the learned mapping from each token to its column index.
print(v.vocabulary_)

{'thor': 26, 'eating': 11, 'pizza': 23, 'loki': 18, 'is': 17, 'ironman': 16, 'ate': 8, 'already': 0, 'apple': 6, 'annoucing': 4, 'new': 21, 'iphone': 15, 'tomorrow': 27, 'tesla': 25, 'announcing': 5, 'model': 20, 'google': 13, 'pixel': 22, 'microsoft': 19, 'surface': 24, 'amazon': 2, 'eco': 12, 'dot': 10, 'am': 1, 'biryani': 9, 'and': 3, 'you': 28, 'are': 7, 'grapes': 14}


In [None]:
# Look up the IDF weight of a single token.
# Index the vocabulary directly rather than with .get(): .get() returns None
# for a token that is not in the vocabulary, and indexing a NumPy array with
# None adds an axis and returns every weight instead of raising an error.
# Direct indexing fails loudly with KeyError for an unknown token.
i = v.vocabulary_['thor']
v.idf_[i]

np.float64(2.386294361119891)

In [None]:
# Print the IDF of each word

all_feature_names = v.get_feature_names_out()

# Feature names are returned in column order, so they line up one-to-one with
# v.idf_ and no per-word vocabulary lookup is needed.
for word, idf_score in zip(all_feature_names, v.idf_):
    print(f"{word}: {idf_score}")

already: 2.386294361119891
am: 2.386294361119891
amazon: 2.386294361119891
and: 2.386294361119891
annoucing: 2.386294361119891
announcing: 1.4700036292457357
apple: 2.386294361119891
are: 2.386294361119891
ate: 2.386294361119891
biryani: 2.386294361119891
dot: 2.386294361119891
eating: 1.9808292530117262
eco: 2.386294361119891
google: 2.386294361119891
grapes: 2.386294361119891
iphone: 2.386294361119891
ironman: 2.386294361119891
is: 1.1335313926245225
loki: 2.386294361119891
microsoft: 2.386294361119891
model: 2.386294361119891
new: 1.2876820724517808
pixel: 2.386294361119891
pizza: 2.386294361119891
surface: 2.386294361119891
tesla: 2.386294361119891
thor: 2.386294361119891
tomorrow: 1.2876820724517808
you: 2.386294361119891


In [None]:
# Print the transformed output
# (toarray() converts the result to a dense array so every value is shown;
# one row per corpus sentence, one column per vocabulary token)
print(transform_output.toarray())

[[0.24266547 0.         0.         0.         0.         0.
  0.         0.         0.24266547 0.         0.         0.40286636
  0.         0.         0.         0.         0.24266547 0.11527033
  0.24266547 0.         0.         0.         0.         0.72799642
  0.         0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.51244924 0.
  0.51244924 0.         0.         0.         0.         0.
  0.         0.         0.         0.51244924 0.         0.24342231
  0.         0.         0.         0.27652569 0.         0.
  0.         0.         0.         0.27652569 0.        ]
 [0.         0.         0.         0.         0.         0.34504032
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.26606332
  0.         0.         0.56011275 0.30224568 0.         0.
  0.         0.56011275 0.         0.30224568 0.        ]
 [0.         0.         0.         0.         0.         0

#### Custom use case

* E-commerce data
* 4 labels: Household, Electronics, Clothing & Accessories, and Books
* Task: build a classification model that takes a product description and predicts which of the four labels it belongs to, using TF-IDF vectorization

In [None]:
# Location of the e-commerce dataset; change this one constant to point the
# notebook at a different copy of the file.
# NOTE(review): the default is a Google Drive path under /content/drive, which
# presumably requires Drive to be mounted first — no mount step is visible in
# this notebook; confirm.
DATA_PATH = "/content/drive/MyDrive/ColabNotebooks/GenAi/NLP/Ecommerce_data.csv"

df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check that the text and label columns loaded.
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [None]:
# Number of products under each category label.
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,6000
Electronics,6000
Clothing & Accessories,6000
Books,6000


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(24000, 2)

In [None]:
# Share of each label, expressed as a percentage of all rows.
percentage_distribution = df["label"].value_counts(normalize=True).mul(100)
print(percentage_distribution)

label
Household                 25.0
Electronics               25.0
Clothing & Accessories    25.0
Books                     25.0
Name: proportion, dtype: float64


In [None]:
# Integer class id for each category label, used as the classifier target.
LABEL_TO_NUM = {
    'Household': 0,
    'Electronics': 1,
    'Clothing & Accessories': 2,
    'Books': 3,
}

df['label_num'] = df['label'].map(LABEL_TO_NUM)

# Series.map() yields NaN for any label missing from the dictionary; fail
# loudly here instead of passing NaN targets on to the train/test split.
unmapped_labels = df.loc[df['label_num'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels without a numeric code: {list(unmapped_labels)}")

In [None]:
# Check that the new label_num column sits alongside the text label.
df.head(5)

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


## Train Test Split

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every metric computed from
# it) is identical on each run; stratify keeps the class proportions the same
# in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.label_num,
    test_size=0.2,
    random_state=42,
    stratify=df.label_num,
)

In [None]:
# Number of samples in the train and test partitions.
len(X_train), len(X_test)

(19200, 4800)

#### Tfidf Vectorizer

In [None]:
# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer to transform the test text, so the test set does not influence
# the learned vocabulary or IDF weights.
tf = TfidfVectorizer()
X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)

#### Classification Model

In [None]:
# Train a decision tree on the TF-IDF features.
# random_state is fixed because the tree's split search is randomized;
# without it, repeated runs can produce different trees and scores.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_tf, y_train)

# Predicted class ids for the held-out rows.
y_preds = clf.predict(X_test_tf)

In [None]:
# Per-class precision / recall / F1 on the held-out set. The class rows are the
# label_num codes: 0=Household, 1=Electronics, 2=Clothing & Accessories, 3=Books.
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.91      0.94      0.92      1186
           1       0.96      0.93      0.94      1195
           2       0.97      0.97      0.97      1202
           3       0.96      0.97      0.96      1217

    accuracy                           0.95      4800
   macro avg       0.95      0.95      0.95      4800
weighted avg       0.95      0.95      0.95      4800



#### Testing on new data

In [None]:
# A single unseen sentence to classify (wrapped in a list because the
# vectorizer is given a collection of documents).
msg = ["Hasseb and I want to buy new set of tuxedo for Akbar's wedding"]

# Vectorize with the already-fitted vectorizer (transform only, no refit),
# then predict its label_num code.
msg_tf = tf.transform(msg)
clf.predict(msg_tf)

array([0])