<a href="https://colab.research.google.com/github/iJianHuang/GenAI/blob/main/NLP_Basics_TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [5]:
v = TfidfVectorizer()
v.fit(corpus)
transform_output = v.transform(corpus)

In [6]:
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [7]:
i = v.vocabulary_.get('thor')
print(i)
v.idf_[i]

25


np.float64(2.386294361119891)

In [8]:
# Print the idf of each word

all_feature_names = v.get_feature_names_out()

for word in all_feature_names:

  indx = v.vocabulary_.get(word)

  #get the score
  idf_score = v.idf_[indx]

  print(f"{word}: {idf_score}")


already: 2.386294361119891
am: 2.386294361119891
amazon: 2.386294361119891
and: 2.386294361119891
announcing: 1.2876820724517808
apple: 2.386294361119891
are: 2.386294361119891
ate: 2.386294361119891
biryani: 2.386294361119891
dot: 2.386294361119891
eating: 1.9808292530117262
eco: 2.386294361119891
google: 2.386294361119891
grapes: 2.386294361119891
iphone: 2.386294361119891
ironman: 2.386294361119891
is: 1.1335313926245225
loki: 2.386294361119891
microsoft: 2.386294361119891
model: 2.386294361119891
new: 1.2876820724517808
pixel: 2.386294361119891
pizza: 2.386294361119891
surface: 2.386294361119891
tesla: 2.386294361119891
thor: 2.386294361119891
tomorrow: 1.2876820724517808
you: 2.386294361119891


In [9]:
transform_output.shape

(7, 28)

In [10]:
# Print the transformed output from tf-idf
print(transform_output.toarray())

[[0.24266547 0.         0.         0.         0.         0.
  0.         0.24266547 0.         0.         0.40286636 0.
  0.         0.         0.         0.24266547 0.11527033 0.24266547
  0.         0.         0.         0.         0.72799642 0.
  0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.30652086 0.5680354
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5680354  0.         0.26982671 0.
  0.         0.         0.30652086 0.         0.         0.
  0.         0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.26982671 0.
  0.         0.5680354  0.30652086 0.         0.         0.
  0.5680354  0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.

#### **Custom Use case**

- E-commerce data
- 4 labels: Household, Electronics, Clothing & Books
- Task is to create a classification model that can predict a given description of a product and classify them as one of the labels using TfIdf vectorization technique

In [14]:
df = pd.read_csv('/content/Ecommerce_data.csv')

In [15]:
df.head(5)

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [16]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,6000
Electronics,6000
Clothing & Accessories,6000
Books,6000


In [17]:
df.shape

(24000, 2)

In [32]:
label_map = {
    'Household': 0,
    'Electronics': 1,
    'Clothing & Accessories': 2,
    'Books': 3
}
print (label_map)
label_map_swapped = dict((v,k) for k,v in label_map.items())
print (label_map_swapped)

{'Household': 0, 'Electronics': 1, 'Clothing & Accessories': 2, 'Books': 3}
{0: 'Household', 1: 'Electronics', 2: 'Clothing & Accessories', 3: 'Books'}


In [27]:

df['label_num'] = df['label'].map(label_map)

In [28]:
df.head(5)

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


#### **Train Test Split**

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Text, df.label_num, test_size=0.2)

In [21]:
len(X_train)

19200

In [22]:
len(X_test)

4800

#### **Tfidf Vectorizer**

In [23]:
tf = TfidfVectorizer()

X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)

#### **Classification Model**

In [24]:
clf = DecisionTreeClassifier()
clf.fit(X_train_tf,y_train)

y_pred = clf.predict(X_test_tf)

In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1214
           1       0.95      0.95      0.95      1130
           2       0.96      0.96      0.96      1233
           3       0.97      0.96      0.96      1223

    accuracy                           0.95      4800
   macro avg       0.95      0.95      0.95      4800
weighted avg       0.95      0.95      0.95      4800



#### **Testing on a new data**

In [34]:
#msg = ["Indira Designer Women's Art Mysore Silk Saree With Blouse Piece (Star-Red) This Saree Is Of Art Mysore Silk & Comes With Blouse Piece."]
msg = ["Satyajit's designer women art saree silk blouse piece, saree with pipili chandua work"]
msg_tf = tf.transform(msg)

result_index = clf.predict(msg_tf)[0]
print(label_map_swapped[result_index])

Clothing & Accessories
