## In this notebook we'll be using TF-IDF to predict which category a certain review belongs to

### But first, a simple example

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "Juan likes webscraping, Mathilde likes webscraping, Ilkay hates webscraping",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new bing version tomorrow",
    "Amazon is announcing new warehouse tomorrow",
    "I am eating pierogi and you are eating joloff rice"
]

In [2]:
v = TfidfVectorizer()
v.fit(corpus)
transform_output = v.transform(corpus)

In [3]:
v.vocabulary_

{'juan': 14,
 'likes': 15,
 'webscraping': 27,
 'mathilde': 16,
 'ilkay': 10,
 'hates': 9,
 'apple': 4,
 'is': 12,
 'announcing': 3,
 'new': 19,
 'iphone': 11,
 'tomorrow': 24,
 'tesla': 23,
 'model': 18,
 'google': 8,
 'pixel': 21,
 'microsoft': 17,
 'bing': 6,
 'version': 25,
 'amazon': 1,
 'warehouse': 26,
 'am': 0,
 'eating': 7,
 'pierogi': 20,
 'and': 2,
 'you': 28,
 'are': 5,
 'joloff': 13,
 'rice': 22}

In [4]:
all_feature_names = v.get_feature_names_out()

for word in all_feature_names:
    
    
    indx = v.vocabulary_.get(word)
    
    
    idf_score = v.idf_[indx]
    
    print(f"{word} : {idf_score}")

am : 2.386294361119891
amazon : 2.386294361119891
and : 2.386294361119891
announcing : 1.2876820724517808
apple : 2.386294361119891
are : 2.386294361119891
bing : 2.386294361119891
eating : 2.386294361119891
google : 2.386294361119891
hates : 2.386294361119891
ilkay : 2.386294361119891
iphone : 2.386294361119891
is : 1.2876820724517808
joloff : 2.386294361119891
juan : 2.386294361119891
likes : 2.386294361119891
mathilde : 2.386294361119891
microsoft : 2.386294361119891
model : 2.386294361119891
new : 1.2876820724517808
pierogi : 2.386294361119891
pixel : 2.386294361119891
rice : 2.386294361119891
tesla : 2.386294361119891
tomorrow : 1.2876820724517808
version : 2.386294361119891
warehouse : 2.386294361119891
webscraping : 2.386294361119891
you : 2.386294361119891


In [5]:
print(transform_output.toarray())

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.24253563 0.24253563 0.
  0.         0.         0.24253563 0.48507125 0.24253563 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.72760688 0.        ]
 [0.         0.         0.         0.30333014 0.56212245 0.
  0.         0.         0.         0.         0.         0.56212245
  0.30333014 0.         0.         0.         0.         0.
  0.         0.30333014 0.         0.         0.         0.
  0.30333014 0.         0.         0.         0.        ]
 [0.         0.         0.         0.30333014 0.         0.
  0.         0.         0.         0.         0.         0.
  0.30333014 0.         0.         0.         0.         0.
  0.56212245 0.30333014 0.         0.         0.         0.56212245
  0.30333014 0.         0.         0.         0.        ]
 [0.         0.         0.         0.30333014 0.         0.
  0.         0.         0.5621

## E-commerce use case

In [6]:
import pandas as pd

df = pd.read_csv("Ecommerce_data.csv")
print(df.shape)
df.head(5)

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [7]:
df['label'].value_counts()

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

In [8]:
df['label_num'] = df['label'].map({
    'Household' : 0, 
    'Books': 1, 
    'Electronics': 2, 
    'Clothing & Accessories': 3
})

#map function changes strings into intergers

#checking the results 
df.head(5)

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Text, 
    df.label_num, 
    test_size=0.2, 
    random_state=2022,
    stratify=df.label_num #stratify make sure to have the same proporcion in lables 
)

In [11]:
y_train.value_counts()

0    4800
2    4800
3    4800
1    4800
Name: label_num, dtype: int64

In [12]:
y_test.value_counts()

0    1200
2    1200
3    1200
1    1200
Name: label_num, dtype: int64

## First Model: KNN Classifier

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('KNN', KNeighborsClassifier())         
])


clf.fit(X_train, y_train)



y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [22]:
sample_test= ['this product is not working as it is described in the website. The quality is really bad and I did not get a refound']
sample_test_cv = v.transform(sample_test)

In [23]:
 clf.predict(sample_test_cv)

AttributeError: lower not found

## Second model: Naive Bayes

In [14]:
from sklearn.naive_bayes import MultinomialNB


clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Multi NB', MultinomialNB())         
])


clf.fit(X_train, y_train)



y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1200
           1       0.98      0.92      0.95      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.99      0.98      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



## Third model: Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Random Forest', RandomForestClassifier())         
])


clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.97      0.98      0.98      1200
           2       0.98      0.97      0.97      1200
           3       0.98      0.99      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800

