# 🔷 TF-IDF (Term Frequency – Inverse Document Frequency)
TF-IDF is a more sophisticated method than Bag of Words (BoW) for text vectorization. It not only considers the frequency of words in a document but also how important or unique a word is across all documents.



💡 Intuition <br>
Words that occur frequently in a document → high TF.

But if a word appears in many documents → low IDF (less informative).

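So, because the final score is the product TF × IDF, common words like "the", "is", and "and" get down-weighted, while words that are frequent in one document but rare across the others score highest.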



In [1]:
import pandas as pd

# Read the comma-separated dataset file into a two-column DataFrame:
# the first field is named "label" and the second "message".
messages = pd.read_csv(
    'Datasets/spam.csv',
    sep=',',
    names=["label", "message"],
)

In [2]:
# NLTK pieces used by the preprocessing loop: the stop-word corpus for filtering
# tokens and the WordNet lemmatizer for normalizing them.
# NOTE(review): both rely on NLTK data packages being downloaded locally
# (presumably 'stopwords' and 'wordnet') — confirm the environment has them.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
# Single lemmatizer instance, reused for every token of every message.
wordlemmatize=WordNetLemmatizer()

In [3]:
# Turn every raw message into a cleaned, lower-cased, lemmatized string.

# Build the stop-word set once: evaluating stopwords.words('english') for
# every token reloads the word list each time and then scans it linearly.
english_stopwords=set(stopwords.words('english'))

# Keep ASCII letters only. The earlier class '[^a-zA-z]' used the range A-z,
# which also covers the characters [ \ ] ^ _ ` and therefore left them in.
non_letters=re.compile('[^a-zA-Z]')

corpus=[]
# Iterate over the column values directly so the loop does not depend on the
# DataFrame carrying a default 0..n-1 index.
for message in messages['message']:
    # Replace non-letters with spaces, lower-case, and split into tokens.
    review=non_letters.sub(' ',message).lower().split()
    # Drop stop words and lemmatize what remains.
    review=[wordlemmatize.lemmatize(word) for word in review if word not in english_stopwords]
    corpus.append(' '.join(review))

## Create TF-IDF And NGrams

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF over single words, with the vocabulary capped at 100 features.
tfidf = TfidfVectorizer(
    max_features=100,
)
# Fit on the cleaned corpus and convert the result to a dense array for display.
X = tfidf.fit_transform(corpus).toarray()

In [6]:
import numpy as np

# Widen NumPy's printing so each matrix row stays on one line, show up to 30
# items at each edge before eliding with "...", and format floats with three
# significant digits.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": lambda value: "%.3g" % value},
)
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.53, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65, 0, 0.65, 0, 0, 0, 0.26, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0.327, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.392, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0.86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.443, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.476, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0.673, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
# Refit using bigrams only (ngram_range=(2, 2)), still capped at 100 features.
# This rebinds `tfidf` and `X`, replacing the single-word results computed earlier.
tfidf = TfidfVectorizer(
    max_features=100,
    ngram_range=(2, 2),
)
X = tfidf.fit_transform(corpus).toarray()

In [8]:
# Display the fitted vocabulary: a mapping from each selected bigram to its integer index.
tfidf.vocabulary_

{'ok lar': np.int64(78),
 'wif oni': np.int64(85),
 'wkly comp': np.int64(89),
 'comp win': np.int64(46),
 'win fa': np.int64(86),
 'cup final': np.int64(52),
 'dun say': np.int64(68),
 'early hor': np.int64(69),
 'already say': np.int64(9),
 'aid patent': np.int64(6),
 'callertune caller': np.int64(24),
 'caller press': np.int64(23),
 'copy friend': np.int64(49),
 'winner valued': np.int64(88),
 'customer selected': np.int64(53),
 'claim call': np.int64(39),
 'call claim': np.int64(16),
 'claim code': np.int64(40),
 'code kl': np.int64(43),
 'colour mobile': np.int64(45),
 'camera free': np.int64(27),
 'call mobile': np.int64(21),
 'co free': np.int64(42),
 'word thank': np.int64(92),
 'wont take': np.int64(91),
 'wonderful blessing': np.int64(90),
 'date sunday': np.int64(59),
 'eh remember': np.int64(74),
 'spell name': np.int64(80),
 'ha ha': np.int64(76),
 'da stock': np.int64(58),
 'aft finish': np.int64(1),
 'alright way': np.int64(11),
 'eat slice': np.int64(71),
 'worried know