# Term Frequency-Inverse Document Frequency (TF-IDF)

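Definition: TF-IDF is a numerical statistic that reflects how important a term is to a document within a collection (corpus). A term's weight increases with how often it appears in the document (term frequency) and decreases with how many documents in the corpus contain it (inverse document frequency). It is often used as a weighting factor in information retrieval and text mining.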

In [1]:
# import libraries
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources requested by this notebook, in the same order as
# before. "stopwords" backs stopwords.words(...) and "wordnet" backs
# WordNetLemmatizer.
# NOTE(review): "punkt" is not used by the cells visible here (text is split on
# whitespace) — keep it only if another cell tokenizes with NLTK.
for resource_name in ("stopwords", "wordnet", "punkt"):
    nltk.download(resource_name)

In [4]:
# Read the CSV and discard the three unnamed columns in a single chained
# expression (no in-place mutation), then give the two remaining columns
# descriptive names.
data = pd.read_csv("data/spam.csv").drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
)
data.columns = ["label", "text"]

# Preview the first rows.
data.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Build the stop-word lookup and the lemmatizer ONCE. Previously the stop-word
# list was re-read for every single word and a new lemmatizer was created per
# word; a set also gives constant-time membership tests.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_message(message):
    """Normalize one raw message for vectorization.

    Steps: lowercase, split on whitespace, drop English stop words, then
    lemmatize each remaining token with WordNet.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    tokens = message.lower().split()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return " ".join(kept)


# One cleaned string per row of data["text"], in the original row order.
corpus = [preprocess_message(message) for message in data["text"]]

# Show only a short preview instead of dumping every document to the output.
print(f"{len(corpus)} documents; first 5:")
print(corpus[:5])

from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned corpus and transform
# it into a document-term matrix (one row per document, one column per term).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# fit_transform returns a SciPy sparse matrix. Calling X.toarray() on the whole
# matrix allocates a dense (documents x vocabulary) array just to print it, so
# report the shape and densify only the first few rows for inspection.
print(X.shape)
print(X[:5].toarray())
print(vectorizer.get_feature_names_out())