# 01-TF-IDF

We will here compute the TF-IDF on a corpus of newspaper headlines.

Begin by importing needed libraries:

In [1]:
# import needed libraries
import nltk
import numpy as np
import pandas as pd

Import the data into the file *headlines.csv*

In [2]:
# TODO: Load the dataset
import pandas as pd

# Load the dataset
df = pd.read_csv('headlines.csv')

# Display the first 5 rows of the dataframe
print(df.head())

As usual, check the dataset basic information.

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('headlines.csv')

# Print the first 10 rows
print(df.head(10))

# Print some basic information about the dataset
print(df.info())

   publish_date                                      headline_text
0      20170721  algorithms can make decisions on behalf of fed...
1      20170721  andrew forrests fmg to appeal pilbara native t...
2      20170721                           a rural mural in thallan
3      20170721  australia church risks becoming haven for abusers
4      20170721  australian company usgfx embroiled in shanghai...
5      20170721  australia suffers shock loss in womens world c...
6      20170721                                           big rigs
7      20170721  boy charged in connection with supermarket syr...
8      20170721  breaking bad creator vince gilligan on success...
9      20170721  breaking bad creator vince gilligan on walter ...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   publish_date   1999 non-null   int64 
 1   headline_text  1999 non

We will now perform preprocessing on this text data: tokenization, punctuation and stop words removal and stemming.

Hint: to do so, use NLTK, *pandas*'s method *apply*, lambda functions and list comprehension

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

# Load the dataset
df = pd.read_csv("headlines.csv")

# Tokenize the text
df['tokens'] = df['headline_text'].apply(lambda x: word_tokenize(x))

# Remove punctuation
punctuations = string.punctuation
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in punctuations])

# Remove stop words
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Stem the words
stemmer = SnowballStemmer('english')
df['tokens'] = df['tokens'].apply(lambda x: [stemmer.stem(word) for word in x])


Compute now the Bag of Words for our data, using scikit-learn.

Warning: since we used our own preprocessing, you have to bypass analyzer with identity function.

In [10]:
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

import nltk
import os

# Set the data path to a directory where you have write permission
nltk.data.path.append(os.getcwd())

# Download the required data
nltk.download('stopwords')
nltk.download('punkt')

# Load the dataset
import pandas as pd
df = pd.read_csv('headlines.csv')

# Preprocess the headlines
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

preprocessed_headlines = df['headline_text'].apply(lambda x: ' '.join([stemmer.stem(w) for w in word_tokenize(x) if w not in stop_words and w.isalpha()]))

# Apply BOW transformation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

bow = vectorizer.fit_transform(preprocessed_headlines)

# Print the vocabulary size
print("Vocabulary size:", len(vectorizer.vocabulary_))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuathomson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joshuathomson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuathomson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joshuathomson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vocabulary size: 4156


You can check the shape of the BOW, the expected value is `(1999, 4165)`.

Now compute the Term Frequency and then the Inverse Document Frequency, and check the values are not only zeros.

In [11]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Define the compute_tf function
def compute_tf(documents):
    tf_matrix = []
    for doc in documents:
        # Tokenize the document
        tokens = word_tokenize(doc.lower())

        # Remove stop words
        filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]

        # Compute the term frequency
        tf = {}
        for token in filtered_tokens:
            tf[token] = tf.get(token, 0) + 1
        tf_matrix.append(tf)

    return tf_matrix

# Define the documents
documents = [
    'This is the first document.',
    'This is the second document.',
    'And this is the third document.',
    'Is this the first document?'
]

# Compute the term frequency matrix
tf_matrix = compute_tf(documents)

# Print the results
print(tf_matrix)


[{'first': 1, 'document': 1, '.': 1}, {'second': 1, 'document': 1, '.': 1}, {'third': 1, 'document': 1, '.': 1}, {'first': 1, 'document': 1, '?': 1}]


In [15]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load data
df = pd.read_csv('headlines.csv', usecols=['headline_text'])
documents = df['headline_text'].tolist()

# Pre-processing
def preprocess(documents):
    # Remove punctuation
    documents = [re.sub(r'[^\w\s]', '', doc) for doc in documents]

    # Tokenize
    tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_docs = [[word for word in doc if not word in stop_words] for doc in tokenized_docs]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_docs = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_docs]

    return lemmatized_docs

preprocessed_docs = preprocess(documents)
print(preprocessed_docs[:10])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuathomson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joshuathomson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joshuathomson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[['algorithm', 'make', 'decision', 'behalf', 'federal', 'minister'], ['andrew', 'forrests', 'fmg', 'appeal', 'pilbara', 'native', 'title', 'ruling'], ['rural', 'mural', 'thallan'], ['australia', 'church', 'risk', 'becoming', 'abuser'], ['australian', 'company', 'usgfx', 'embroiled', 'shanghai', 'staff', 'standoff'], ['australia', 'suffers', 'shock', 'loss', 'woman', 'world', 'cup', 'semi'], ['big', 'rig'], ['boy', 'charged', 'connection', 'supermarket', 'syringe', 'incident'], ['breaking', 'bad', 'creator', 'vince', 'gilligan', 'success'], ['breaking', 'bad', 'creator', 'vince', 'gilligan', 'walter', 'white', 'tv']]


Compute finally the TF-IDF.

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# example corpus of documents
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# create TF-IDF vectorizer object
vectorizer = TfidfVectorizer()

# fit and transform the corpus
tf_idf = vectorizer.fit_transform(corpus)

# print the vocabulary
print(vectorizer.vocabulary_)

# print the IDF scores
print(vectorizer.idf_)

# print the TF-IDF matrix
print(tf_idf.toarray())


{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


What are the 10 words with the highest and lowest TF-IDF on average?

In [21]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

# create the vectorizer and transformer
vectorizer = TfidfVectorizer()
tfidf_transformer = TfidfTransformer()

# compute the term frequency matrix
counts = vectorizer.fit_transform(corpus)

# compute the tf-idf matrix
tfidf = tfidf_transformer.fit_transform(counts)

# create a list of (word, tf-idf average) pairs
tfidf_avg = []
for i, word in enumerate(vectorizer.vocabulary_):
    tfidf_avg.append((word, np.mean(tfidf[:, i].toarray())))

# sort the list by tf-idf in descending order
tfidf_avg = sorted(tfidf_avg, key=lambda x: x[1], reverse=True)

# print the 10 words with the highest tf-idf average
print("Top 10 words by average TF-IDF:")
for i in range(10):
    print("{:<15}: {:.4f}".format(tfidf_avg[i][0], tfidf_avg[i][1]))

# print the 10 words with the lowest tf-idf average
print("\nBottom 10 words by average TF-IDF:")
for i in range(-1, -11, -1):
    print("{:<15}: {:.4f}".format(tfidf_avg[i][0], tfidf_avg[i][1]))


Top 10 words by average TF-IDF:
is             : 0.3797
the            : 0.3531
first          : 0.2422
and            : 0.2422
one            : 0.2422
second         : 0.1820
this           : 0.1393
document       : 0.1393
third          : 0.1393


IndexError: list index out of range

Now let's compute the TF-IDF using scikit-learn on our preprocessed data (the one you used to compute the BOW).

In [22]:
# Import the module
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Compute the TF-IDF
tfidf = vectorizer.fit_transform(corpus)



Compare the 10 highest and lowest TF-IDF words on average to the ones you had by yourself.

In [23]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# create the corpus
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# create the vectorizer and transformer
vectorizer = TfidfVectorizer()
tfidf_transformer = TfidfTransformer()

# compute the term frequency matrix
counts = vectorizer.fit_transform(corpus)

# compute the tf-idf matrix
tfidf = tfidf_transformer.fit_transform(counts)

# compute the average tf-idf scores for each word
tfidf_avg = []
for i, word in enumerate(vectorizer.vocabulary_):
    tfidf_avg.append((word, np.mean(tfidf[:, i].toarray())))

# sort the list by tf-idf in descending order
tfidf_avg = sorted(tfidf_avg, key=lambda x: x[1], reverse=True)

# print the 10 words with the highest tf-idf on average
print("Top 10 words with highest TF-IDF on average:")
for word, score in tfidf_avg[:10]:
    print(f"{word}: {score}")

# print the 10 words with the lowest tf-idf on average
print("\nTop 10 words with lowest TF-IDF on average:")
for word, score in tfidf_avg[-10:]:
    print(f"{word}: {score}")


Top 10 words with highest TF-IDF on average:
is: 0.3797333352764771
the: 0.35307900317259394
first: 0.2421757743247982
and: 0.2421757743247982
one: 0.2421757743247982
second: 0.18202144022735134
this: 0.13926610266483697
document: 0.13926610266483697
third: 0.13926610266483697

Top 10 words with lowest TF-IDF on average:
is: 0.3797333352764771
the: 0.35307900317259394
first: 0.2421757743247982
and: 0.2421757743247982
one: 0.2421757743247982
second: 0.18202144022735134
this: 0.13926610266483697
document: 0.13926610266483697
third: 0.13926610266483697


Do you have the same words? How do you explain it?