In [1]:
# ------------------------------------------------------------
# LAB 5 Tutorial: TF-IDF Sentiment Analysis & Text Mining
# ------------------------------------------------------------
# TF-IDF (Term Frequency – Inverse Document Frequency)
# measures how important a word is in a document compared
# to all documents in the dataset (corpus).
# ------------------------------------------------------------

# Import Required Libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus for the walkthrough: three short sentences,
# each one counted as its own document.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog quickly.",
    "A quick brown dog jumps over the lazy fox.",
]

In [3]:
# ------------------------------------------------------------

# Vectorizer built with scikit-learn's default settings
tfidf_vectorizer = TfidfVectorizer()

# fit_transform() does two things in one call:
#   1. learns the vocabulary from the documents
#   2. computes the TF-IDF value of every word in every document
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Turn the matrix into a readable table:
#   rows    -> documents
#   columns -> words
#   values  -> TF-IDF scores
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Heading, blank line, then the table
print("TF-IDF Matrix:\n", tfidf_df, sep="\n")

TF-IDF Matrix:

      brown       dog       fox      jump     jumps      lazy     never  \
0  0.348755  0.270840  0.348755  0.000000  0.348755  0.270840  0.000000   
1  0.000000  0.281715  0.000000  0.476986  0.000000  0.281715  0.476986   
2  0.394903  0.306678  0.394903  0.000000  0.394903  0.306678  0.000000   

       over     quick   quickly       the  
0  0.270840  0.348755  0.000000  0.541679  
1  0.281715  0.000000  0.476986  0.281715  
2  0.306678  0.394903  0.000000  0.306678  


In [4]:
# ------------------------------------------------------------

# Corpus-wide weight of each term: add up its TF-IDF scores over all
# documents (column sums), flattened to a 1-D array with .A1.
# A high total marks a word that matters across the whole corpus.
term_importance = tfidf_matrix.sum(axis=0).A1

# Vocabulary words, one per matrix column
terms = tfidf_vectorizer.get_feature_names_out()

# Pair each word with its total and rank from most to least important
importance_df = pd.DataFrame(
    {"Term": terms, "Importance": term_importance}
).sort_values(by="Importance", ascending=False)

# Blank line, heading, blank line, then the ranked table
print("\nMost Important Terms in Corpus:\n", importance_df, sep="\n")



Most Important Terms in Corpus:

       Term  Importance
10      the    1.130072
5      lazy    0.859233
1       dog    0.859233
7      over    0.859233
0     brown    0.743659
2       fox    0.743659
4     jumps    0.743659
8     quick    0.743659
3      jump    0.476986
6     never    0.476986
9   quickly    0.476986


In [5]:
#Exercise : TF-IDF with Scikit-Learn — Introduction to Cultural Analytics & Python

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
#pd.set_option("max_rows", 600)
from pathlib import Path  
import glob

# Go to the GitHub repo for the book:
# https://github.com/melaniewalsh/Intro-Cultural-Analytics
# Click Code → Download ZIP,  Unzip it.

In [7]:
import os
import glob

directory_path = "US_Inaugural_Addresses/"

# Sanity check that the corpus folder is present. os.listdir() returns
# entries in arbitrary order, so sort before previewing the first five.
print(sorted(os.listdir(directory_path))[:5])   # check files exist

# glob.glob() also returns matches in arbitrary order; sorting keeps the
# document order stable across machines and re-runs. os.path.join avoids
# the doubled separator that f"{directory_path}/*.txt" produces when
# directory_path already ends in "/".
text_files = sorted(glob.glob(os.path.join(directory_path, "*.txt")))

['01_washington_1789.txt', '02_washington_1793.txt', '03_adams_john_1797.txt', '04_jefferson_1801.txt', '05_jefferson_1805.txt']


In [8]:
# Collect every .txt file in the corpus folder. The result is sorted because
# glob.glob() returns matches in arbitrary order; os.path.join avoids a
# doubled separator when directory_path already ends in "/".
text_files = sorted(glob.glob(os.path.join(directory_path, "*.txt")))

In [9]:
# Display the collected file paths to confirm the glob matched the corpus files
text_files

['US_Inaugural_Addresses\\01_washington_1789.txt',
 'US_Inaugural_Addresses\\02_washington_1793.txt',
 'US_Inaugural_Addresses\\03_adams_john_1797.txt',
 'US_Inaugural_Addresses\\04_jefferson_1801.txt',
 'US_Inaugural_Addresses\\05_jefferson_1805.txt',
 'US_Inaugural_Addresses\\06_madison_1809.txt',
 'US_Inaugural_Addresses\\07_madison_1813.txt',
 'US_Inaugural_Addresses\\08_monroe_1817.txt',
 'US_Inaugural_Addresses\\09_monroe_1821.txt',
 'US_Inaugural_Addresses\\10_adams_john_quincy_1825.txt',
 'US_Inaugural_Addresses\\11_jackson_1829.txt',
 'US_Inaugural_Addresses\\12_jackson_1833.txt',
 'US_Inaugural_Addresses\\13_van_buren_1837.txt',
 'US_Inaugural_Addresses\\14_harrison_1841.txt',
 'US_Inaugural_Addresses\\15_polk_1845.txt',
 'US_Inaugural_Addresses\\16_taylor_1849.txt',
 'US_Inaugural_Addresses\\17_pierce_1853.txt',
 'US_Inaugural_Addresses\\18_buchanan_1857.txt',
 'US_Inaugural_Addresses\\19_lincoln_1861.txt',
 'US_Inaugural_Addresses\\20_lincoln_1865.txt',
 'US_Inaugural_Addre

In [10]:
# Title of each document = its file name without directory or ".txt" extension
text_titles = [Path(file_path).stem for file_path in text_files]

In [11]:
# Display the titles derived from the file names
text_titles

['01_washington_1789',
 '02_washington_1793',
 '03_adams_john_1797',
 '04_jefferson_1801',
 '05_jefferson_1805',
 '06_madison_1809',
 '07_madison_1813',
 '08_monroe_1817',
 '09_monroe_1821',
 '10_adams_john_quincy_1825',
 '11_jackson_1829',
 '12_jackson_1833',
 '13_van_buren_1837',
 '14_harrison_1841',
 '15_polk_1845',
 '16_taylor_1849',
 '17_pierce_1853',
 '18_buchanan_1857',
 '19_lincoln_1861',
 '20_lincoln_1865',
 '21_grant_1869',
 '22_grant_1873',
 '23_hayes_1877',
 '24_garfield_1881',
 '25_cleveland_1885',
 '26_harrison_1889',
 '27_cleveland_1893',
 '28_mckinley_1897',
 '29_mckinley_1901',
 '30_roosevelt_theodore_1905',
 '31_taft_1909',
 '32_wilson_1913',
 '33_wilson_1917',
 '34_harding_1921',
 '35_coolidge_1925',
 '36_hoover_1929',
 '37_roosevelt_franklin_1933',
 '38_roosevelt_franklin_1937',
 '39_roosevelt_franklin_1941',
 '40_roosevelt_franklin_1945',
 '41_truman_1949',
 '42_eisenhower_1953',
 '43_eisenhower_1957',
 '44_kennedy_1961',
 '45_johnson_1965',
 '46_nixon_1969',
 '47_

## Calculate tf–idf
To calculate tf–idf scores for every word in every document, we're going to use scikit-learn's [`TfidfVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).

When you initialize TfidfVectorizer, you can choose to set it with different parameters. These parameters will change the way you calculate tf–idf.

In [12]:
# input='filename'     -> the vectorizer is given file paths and reads the text itself
# stop_words='english' -> scikit-learn's built-in English stop-word list is removed
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    input='filename',
)

In [13]:
# Learn the vocabulary from the files listed in text_files and compute their
# TF-IDF weights in a single call.
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

Make a DataFrame out of the resulting tf–idf vector, using the "feature names" (the words) as columns and the document titles as row labels.

In [14]:
# Dense table of scores: rows are labelled with the document titles,
# columns with the vocabulary terms.
tfidf_df = pd.DataFrame(
    data=tfidf_vector.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=text_titles,
)

Add a row for document frequency, i.e. the number of documents in which each word appears.

In [15]:
# Document frequency = number of documents in which a term has a non-zero
# TF-IDF weight. Count over the real document rows only: excluding any
# existing '00_Document Frequency' row keeps this cell idempotent, so
# re-running it does not count the summary row itself as one more document.
tfidf_df.loc['00_Document Frequency'] = (
    tfidf_df.drop(index='00_Document Frequency', errors='ignore') > 0
).sum()

In [16]:
# Keep only a handful of illustrative terms (all rows, selected columns)
tfidf_slice = tfidf_df.loc[:, [
    'government', 'borders', 'people', 'obama', 'war',
    'honor', 'foreign', 'men', 'women', 'children',
]]

# Sort by row label and round to two decimals for display
tfidf_slice.sort_index().round(2)

Unnamed: 0,government,borders,people,obama,war,honor,foreign,men,women,children
00_Document Frequency,53.0,5.0,56.0,3.0,45.0,32.0,32.0,47.0,15.0,22.0
01_washington_1789,0.11,0.0,0.05,0.0,0.0,0.0,0.0,0.02,0.0,0.0
02_washington_1793,0.06,0.0,0.05,0.0,0.0,0.08,0.0,0.0,0.0,0.0
03_adams_john_1797,0.16,0.0,0.19,0.0,0.01,0.1,0.12,0.04,0.0,0.0
04_jefferson_1801,0.16,0.0,0.01,0.0,0.01,0.04,0.0,0.04,0.0,0.0
05_jefferson_1805,0.03,0.0,0.0,0.0,0.04,0.0,0.06,0.01,0.0,0.02
06_madison_1809,0.0,0.0,0.02,0.0,0.02,0.05,0.05,0.0,0.0,0.0
07_madison_1813,0.04,0.0,0.04,0.0,0.25,0.02,0.02,0.0,0.0,0.0
08_monroe_1817,0.17,0.0,0.11,0.0,0.09,0.01,0.1,0.04,0.0,0.0
09_monroe_1821,0.08,0.0,0.06,0.0,0.11,0.02,0.04,0.01,0.0,0.01


Let's drop the "00_Document Frequency" row, since we were only using it for illustration purposes.

In [17]:
# Remove the illustrative summary row; errors='ignore' makes this a no-op
# when the row is not present.
tfidf_df = tfidf_df.drop(index='00_Document Frequency', errors='ignore')

Let's reorganize the DataFrame so that the words are in rows rather than columns.

In [18]:
# Preview the long format: stack the term columns into rows, then turn the
# resulting index levels back into ordinary columns.
(
    tfidf_df
    .stack()
    .reset_index()
)

Unnamed: 0,level_0,level_1,0
0,01_washington_1789,000,0.000000
1,01_washington_1789,03,0.000000
2,01_washington_1789,04,0.023259
3,01_washington_1789,05,0.000000
4,01_washington_1789,100,0.000000
...,...,...,...
521937,58_trump_2017,zachary,0.000000
521938,58_trump_2017,zeal,0.000000
521939,58_trump_2017,zealous,0.000000
521940,58_trump_2017,zealously,0.000000


In [19]:
# Replace the wide table with its long form (one row per document/term pair)
tfidf_df = (
    tfidf_df
    .stack()
    .reset_index()
)

In [20]:
# After stack().reset_index() the columns carry the default names
# 'level_0', 'level_1' and 0; give them descriptive names.
# The former 'level_2' -> 'term' entry is omitted: no such column exists in
# this frame, and had one existed the rename would have produced two
# columns both called 'term'.
tfidf_df = tfidf_df.rename(
    columns={'level_0': 'document', 'level_1': 'term', 0: 'tfidf'}
)

To find the 10 words with the highest tf–idf score in every document, we're going to sort by document and tf–idf score, then group by document and take the first 10 rows of each group.

In [21]:
# Order rows by document (ascending) and score (descending),
# then keep the first 10 rows of every document group.
(
    tfidf_df
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(10)
)

Unnamed: 0,document,term,tfidf
3707,01_washington_1789,government,0.113681
4108,01_washington_1789,immutable,0.103883
4175,01_washington_1789,impressions,0.103883
6337,01_washington_1789,providential,0.103883
5631,01_washington_1789,ought,0.103728
...,...,...,...
518409,58_trump_2017,obama,0.120288
518766,58_trump_2017,people,0.112370
521001,58_trump_2017,thank,0.109171
513989,58_trump_2017,borders,0.107075


In [22]:
# Top 10 highest-scoring terms per document, saved for the cells below
top_tfidf = (
    tfidf_df
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(10)
)

We can zoom in on particular words and particular documents.

In [23]:
# Rows of the top-10 table whose term contains "women"
top_tfidf.loc[top_tfidf['term'].str.contains('women')]

Unnamed: 0,document,term,tfidf
503861,56_obama_2009,women,0.084859


In [24]:
# Top-10 terms for documents whose title contains "obama"
top_tfidf.loc[top_tfidf['document'].str.contains('obama')]

Unnamed: 0,document,term,tfidf
495406,56_obama_2009,america,0.148351
500298,56_obama_2009,nation,0.120229
500358,56_obama_2009,new,0.118002
503093,56_obama_2009,today,0.114792
498590,56_obama_2009,generation,0.100654
499762,56_obama_2009,let,0.0911
499578,56_obama_2009,jobs,0.090727
496911,56_obama_2009,crisis,0.087235
498779,56_obama_2009,hard,0.084859
503861,56_obama_2009,women,0.084859


In [25]:
# Top-10 terms for documents whose title contains "trump"
top_tfidf.loc[top_tfidf['document'].str.contains('trump')]

Unnamed: 0,document,term,tfidf
513404,58_trump_2017,america,0.350162
515585,58_trump_2017,dreams,0.156436
513405,58_trump_2017,american,0.149226
517576,58_trump_2017,jobs,0.142766
519262,58_trump_2017,protected,0.132439
518409,58_trump_2017,obama,0.120288
518766,58_trump_2017,people,0.11237
521001,58_trump_2017,thank,0.109171
513989,58_trump_2017,borders,0.107075
521596,58_trump_2017,ve,0.107075


In [26]:
# Top-10 terms for documents whose title contains "kennedy"
top_tfidf.loc[top_tfidf['document'].str.contains('kennedy')]

Unnamed: 0,document,term,tfidf
391774,44_kennedy_1961,let,0.267869
394306,44_kennedy_1961,sides,0.262849
392921,44_kennedy_1961,pledge,0.16096
387632,44_kennedy_1961,ask,0.107713
387864,44_kennedy_1961,begin,0.106495
388991,44_kennedy_1961,dare,0.106495
395895,44_kennedy_1961,world,0.10311
390313,44_kennedy_1961,final,0.102311
392370,44_kennedy_1961,new,0.0966
390120,44_kennedy_1961,explore,0.094223


## Visualize TF-IDF
We can also visualize our TF-IDF results with the data visualization library Altair.

In [29]:
# Uncomment the line below to install Altair into this kernel's environment
#%pip install altair

Let's make a heatmap that shows the highest-scoring TF-IDF words for each inaugural address, and let's put a red dot next to two terms of interest: "war" and "peace":


In [30]:
import altair as alt
import numpy as np

# Terms in this list will get a red dot in the visualization
term_list = ['war', 'peace']

# Add a tiny amount of noise so that tied TF-IDF scores receive distinct
# ranks. The generator is seeded so the tie-breaking, and therefore the
# chart, is the same on every run instead of changing with each execution.
rng = np.random.default_rng(42)
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = (
    top_tfidf_plusRand['tfidf'] + rng.random(len(top_tfidf_plusRand)) * 0.0001
)

# Base shared by all layers: rank on x, document on y. The rank is computed
# per document by a window transform over descending tfidf.
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# Heatmap layer: cell color encodes the TF-IDF score
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# Circle layer: red for terms in term_list; every other term gets
# '#FFFFFF00' (white with zero alpha), so its circle is not visible
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# Text layer: the term itself, white where tfidf >= 0.23 (darker heatmap
# cells), black otherwise
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# Display the three superimposed layers
(heatmap + circle + text).properties(width = 600)