# TF-IDF demo with Python and Pandas
# using US Presidential Inaugural speeches

In [1]:
# February 28, 2017
# on the occasion of a collaborative exploration of a common dataset
# proposed by the organizers of the Digital Approaches Reading Group
# at Washington University in St. Louis

In [1]:
import glob
import numpy as np
import pandas as pd
import os
import re
from collections import Counter

In [2]:
def tokenize(txt):
    """Split text into lowercase word tokens (simplified tokenization for demo).

    The text is lowercased and split on runs of non-word characters; the
    empty strings that the split leaves at the edges are dropped.
    """
    # Raw string: "\W" in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return [t for t in re.split(r"\W+", txt.lower()) if t]

In [3]:
def frameforpath(filepath):
    """Tokenize one inaugural speech file and count its tokens.

    Given a path to an inaugural speech file, read it, tokenize it, and
    return a dataframe with one row per distinct token and the columns
    'doc' (the file name with ".txt" removed), 'word' (the token), and
    'count' (how many times the token occurs in that file).
    """
    # Only the read needs the open handle; release it before processing.
    with open(filepath) as filehandle:
        txt = filehandle.read()
    filename = os.path.split(filepath)[-1].replace(".txt", "")
    # Counter gives us a dictionary of tokens and counts;
    # next we splice the filename into each row and make a dataframe.
    tokencounts = Counter(tokenize(txt))
    datalist = [(filename, token, count)
                for (token, count) in tokencounts.items()]
    # Naming the columns in the constructor (instead of assigning them
    # afterwards) also works for a file with no tokens, where the frame
    # would have zero columns and the assignment would raise.
    return pd.DataFrame(datalist, columns=['doc', 'word', 'count'])

In [4]:
# Create a single dataframe with all the tokens.
# Each inaugural's dataframe has the same structure, so we build them all
# and concatenate them once into one frame.
# (DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.)
filepaths = sorted(glob.glob("inauguralspeeches/*.txt"))
frames = []
for filepath in filepaths:
    frames.append(frameforpath(filepath))
# pd.concat raises on an empty list, so fall back to an empty frame
# when no files matched the pattern.
df = pd.concat(frames) if frames else pd.DataFrame()

In [5]:
# Quick look at the combined frame: its shape, then the first few rows.
print("The dimensions of the full frame: {0} rows, {1} columns".format(df.shape[0], df.shape[1]))
df.head(5)

The dimensions of the full frame: 44681 rows, 3 columns


Unnamed: 0,doc,word,count
0,01_washington_1789,more,8
1,01_washington_1789,should,1
2,01_washington_1789,inapplicable,1
3,01_washington_1789,nor,2
4,01_washington_1789,than,6


In [6]:
# Now let's turn this into a word-document matrix of word counts
# (words as rows, documents as columns), filling in 0 for any empty cells.
# Each (word, doc) pair is expected to occur once in df, so the aggregation
# simply returns the count; "sum" is stated explicitly because the
# pivot_table default, "mean", is not a meaningful way to combine counts.

word_doc = df.pivot_table(index="word", columns="doc", values="count", aggfunc="sum").fillna(0)

In [7]:
# Peek at the rows for a handful of words:

word_doc.loc[["the", "america", "great", "honor"]]

doc,01_washington_1789,02_washington_1793,03_adams_john_1797,04_jefferson_1801,05_jefferson_1805,06_madison_1809,07_madison_1813,08_monroe_1817,09_monroe_1821,10_adams_john_quincy_1825,...,49_reagan_1981,50_reagan_1985,51_bush_george_h_w_1989,52_clinton_1993,53_clinton_1997,54_bush_george_w_2001,55_bush_george_w_2005,56_obama_2009,57_obama_2013,58_trump_2017
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,116.0,13.0,163.0,130.0,143.0,104.0,100.0,275.0,360.0,304.0,...,123.0,132.0,121.0,89.0,133.0,53.0,142.0,132.0,104.0,71.0
america,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,6.0,8.0,7.0,19.0,15.0,11.0,20.0,10.0,8.0,20.0
great,3.0,0.0,5.0,1.0,1.0,0.0,0.0,21.0,29.0,9.0,...,4.0,5.0,10.0,2.0,6.0,3.0,4.0,1.0,3.0,6.0
honor,0.0,1.0,7.0,2.0,0.0,2.0,1.0,1.0,2.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0


In [8]:
# Words are rows and documents are columns here; if we would rather have
# it the other way round, transposing a dataframe is a single call:
word_doc.transpose().head()

word,000,03,04,05,1,100,120,125,13,14th,...,yours,yourself,yourselves,youth,youthful,zachary,zeal,zealous,zealously,zone
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01_washington_1789,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02_washington_1793,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
03_adams_john_1797,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
04_jefferson_1801,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
05_jefferson_1805,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0


In [9]:
# Summing across the columns (one per document) gives the total count
# for each word:

word_totals = word_doc.sum(axis="columns")

In [10]:
# and summing down the rows (one per word) gives the total number of
# words in each document:

doc_totals = word_doc.sum(axis="index")

In [11]:
# Inspect the totals (both are Series, not DataFrames):

word_totals.head(5)

word
000    13.0
03     27.0
04     26.0
05      2.0
1      22.0
dtype: float64

In [12]:
# First few per-document word totals:
doc_totals.head(5)

doc
01_washington_1789    1436.0
02_washington_1793     140.0
03_adams_john_1797    2327.0
04_jefferson_1801     1731.0
05_jefferson_1805     2171.0
dtype: float64

In [13]:
# Word frequency within a document = the word's count in that document
# divided by the document's total word count. doc_totals is indexed by
# document, so it has to be matched against the columns of word_doc when
# dividing. We'll call the result "tf", but it's really a document-term
# frequency.

tf = word_doc.divide(doc_totals, axis="columns")

In [14]:
# Look at a few words again and compare with the raw counts above:

tf.loc[["the", "america", "great", "upbraidings"]]

doc,01_washington_1789,02_washington_1793,03_adams_john_1797,04_jefferson_1801,05_jefferson_1805,06_madison_1809,07_madison_1813,08_monroe_1817,09_monroe_1821,10_adams_john_quincy_1825,...,49_reagan_1981,50_reagan_1985,51_bush_george_h_w_1989,52_clinton_1993,53_clinton_1997,54_bush_george_w_2001,55_bush_george_w_2005,56_obama_2009,57_obama_2013,58_trump_2017
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,0.08078,0.092857,0.070047,0.075101,0.065868,0.088136,0.082237,0.081337,0.080268,0.104038,...,0.05002,0.050536,0.051533,0.055108,0.061121,0.03296,0.067943,0.054704,0.048826,0.048168
america,0.0,0.007143,0.002149,0.0,0.0,0.0,0.0,0.0,0.000446,0.0,...,0.00244,0.003063,0.002981,0.011765,0.006893,0.006841,0.009569,0.004144,0.003756,0.013569
great,0.002089,0.0,0.002149,0.000578,0.000461,0.0,0.0,0.006211,0.006466,0.00308,...,0.001627,0.001914,0.004259,0.001238,0.002757,0.001866,0.001914,0.000414,0.001408,0.004071
upbraidings,0.0,0.007143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Sanity check on a single cell: pull out the count, the document total and
# the computed frequency for one word so they can be compared by eye.
trial_doc = "01_washington_1789"
trial_word = "the"
trial_count = int(word_doc.at[trial_word, trial_doc])
trial_total = int(doc_totals.at[trial_doc])
trial_freq = tf.at[trial_word, trial_doc]
print(
    "In document '{}' \nthe word '{}' has frequency {:06f}, \n({} occurrences / {} total words)".format(
        trial_doc,
        trial_word,
        trial_freq,
        trial_count,
        trial_total,
    )
)

In document '01_washington_1789' 
the word 'the' has frequency 0.080780, 
(116 occurrences / 1436 total words)


To calculate TF-IDF, we multiply term frequencies by inverse document frequencies.
We already have TF (term frequencies) in our `tf` dataframe.

IDF will be a Series rather than a frame. For each word, we count how many documents it appears in, divide the total number of documents by that number, and take the natural logarithm of the result. 

If we have a word that appears in just 1 document of a total of 58 documents, our IDF will be: $$\ln\left(\frac{58}{1}\right) \approx 4.060443$$

In [16]:
# Document frequency: for each word, the number of documents it occurs in.
# Comparing word_doc with 0 yields a Boolean frame marking the nonzero
# word-document counts; summing Booleans treats True as 1 and False as 0,
# so the row sums are exactly the document counts.
# Note: this rebinds `df`, which earlier held the token-count dataframe,
# to a Series of document frequencies.
df = word_doc.gt(0).sum(axis="columns")

In [17]:
# Inspect the result, this time from the other end of the series.
# df holds, for each term, the number of documents it appears in:
df.tail(5)

word
zachary      1
zeal         8
zealous      5
zealously    6
zone         1
dtype: int64

In [18]:
# Our current data holds 58 inaugurals, but instead of hard-coding that
# number we read it off the matrix: one column per document.
numdocs = len(word_doc.columns)

In [19]:
# Inverse document frequency: the natural log of
# (number of documents / number of documents containing the word).
idf = np.log(df.rdiv(numdocs))

In [20]:
# Inspect the tail; the fewer documents a word appears in, the higher its idf:

idf.tail(5)

word
zachary      4.060443
zeal         1.981001
zealous      2.451005
zealously    2.268684
zone         4.060443
dtype: float64

In [21]:
# tf-idf: scale each word's row of term frequencies by that word's idf.
# idf is indexed by word, so it is matched against the row index of tf.
tfidf = tf.multiply(idf, axis="index")

In [22]:
# Check out a sample; for one document, we sort the tf-idf scores in
# descending order to see the characteristic words. Only the top rows are
# shown: the sorted frame has one row per vocabulary word, and everything
# past the first few is noise for this purpose.
trialdoc = '01_washington_1789'
tfidf.loc[:, [trialdoc]].sort_values(by=trialdoc, ascending=False).head(10)

doc,01_washington_1789
word,Unnamed: 1_level_1
immutable,0.004690
impressions,0.004690
providential,0.004690
qualifications,0.004125
your,0.003926
peculiarly,0.003724
retreat,0.003414
pecuniary,0.003414
article,0.003414
rendered,0.003414


In [23]:
# The document names are the index labels of the per-document totals.
docnames = doc_totals.index

In [24]:
# For every document, print its name followed by its eight
# highest-scoring tf-idf words.
for docname in doc_totals.index:
    topwords = (
        tfidf.loc[:, [docname]]
        .sort_values(by=docname, ascending=False)
        .index[:8]
        .tolist()
    )
    print("{:>25s} ".format(docname), " ".join(topwords))

       01_washington_1789  immutable impressions providential qualifications your peculiarly retreat pecuniary
       02_washington_1793  arrive 1793 upbraidings willingly violated incurring injunctions previous
       03_adams_john_1797  pleasing houses legislatures virtuous amiable habitual legislature benevolence
        04_jefferson_1801  thousandth retire moments intolerance trusted principle him honest
        05_jefferson_1805  whatsoever false covered enlighten falsehood defamation licentiousness comforts
          06_madison_1809  improvements belligerent rendered distressing inadequacy exempted unwarrantable partialities
          07_madison_1813  british massacre prisoners savage cruel captives until enemy
           08_monroe_1817  trials naval put invasion dangers situated duly persevere
           09_monroe_1821  colonies occurrences preceding spain concluded fortifications 000 coast
10_adams_john_quincy_1825  union dissensions instituted performance candid whatsoever pic