# 1-8 Comparing Texts Redux

In [1]:
# IMPORTS
import pandas as pd#, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# import matplotlib.pyplot as plt

# Set plt parameters
# plt.rcParams['figure.dpi'] = 300
# plt.rcParams["figure.figsize"] = (10,5)

In [2]:
# Load every text into memory as one string per file.
# This is a very creaky way to load data: the file stems and the relative
# data directory are hard-coded here.
files = ["A", "B", "C", "D", "E", "F", "G", "H", "mdg"]

strings = []
for i in files:
    # Create the path to the file
    the_file = "../data/1924/texts/"+i+".txt"
    # Read the file to a string. The context manager closes the file handle
    # as soon as the read is done instead of leaving it open.
    with open(the_file, 'r') as handle:
        the_string = handle.read()
    # Add the string to a list of strings
    strings.append(the_string)

# Sanity check: how many texts were loaded, and the opening of the last one
print(len(strings), strings[8][0:50])

9 "Off there to the right -- somewhere -- is a large


In a previous lab, you calculated the term frequency for a document, and then for a set of documents. Term frequency is defined as the **relative frequency** of term t within document d. 

Term frequency is useful for exploring a single document and for comparing terms across documents, perhaps beginning to establish what a document is about, but what if there were a way to weight TFs so that we lowered the value of terms that occur across many, if not all, documents?

Enter IDF. Developed in 1972 by Karen Spärck Jones, the inverse document frequency (IDF) is a measure of how much information a word provides: that is, whether it is common or rare across all documents.

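It is the logarithmically scaled inverse fraction of the documents that contain the word. IDF is obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient. In symbols, $\mathrm{idf}(t) = \log(N / n_t)$, where $N$ is the total number of documents and $n_t$ is the number of documents that contain term $t$. *Phew*.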

We can then weight our term frequencies by the inverse document frequency to get a better sense of how a word contributes to the distinctiveness of a document (text). The math is simple: `tf * idf`. Are you ready to do some math?

In [3]:
# Set up the TF-IDF vectorizer with three settings: lowercase the text,
# drop English stop words, and skip any word that does not occur in at
# least two texts.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, lowercase=True)

# Fit the vectorizer to the texts and build the TF-IDF matrix in one step
tfidf = vectorizer.fit_transform(strings)

# Hold on to the feature names; later cells use them as labels
vocabulary = vectorizer.get_feature_names_out()

# Show the dimensions of the resulting matrix
tfidf.shape

(9, 2337)

In [4]:
# Convert the TF-IDF matrix to a dense dataframe with one column per
# feature name. The row labels are handed to the constructor as a named
# index rather than added as a "label" column and promoted with set_index:
# assigning df["label"] would overwrite the scores of any vocabulary term
# that happens to be spelled "label", and set_index would then drop it.
df = pd.DataFrame(tfidf.toarray(),
                  index = pd.Index(files, name="label"),
                  columns = vectorizer.get_feature_names_out())

# See what this looks like:
df.head(9)

Unnamed: 0_level_0,_that_,abandon,abandoning,ability,able,abrupt,abruptly,absolutely,absorbed,abstraction,...,ye,year,years,yelled,yellow,yes,york,young,younger,youth
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.027842,0.0,0.08816,0.0,0.0
B,0.0,0.0,0.0,0.02084,0.011566,0.0,0.014286,0.0,0.036239,0.0,...,0.0,0.028571,0.011566,0.0,0.0,0.031355,0.014286,0.066189,0.014286,0.0
C,0.012755,0.0,0.0,0.0,0.007079,0.025511,0.034975,0.0,0.01109,0.012755,...,0.0,0.052463,0.077868,0.012755,0.008744,0.012794,0.052463,0.040512,0.008744,0.022181
D,0.0,0.0,0.014409,0.0,0.023989,0.0,0.009877,0.0,0.0,0.014409,...,0.0,0.0,0.007996,0.028818,0.049386,0.028906,0.0,0.019613,0.0,0.0
E,0.0,0.0,0.0,0.0,0.01717,0.0,0.0,0.0,0.0,0.0,...,0.0,0.031812,0.077264,0.0,0.0,0.093098,0.0,0.007019,0.0,0.0
F,0.0,0.028907,0.0,0.0,0.0,0.0,0.009908,0.014453,0.0,0.0,...,0.028907,0.0,0.072191,0.0,0.009908,0.0,0.0,0.019674,0.009908,0.025134
G,0.009458,0.018915,0.018915,0.009458,0.015746,0.0,0.0,0.009458,0.0,0.0,...,0.047288,0.019449,0.015746,0.0,0.006483,0.047433,0.019449,0.021456,0.019449,0.008223
H,0.0,0.0,0.0,0.0,0.00904,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.022333,0.040848,0.011166,0.103472,0.0,0.0
mdg,0.0,0.0,0.0,0.0,0.004893,0.008817,0.006044,0.0,0.007666,0.0,...,0.0,0.006044,0.009786,0.0,0.0,0.022109,0.012088,0.016001,0.006044,0.0


In [5]:
# Flip the dataframe so that its rows become columns and vice versa
words = df.T

# Peek at the first few rows
words.head()

label,A,B,C,D,E,F,G,H,mdg
_that_,0.0,0.0,0.012755,0.0,0.0,0.0,0.009458,0.0,0.0
abandon,0.0,0.0,0.0,0.0,0.0,0.028907,0.018915,0.0,0.0
abandoning,0.0,0.0,0.0,0.014409,0.0,0.0,0.018915,0.0,0.0
ability,0.0,0.02084,0.0,0.0,0.0,0.0,0.009458,0.0,0.0
able,0.0,0.011566,0.007079,0.023989,0.01717,0.0,0.015746,0.00904,0.004893


In [6]:
# Uncomment this to see it work, but it's over 2000 lines long!
# Also, there are more color maps available: `cmap` takes any Matplotlib colormap name. (See: pandas Styler docs.)
# words.style.background_gradient(axis=None, cmap='Purples')

## Clustering Words = Topics

What we have in our first dataframe above is a document-term matrix (DTM), a matrix in which documents are arranged as rows (observations) and terms appear as features of those rows. 

What if there were some way to re-imagine this matrix in terms of the matrix multiplication that assembled it? That is, could we imagine a matrix ***W*** that specified the topics around which documents were assembled and a matrix ***H*** that described the relationships of words to those topics? If we could break the output matrix above into its constituent matrices, we might be able to glean what makes these documents tick.

<div class="alert alert-block alert-info">
Please note that the "guesstimation" approach used below, while frequently used in the past, is generally frowned upon in the current moment. There are better ways to proceed than guesstimating. They involve, however, casting a number of other spells first, like <b>k-means</b>, which we will explore a little later in this course.
</div>

In [7]:
# Are there word clusters in this data set?
nmf = NMF(n_components=9, 
          random_state=1, 
          max_iter=500, 
          init='nndsvd').fit(tfidf)
nmf_W = nmf.transform(tfidf)
nmf_H = nmf.components_
nmf_W.shape

(9, 9)

In [8]:
def display_topics(model, feature_names, n_top_words):
    """Print the highest-weighted features of each component of a model.

    One line is printed per row of ``model.components_``, in the form
    ``<row index>: <feature> <weight>, <feature> <weight>, ...`` with the
    weights rounded to two decimals and listed from largest to smallest.
    A blank line is printed after the last row.

    Parameters
    ----------
    model : object
        Anything exposing a ``components_`` attribute whose rows support
        ``argsort()`` and integer indexing.
    feature_names : sequence of str
        Name for each column index of ``model.components_``.
    n_top_words : int
        How many features to list per row.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to collect
        # the n_top_words largest weights, largest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]

        entries = []
        for i in top_indices:
            weight = str(round(topic[i], 2))
            entries.append(feature_names[i] + ' ' + weight + ',')

        print("{:d}: ".format(topic_idx) + " ".join(entries))
    print()

In [9]:
# List the ten highest-weighted words for each component of the fitted model
display_topics(model=nmf, feature_names=vocabulary, n_top_words=10)

0: man 0.43, bird 0.39, tea 0.29, birds 0.29, bees 0.29, professor 0.29, mr 0.25, said 0.24, mother 0.24, oh 0.24,
1: father 0.54, priest 0.31, god 0.22, son 0.16, church 0.14, mouth 0.13, like 0.12, boy 0.12, sin 0.12, eyes 0.12,
2: henry 0.78, dance 0.19, said 0.13, bank 0.09, dancing 0.09, feet 0.08, read 0.07, time 0.07, moment 0.07, birthday 0.06,
3: roman 0.21, walls 0.19, man 0.18, family 0.18, la 0.18, cat 0.17, stone 0.16, did 0.12, unknown 0.11, altar 0.11,
4: general 0.65, said 0.22, island 0.11, animal 0.1, night 0.1, sea 0.09, tree 0.09, trail 0.09, mr 0.08, hunter 0.08,
5: cot 0.24, iron 0.16, real 0.15, ship 0.15, great 0.15, man 0.14, said 0.14, hospital 0.13, palms 0.12, nurse 0.11,
6: george 0.54, father 0.44, uncle 0.42, said 0.29, boat 0.26, baby 0.19, woman 0.17, road 0.17, water 0.16, camp 0.14,
7: mr 0.27, man 0.21, american 0.2, said 0.16, sir 0.15, morrow 0.11, round 0.11, friend 0.11, did 0.11, clear 0.09,
8: said 0.19, mr 0.17, golf 0.15, like 0.13, girl 0.13