# Quick Visualizations

In [None]:
# IMPORTS
from pathlib import Path
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Plot defaults for the whole notebook: render figures at a higher
# resolution and give them a wide 10x5 canvas.
plt.rcParams.update({
    'figure.dpi': 300,
    'figure.figsize': (10, 5),
})

In [8]:
# A container to put things in:
strings = []

# And here are the things: the raw bytes of every regular file in the
# corpus folder. Path.read_bytes() opens, reads, and closes the file in one
# step, so no file handle is left dangling the way a bare open().read() does.
# NOTE: iterdir() makes no promise about ordering; anything that is later
# paired with `strings` by position has to walk the directory the same way.
for child in Path('./legends/louisiana/').iterdir():
    if child.is_file():
        strings.append(child.read_bytes())

# A quick check:
print(len(strings))

27


In [27]:
# Every regular file in the corpus folder, as a plain string path.
filepaths = [
    str(child)
    for child in Path('./legends/louisiana/').iterdir()
    if child.is_file()
]

# Keep only the file name. Asking pathlib for the final path component works
# with any path separator, whereas stripping a hard-coded "legends/louisiana/"
# prefix silently does nothing when the separator is a backslash.
no_prefix = [ Path(fp).name for fp in filepaths ]

# Drop the extension to get a short label for each text.
labels = [ i.removesuffix(".txt") for i in no_prefix ]
print(labels)

['uls-009', 'uls-008', 'lau-013', 'anc-090', 'anc-091', 'loh-157', 'lau-014', 'loh-160', 'loh-161', 'loh-163', 'loh-162', 'anc-088', 'anc-089', 'loh-159', 'loh-165', 'loh-164', 'loh-158', 'uls-006', 'uls-007', 'uls-011', 'uls-005', 'uls-004', 'uls-010', 'uls-001', 'loh-162b', 'uls-003', 'uls-002']


In [36]:
# Turn the texts into raw term counts. Three settings are in play:
#   lowercase=True  folds everything to lower case before tokenizing,
#   min_df=2        drops words found in fewer than two texts,
#   max_df=0.9      drops words found in more than 90% of the texts.
vectorizer = CountVectorizer(
    lowercase=True,
    min_df=2,
    max_df=0.9,
)

# Learn the vocabulary and build the document-term matrix in one pass.
# (Despite its name, `tfidf` holds plain counts here, not tf-idf weights.)
tfidf = vectorizer.fit_transform(strings)

# We'll need the vocabulary later
vocabulary = vectorizer.get_feature_names_out()

# How many texts (rows) and features (columns) do we have?
tfidf.shape

(27, 459)

In [37]:
# Convert to a dataframe: one row per text, one column per vocabulary term.
# The text labels go straight into the index. Routing them through a
# temporary "label" column would overwrite (and then drop) the counts for
# the word "label" if it ever made it into the vocabulary.
df = pd.DataFrame(tfidf.toarray(),
                  index = pd.Index(labels, name="label"),
                  columns = vectorizer.get_feature_names_out())

# See what this looks like:
df.head(27)

Unnamed: 0_level_0,able,about,according,active,actually,after,again,against,ago,ahead,...,working,would,wouldn,writing,yard,yeah,years,you,young,your
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
uls-009,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,1,0,0
uls-008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
lau-013,0,3,0,0,0,0,2,0,0,0,...,0,2,0,0,1,0,0,5,2,0
anc-090,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
anc-091,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,3,2,3,0,0
loh-157,0,0,0,0,1,2,0,0,0,0,...,0,1,0,0,1,0,0,5,0,0
lau-014,0,0,0,0,0,2,0,0,0,0,...,1,2,0,0,2,1,0,16,0,0
loh-160,0,5,1,0,0,2,0,0,0,0,...,0,3,1,0,0,0,1,6,1,0
loh-161,0,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
loh-163,0,0,0,1,0,1,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0


In [34]:
# Flip the document-term matrix so that each row is a word and each column
# is a text, then peek at the first few words.
words = df.T
words.head()

label,uls-009,uls-008,lau-013,anc-090,anc-091,loh-157,lau-014,loh-160,loh-161,loh-163,...,uls-006,uls-007,uls-011,uls-005,uls-004,uls-010,uls-001,loh-162b,uls-003,uls-002
1812,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1912,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
able,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
about,0,0,3,0,0,0,0,5,2,0,...,0,0,0,0,0,1,0,1,0,0
according,0,0,0,0,0,0,0,1,0,0,...,0,2,0,0,0,0,0,0,0,0


In [6]:
# Uncomment this to see it work, but it's over 2000 lines long!
# Also, there are more color maps available. (See: pandas docs.)
# words.style.background_gradient(axis=None, cmap='Purples')

## Clustering Words = Topics

What we have in our first dataframe above is a document-term matrix (DTM), a matrix in which documents are arranged as rows (observations) and terms appear as features of those rows. 

What if there were some way to re-imagine this matrix in terms of the matrix multiplication that assembled it? That is, could we imagine a matrix ***W*** that specifies the topics around which documents were assembled, and a matrix ***H*** that describes the relationships of words to those topics? If we could break the output matrix above into its constituent matrices, we might be able to glean what makes these documents tick.

<div class="alert alert-block alert-info">
Please note that the "guesstimation" approach used below, while frequently used in the past, is generally frowned upon in the current moment. There are better ways to proceed than guesstimating. They involve, however, casting a number of other spells first, like <b>k-means</b>, which we will explore a little later in this course.
</div>

In [7]:
# Are there word clusters in this data set?
# Set up a nine-component factorization with a fixed seed, then fit it to
# the document-term matrix.
nmf = NMF(
    n_components=9,
    init='nndsvd',
    max_iter=500,
    random_state=1,
)
nmf.fit(tfidf)

# W: the texts expressed in terms of the fitted components
nmf_W = nmf.transform(tfidf)
# H: the fitted components themselves, expressed in terms of the vocabulary
nmf_H = nmf.components_

nmf_W.shape

(9, 9)

In [8]:
def display_topics(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    One line is printed per row of ``model.components_``, in the form
    ``<topic index>: <feature> <weight>, <feature> <weight>, ...`` with the
    weights rounded to two decimals and listed from largest to smallest.
    A blank line is printed after the last topic.

    Parameters
    ----------
    model : object exposing a ``components_`` array (one row per topic)
    feature_names : sequence mapping a column position to its feature name
    n_top_words : how many features to list for each topic
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so a reversed slice off the tail yields the
        # n largest weights in descending order.
        top_positions = weights.argsort()[:-n_top_words - 1:-1]

        entries = []
        for i in top_positions:
            entries.append(feature_names[i] + ' ' + str(round(weights[i], 2)) + ',')

        print("{:d}: ".format(topic_idx) + " ".join(entries))
    print()

In [9]:
# Show the ten strongest words for each fitted topic.
display_topics(model=nmf, feature_names=vocabulary, n_top_words=10)

0: man 0.43, bird 0.39, tea 0.29, birds 0.29, bees 0.29, professor 0.29, mr 0.25, said 0.24, mother 0.24, oh 0.24,
1: father 0.54, priest 0.31, god 0.22, son 0.16, church 0.14, mouth 0.13, like 0.12, boy 0.12, sin 0.12, eyes 0.12,
2: henry 0.78, dance 0.19, said 0.13, bank 0.09, dancing 0.09, feet 0.08, read 0.07, time 0.07, moment 0.07, birthday 0.06,
3: roman 0.21, walls 0.19, man 0.18, family 0.18, la 0.18, cat 0.17, stone 0.16, did 0.12, unknown 0.11, altar 0.11,
4: general 0.65, said 0.22, island 0.11, animal 0.1, night 0.1, sea 0.09, tree 0.09, trail 0.09, mr 0.08, hunter 0.08,
5: cot 0.24, iron 0.16, real 0.15, ship 0.15, great 0.15, man 0.14, said 0.14, hospital 0.13, palms 0.12, nurse 0.11,
6: george 0.54, father 0.44, uncle 0.42, said 0.29, boat 0.26, baby 0.19, woman 0.17, road 0.17, water 0.16, camp 0.14,
7: mr 0.27, man 0.21, american 0.2, said 0.16, sir 0.15, morrow 0.11, round 0.11, friend 0.11, did 0.11, clear 0.09,
8: said 0.19, mr 0.17, golf 0.15, like 0.13, girl 0.13