# Estimating Topics

In [1]:
# IMPORTS
# import numpy as np
# import pandas as pd 
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Configure matplotlib defaults once for the whole notebook:
# render at 300 dpi and use a 10 x 5 inch figure size.
plt.rcParams.update({
    'figure.dpi': 300,
    "figure.figsize": (10, 5),
})

In [2]:
# Read every .txt file in the screenplay directory into a list of strings.
# Files are decoded as UTF-8 and undecodable bytes are skipped (errors='ignore').
screenplays = [
    path.read_text(encoding="utf8", errors='ignore')
    for path in Path('../queue/screenplays/').glob('*.txt')
]

In [3]:
# Collect one title per screenplay: the file name without its ".txt" suffix.
# Only the path is needed here, so the files are not opened.
# This globs the same directory and pattern as the cell that fills
# `screenplays`, and relies on both listings coming back in the same order
# for titles[i] to describe screenplays[i].
titles = [p.stem for p in Path('../queue/screenplays/').glob('*.txt')]

The `%%time` cell magic, used in the cells below, reports how long a cell takes to run. Please note that it must be the very first line of the cell: nothing may come before it, not even a comment!

In [4]:
# Sanity check: how many screenplay texts were loaded
print(len(screenplays))

2858


143,000 dimensions is *a lot* of dimensions. Since we are topic modeling these texts, let's see what we can do to reduce the number of words (features).

In [5]:
%%time

# Vectorize our texts while removing function words
# and words that occur in only one text
vectorizer = TfidfVectorizer(lowercase = True,
                             min_df = 10,
                             stop_words='english')

# fit the model to the data 
matrix = vectorizer.fit_transform(screenplays)

# We'll need these later
vocabulary = vectorizer.get_feature_names_out()

# see how many features we have
matrix.shape

CPU times: user 26 s, sys: 164 ms, total: 26.2 s
Wall time: 26.2 s


(2858, 47208)

In [6]:
%%time

wcss = [] 
for i in range(1, 30): 
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(matrix) 
    wcss.append(kmeans.inertia_)

CPU times: user 7min, sys: 38.6 s, total: 7min 39s
Wall time: 1min


In [None]:
%%time

# NOTE(review): this cell re-fits `vectorizer` and overwrites the `matrix` and
# `vocabulary` produced by the earlier min_df = 10 cell. In the recorded run
# the WCSS sweep was computed on that earlier matrix, so anything executed
# after this cell (e.g. the final KMeans fit) sees a different feature space.
# Confirm which vectorization is intended.

# Vectorize our texts while removing function words (English stop words)
# and words that occur in only one text (min_df = 2 keeps terms found in
# at least two texts)
vectorizer = TfidfVectorizer(lowercase = True,
                             min_df = 2,
                             stop_words='english')

# Fit the model to the data and build the document-term TF-IDF matrix
matrix = vectorizer.fit_transform(screenplays)

# We'll need these later
vocabulary = vectorizer.get_feature_names_out()

# See how many features we have: (number of texts, number of features)
matrix.shape

In [None]:
# Elbow plot: WCSS (KMeans inertia) against the number of clusters.
# The x values are derived from len(wcss) instead of a second hard-coded
# range(1, 30), so the plot cannot fall out of step with the sweep that
# filled `wcss`. This assumes the sweep starts at 1 cluster and steps by 1.
fig, ax = plt.subplots()
ax.plot(range(1, len(wcss) + 1), wcss)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
ax.set_title('Elbow plot: WCSS by number of clusters')
plt.show()

In [None]:
# Fit the final model with 20 clusters, using the same init and random_state
# as the WCSS sweep. NOTE(review): 20 is presumably read off the elbow plot;
# confirm.
kmeans = KMeans(n_clusters = 20, init = "k-means++", random_state = 42)
# Fit on `matrix` and keep the cluster label assigned to each row
y_kmeans = kmeans.fit_predict(matrix)