In [1]:
import os
import re
from collections import Counter
from functools import lru_cache

import gensim
import gensim.corpora as corpora
import nltk
import numpy as np
from gensim.models import TfidfModel
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Fetch the NLTK resources this notebook relies on: the tokenizer models for
# word_tokenize, the English stopword list, WordNet for the lemmatizer, and
# the perceptron tagger for nltk.pos_tag.
# Recent NLTK releases look the tokenizer and tagger up under the names
# "punkt_tab" and "averaged_perceptron_tagger_eng", so both the legacy and
# the current names are requested; resources already present are skipped.
for resource in (
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kfang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kfang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kfang\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kfang\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Switch the working directory so that the relative corpus file name opened
# later in the notebook resolves against this folder.
# NOTE(review): the path looks like a placeholder ("*link to the file*") —
# replace it with the real folder before running.
os.chdir("C:\\Users\\*link to the file*")

In [None]:
# Corpus file for one decade; run the notebook separately for each decade's file.
file_path = "Race_2020s.txt"
# Read the whole file into a single string.
with open(file_path, encoding="utf-8") as f:
    text = f.read()

In [None]:
# Shared NLTK helpers used by the preprocessing step.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# One document per non-blank line of the input, with surrounding whitespace removed.
paragraphs = list(filter(None, map(str.strip, text.split("\n"))))

In [None]:
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects the WordNet category (J -> adjective,
    N -> noun, V -> verb, R -> adverb). Any other tag falls back to noun.

    Results are memoised: the word is tagged in isolation, so the answer
    depends only on the word itself and repeated tokens need not be
    tagged again.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [None]:
def preprocess(doc):
    """Turn one raw document into a list of lemmatized tokens.

    The text is lower-cased, every character other than a-z and whitespace
    is deleted, and the remainder is tokenized. Stopwords and tokens of two
    characters or fewer are discarded; each surviving token is lemmatized
    using the POS reported by ``get_wordnet_pos``.
    """
    letters_only = re.sub(r'[^a-z\s]', '', doc.lower())
    lemmas = []
    for token in word_tokenize(letters_only):
        # Skip very short tokens and stopwords before lemmatizing.
        if len(token) <= 2 or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return lemmas

In [None]:
# Tokenize and lemmatize every paragraph, then index the resulting vocabulary.
texts = list(map(preprocess, paragraphs))
dictionary = corpora.Dictionary(texts)
# One bag-of-words vector per paragraph.
corpus = list(map(dictionary.doc2bow, texts))
# TF-IDF weighting fitted on the bag-of-words corpus.
tfidf = TfidfModel(corpus, id2word=dictionary)

In [None]:
# Collect, for every term id, the TF-IDF weight it receives in each document
# where it appears.
word_tfidf = {}
for doc in corpus:
    for word_id, weight in tfidf[doc]:
        # Append in place so the collection stays linear in the number of
        # (document, term) pairs instead of copying the list on every hit.
        word_tfidf.setdefault(word_id, []).append(weight)

In [None]:
# Mean TF-IDF weight per term id, over the documents in which the term occurs.
word_avg_tfidf = {word_id: sum(vals) / len(vals) for word_id, vals in word_tfidf.items()}
scores = list(word_avg_tfidf.values())
if not scores:
    raise ValueError("No TF-IDF scores were collected; is the corpus empty?")
# Cut-off at the 5% position of the sorted averages; terms scoring below it are dropped.
low_thresh = sorted(scores)[int(len(scores) * 0.05)]
good_ids = {word_id for word_id, score in word_avg_tfidf.items()
            if low_thresh <= score}
# Translate the surviving ids back to tokens through the dictionary the TF-IDF
# model was built with. The global `dictionary` is rebound further down, so
# looking ids up there would break (or silently mis-filter) if this cell is
# run a second time.
good_tokens = {tfidf.id2word[word_id] for word_id in good_ids}
filtered_texts = [[w for w in text if w in good_tokens] for text in texts]
# Rebuild the vocabulary and the bag-of-words corpus from the filtered texts.
dictionary = corpora.Dictionary(filtered_texts)
corpus = [dictionary.doc2bow(text) for text in filtered_texts]

In [None]:
# Fit the LDA topic model on the filtered bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=12,  # value obtained from statistical testing; changes by decade
    passes=10,
    alpha='auto',
    random_state=42,  # fixed seed
)

In [None]:
# Print the 15 highest-weighted terms of every topic.
# num_topics=-1 requests all topics: print_topics otherwise defaults to 20,
# which would silently show only a subset for a model with more topics.
for idx, topic in lda_model.print_topics(num_topics=-1, num_words=15):
    print(f"Topic {idx}: {topic}")

In [None]:
# Sum each topic's probability over all documents.
topic_totals = np.zeros(lda_model.num_topics)
for doc_bow in corpus:
    # Accumulate by topic id rather than by list position, so a topic missing
    # from a document's result simply contributes nothing.
    for topic_id, prob in lda_model.get_document_topics(doc_bow, minimum_probability=0):
        topic_totals[topic_id] += prob

In [None]:
# Convert the summed topic probabilities into an average share per document,
# expressed as a percentage.
# NOTE(review): len(corpus) also counts documents whose token list ended up
# empty after preprocessing/filtering — confirm that is intended.
topic_percentages = topic_totals / len(corpus) * 100
print("\n--- Topic Distribution (Probabilistic Shares) ---")
# One line per topic, rounded to two decimals.
for topic_id, perc in enumerate(topic_percentages):
    print(f"Topic {topic_id}: {perc:.2f}% of corpus")