In [None]:
# Lesson from Intro to Cultural Analytics - https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/08-Topic-Modeling-Text-Files.html

In [None]:
import os

# Location of the MALLET executable. Set the MALLET_PATH environment variable to
# point at your own installation; when it is unset, the original hard-coded path
# is used, so existing setups keep working unchanged.
path_to_mallet = os.environ.get("MALLET_PATH", "/Users/jeriwieringa/mallet-2.0.8/bin/mallet")

In [None]:
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path

In [None]:
# Relative path of the folder whose .txt files are loaded as the corpus.
directory = "../data/Example_texts/history/NYT-Obituaries/"

In [None]:
# Collect every .txt file in the corpus folder. glob returns matches in an
# arbitrary, OS-dependent order, so sort them: every later list (processed
# texts, original texts, titles) is built from `files`, and positional lookups
# are only repeatable when this order is stable.
files = sorted(glob.glob(f"{directory}/*.txt"))

In [None]:
# Display the list of matched file paths as the cell output.
files

In [None]:
# Print the built-in help text for the little_mallet_wrapper module.
help(little_mallet_wrapper)

In [None]:
# Print the built-in help text for the standard-library glob module.
help(glob)

In [None]:
# pandas is imported here only to look up its help text; no other cell in
# this notebook references it.
import pandas
help(pandas.DataFrame)

Documentation for Little Mallet Wrapper - https://github.com/maria-antoniak/little-mallet-wrapper

In [None]:
# Read each obituary as UTF-8 and run it through little_mallet_wrapper's
# process_string (with numbers="remove"); the results stay in the same order
# as `files`.
training_data = [
    little_mallet_wrapper.process_string(
        Path(file).read_text(encoding="utf-8"),
        numbers="remove",
    )
    for file in files
]

In [None]:
# Keep the unprocessed text of every obituary, in the same order as `files`,
# so the original wording can be shown later.
original_texts = [Path(file).read_text(encoding="utf-8") for file in files]

In [None]:
# Use each file name without its directory or extension as the obituary title.
obit_titles = [Path(file).stem for file in files]

In [None]:
# Display the list of obituary titles as the cell output.
obit_titles

# Create the Model of Topics

In [None]:
# Have little_mallet_wrapper print its dataset statistics for the processed documents.
little_mallet_wrapper.print_dataset_stats(training_data)

In [None]:
# Number of topics the model is asked to find; it is also embedded in the
# names of the output files.
num_topics = 15

In [None]:
# Folder that receives the topic-model output files and the heatmap PDF.
output_directory_path = '../data/outputs/topic-model-output/NYT-Obit'

In [None]:
# First create the output directory if it does not already exist
# (parents=True also creates missing parent folders; exist_ok=True makes re-runs safe).

Path(f"{output_directory_path}").mkdir(parents=True, exist_ok=True)

# Then create paths for all of the files Mallet will output.
# The model, topic-key and topic-distribution names end in the topic count.
# NOTE(review): presumably these names must match what quick_train_topic_model
# writes into the output directory — confirm against the library.

path_to_training_data = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model = f"{output_directory_path}/mallet.model.{str(num_topics)}"
path_to_topic_keys = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}"
path_to_topic_distributions = f"{output_directory_path}/mallet.topic_distributions.{str(num_topics)}"

In [None]:
# Train the topic model, passing the MALLET executable path, the output folder,
# the number of topics and the processed documents.
# NOTE(review): presumably this runs MALLET and writes its result files into
# output_directory_path — confirm against the little_mallet_wrapper docs.
little_mallet_wrapper.quick_train_topic_model(path_to_mallet, 
                                             output_directory_path,
                                             num_topics,
                                             training_data)

# View Results

In [None]:
# Load the topic keys from the file at path_to_topic_keys. Later cells index
# `topics` by topic number and slice each entry.
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

In [None]:
# Print every topic, labelled with its zero-based position in `topics`.
for topic_number, topic in enumerate(topics):
    print(f"🌟 Topic {topic_number} 🌟\n\n{topic}\n")

In [None]:
# Load the topic distributions from the file at path_to_topic_distributions.
# Later cells index the result by document position.
# NOTE(review): presumably entries are in the same order as training_data — confirm.
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)

In [None]:
# Display the topic distribution stored at position 32 as an example.
# NOTE(review): this assumes the corpus has at least 33 documents.
topic_distributions[32]

In [None]:
# Title (file name without extension) of the obituary to inspect.
obituary_to_check = "1962-Marilyn-Monroe"

# Position of that title in obit_titles; list.index raises ValueError if the
# title is not present.
obit_number = obit_titles.index(obituary_to_check)

In [None]:
# Print the selected obituary's title, then pair each topic with the matching
# value from that obituary's distribution. Only the first six entries of each
# topic are shown, and the value is rounded to three decimal places.
print(f"Topic Distributions for {obit_titles[obit_number]}\n")
for topic_number, (topic, topic_distribution) in enumerate(zip(topics, topic_distributions[obit_number])):
    print(f"🌟Topic {topic_number} {topic[:6]} 🌟\nProbability: {round(topic_distribution, 3)}\n")

# Visualize Results

In [None]:
import random

# Pick up to 10 obituary titles to show in the heatmap.
# - A dedicated, seeded generator makes the selection repeatable for a given
#   ordering of obit_titles and leaves the global random state untouched.
# - Capping the sample size at len(obit_titles) avoids the ValueError that
#   random.sample raises when asked for more items than the list holds.
target_labels = random.Random(42).sample(obit_titles, min(10, len(obit_titles)))

In [None]:
# Draw the heatmap with the obituary titles as labels, restricted to the
# sampled target_labels, and write it to a PDF inside the output directory.
# NOTE(review): `dim` is presumably the figure size — confirm against the
# little_mallet_wrapper docs.
little_mallet_wrapper.plot_categories_by_topics_heatmap(obit_titles,
                                                       topic_distributions,
                                                       topics,
                                                       output_directory_path + '/categories_by_topics.pdf',
                                                       target_labels=target_labels,
                                                       dim= (13,9)
                                                       )

# Display top titles per topic

In [None]:
# Lookup tables keyed by each document's processed text, mapping it to its
# title and to its original text. If two documents have identical processed
# text, the later one wins.
training_data_obit_titles = {
    processed: title for processed, title in zip(training_data, obit_titles)
}
training_data_original_text = {
    processed: raw for processed, raw in zip(training_data, original_texts)
}

In [None]:
def display_top_titles_per_topic(topic_number=0, number_of_documents=5):
    """Print one topic followed by the titles of its top documents.

    Args:
        topic_number: Index into the notebook-level ``topics``.
        number_of_documents: Passed to ``get_top_docs`` as ``n``.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"🌟Topic {topic_number}🌟\n\n{topics[topic_number]}\n")

    top_documents = little_mallet_wrapper.get_top_docs(
        training_data, topic_distributions, topic_number, n=number_of_documents
    )
    for score, processed_doc in top_documents:
        # The processed text is the key used to find the matching title.
        title = training_data_obit_titles[processed_doc]
        print(round(score, 4), title + "\n")

In [None]:
# Show the five top-ranked obituary titles for topic 0.
display_top_titles_per_topic(topic_number=0, number_of_documents=5)

What would you label this topic?

# Display Topic Words in Context

In [None]:
from IPython.display import Markdown, display
import re

def _bold_words(text, words):
    """Return ``text`` with whole-word occurrences of each word wrapped in ``**``.

    Matching is literal and case-sensitive. ``re.escape`` stops characters such
    as ``(`` or ``+`` in a word from being read as regex syntax.
    """
    for word in words:
        if word in text:
            pattern = rf"\b{re.escape(word)}\b"
            # A callable replacement inserts the matched text verbatim, with no
            # backslash-escape processing of the word.
            text = re.sub(pattern, lambda match: f"**{match.group(0)}**", text)
    return text


def display_bolded_topic_words_in_context(topics=None, topic_number=3, number_of_documents=3, custom_words=None):
    """Render a topic's top documents as Markdown with the topic words in bold.

    Prints the topic, then for each top document displays its probability, its
    title and its original text with the chosen words bolded.

    Args:
        topics: Sequence indexed by topic number. When None (the default), the
            notebook-level ``topics`` is looked up at call time, so re-loading
            the topic keys does not leave this function using stale ones.
        topic_number: Index of the topic to show.
        number_of_documents: Passed to ``get_top_docs`` as ``n``.
        custom_words: Optional words to bold instead of the topic's own words.

    Returns:
        None. Output goes to standard output and the notebook display.
    """
    if topics is None:
        topics = globals()["topics"]

    print(f"🌟Topic {topic_number}🌟\n\n{topics[topic_number]}\n")

    # The words to bold are the same for every document, so pick them once.
    topic_words = topics[topic_number] if custom_words is None else custom_words

    top_documents = little_mallet_wrapper.get_top_docs(
        training_data, topic_distributions, topic_number, n=number_of_documents
    )
    for probability, document in top_documents:
        heading = f"🌟🌟🌟\n\n**{probability}**"
        obit_title = f"**{training_data_obit_titles[document]}**"
        bolded_text = _bold_words(training_data_original_text[document], topic_words)

        display(Markdown(heading), Markdown(obit_title), Markdown(bolded_text))

In [None]:
# Show the three top-ranked obituaries for topic 3 with its topic words in bold.
display_bolded_topic_words_in_context(topic_number=3, number_of_documents=3)