In [None]:
import os


In [None]:
# Record where the MALLET launcher lives and expose the install directory
# through the "mallet_home" environment variable.

path_to_mallet = r"C:\mallet\bin\mallet"
os.environ["mallet_home"] = r"C:\mallet"

In [None]:
import glob
import random
from pathlib import Path

import little_mallet_wrapper
import matplotlib.pyplot as plt
import pandas as pd
import seaborn
from wordcloud import WordCloud


In [None]:
# Locations of the two input CSV files: `csv` is read into clean_df and
# `csv2` into df_first in the cells below.

csv2 = "C:/Users/laram/Downloads/finaledit.csv"
csv = "C:/Users/laram/Downloads/cleaned_wiredfull.csv"



In [None]:
# Load the CSV at `csv2`; its text_content column is copied into clean_df further down.
df_first = pd.read_csv(csv2)


In [None]:
# Load the CSV at `csv` into the main working DataFrame.
clean_df = pd.read_csv(csv)


In [None]:
# Preview the first rows of clean_df.
clean_df.head()

In [None]:
# Replace clean_df's text_content column with the one from df_first.
# NOTE(review): this presumably relies on both CSVs holding the same rows in the
# same order (the assignment pairs rows by index) — confirm.
clean_df['text_content'] = df_first['text_content']


In [None]:
# Cast every text_content value to str before it is passed to process_string.
# NOTE(review): missing values would presumably turn into the literal text 'nan'
# here — confirm the column has no nulls.
clean_df['text_content'] = clean_df['text_content'].astype(str)


In [None]:
# Run little_mallet_wrapper.process_string over every article (numbers='remove')

training_data = [
    little_mallet_wrapper.process_string(raw_text, numbers='remove')
    for raw_text in clean_df['text_content']
]


In [None]:
# Unprocessed article texts, in the same row order as training_data
original_texts = list(clean_df['text_content'])


In [None]:
# Article titles, in the same row order as training_data
clean_titles = list(clean_df['title'])


In [None]:
# Rows without a tag get the placeholder label 'Unlabelled'

clean_df['tag'] = clean_df['tag'].fillna(value='Unlabelled')


In [None]:
# Tag of every article, in the same row order as training_data
clean_tags = list(clean_df['tag'])

In [None]:
# Print little_mallet_wrapper's summary statistics for the processed documents.
little_mallet_wrapper.print_dataset_stats(training_data)


In [None]:
# Set number of topics K
# NOTE(review): later cells read model files with other suffixes (.25, .15) and
# reassign num_topics — confirm which K each analysis is meant to use.

num_topics = 40


In [None]:
# NOTE(review): self-assignment — this statement has no effect and the cell can be removed.
training_data = training_data


In [None]:
# Directory for the model files; created (with parents) if it does not exist yet
output_directory_path = "C:/Users/laram/Downloads/ldamodel"
Path(output_directory_path).mkdir(parents=True, exist_ok=True)

# File paths inside that directory; the last three carry num_topics as a suffix
path_to_training_data = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model = f"{output_directory_path}/mallet.model.{num_topics}"
path_to_topic_keys = f"{output_directory_path}/mallet.topic_keys.{num_topics}"
path_to_topic_distributions = f"{output_directory_path}/mallet.topic_distributions.{num_topics}"

In [None]:
# Fit the LDA model on training_data with num_topics topics via MALLET

little_mallet_wrapper.quick_train_topic_model(
    path_to_mallet, output_directory_path, num_topics, training_data
)

In [None]:
# Read the key words of every topic and tabulate them

pd.set_option('display.max_colwidth', None)  # show the full key list in each row

topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

data = [
    {"Topic Number": idx, "Topic Keys": keys}
    for idx, keys in enumerate(topics)
]
topicdf = pd.DataFrame(data)

print(topicdf.head(40))

# Write the table to disk as CSV
output_path = "C:/Users/laram/Downloads/topickeys_40lda.csv"
topicdf.to_csv(output_path, index=False)



In [None]:
# Load the per-document topic distributions and show the first one

topic_distributions = little_mallet_wrapper.load_topic_distributions(
    path_to_topic_distributions
)

topic_distributions[0]


In [None]:
# For every document, record the topic with the largest share and that share

dominant_topics = []
for doc_id, distribution in enumerate(topic_distributions):
    # A single max over (index, value) pairs yields both the topic and its proportion
    dominant_topic, dominant_proportion = max(
        enumerate(distribution), key=lambda pair: pair[1]
    )
    dominant_topics.append(
        {
            "Document ID": doc_id,
            "Dominant Topic": dominant_topic,
            "Proportion": dominant_proportion,
        }
    )

# Tabulate, print, and export the results
df_dominant_topics = pd.DataFrame(dominant_topics)
print(df_dominant_topics)
df_dominant_topics.to_csv("C:/Users/laram/Downloads/dominant_topics_40lda.csv", index=False)


In [None]:
# Lookup tables keyed by the processed document text.
# Identical processed texts share one key, so the last title/text wins.
training_data_clean_titles = {
    processed: title for processed, title in zip(training_data, clean_titles)
}
training_data_original_text = {
    processed: original for processed, original in zip(training_data, original_texts)
}

In [None]:
# Load the topic-word probabilities from a MALLET word-weights file.
# NOTE(review): this reads the ".25" file, while the model trained above uses
# num_topics = 40 — confirm which model this analysis is meant to use.

topic_word_probability_dict = little_mallet_wrapper.load_topic_word_distributions("C:/Users/laram/Downloads/ldamodel/mallet.word_weights.25")


In [None]:
# Print the five most probable words of every topic, highest probability first

for _topic, _word_probability_dict in topic_word_probability_dict.items():
    print(f"Topic {_topic}")
    for _word, _probability in sorted(
        _word_probability_dict.items(), key=lambda item: item[1], reverse=True
    )[:5]:
        print(f"{round(_probability, 4)} \t {_word}")
    print()

In [None]:
import ipywidgets as widgets
from IPython.display import display
import wordcloud

In [None]:
# Create word clouds from the word probability list

def create_word_cloud(topic_word_probabilities, topic_id,save_directory):
    """Plot a word cloud and the top-20 word list for one topic, then save it.

    Args:
        topic_word_probabilities: Mapping of topic id -> {word: probability}.
        topic_id: Topic to plot. If it is not a key of the mapping, its string
            form is tried as well, so an int id also works when the keys are
            strings.
        save_directory: Directory that receives ``cloud_topic_<topic_id>.png``.

    Returns:
        None. Prints a message and returns early when the topic is unknown.
    """
    # Resolve the lookup key; fall back to the string form of the id.
    if topic_id in topic_word_probabilities:
        topic_key = topic_id
    elif str(topic_id) in topic_word_probabilities:
        topic_key = str(topic_id)
    else:
        print(f"Topic {topic_id} does not exist.")
        return

    word_probabilities = topic_word_probabilities[topic_key]

    # Named `cloud` so it does not shadow the imported `wordcloud` module.
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate_from_frequencies(word_probabilities)

    # Twenty most probable words, highest first
    sorted_word_probabilities = sorted(word_probabilities.items(), key=lambda x: x[1], reverse=True)
    top_20_words_with_frequencies = sorted_word_probabilities[:20]

    # Left panel: word cloud; right panel: text list of the top words
    fig, ax = plt.subplots(1, 2, figsize=(16, 8))

    ax[0].imshow(cloud, interpolation='bilinear')
    ax[0].axis('off')
    ax[0].set_title(f'Word Cloud for Topic {topic_id}')

    ax[1].axis('off')
    ax[1].set_title('Top 20 Words with Frequencies')
    words_with_freq_text = "\n".join([f"{word}: {freq:.4f}" for word, freq in top_20_words_with_frequencies])
    ax[1].text(0.5, 0.5, words_with_freq_text, horizontalalignment='center', verticalalignment='center', fontsize=12)

    # Save before showing, then release the figure so repeated calls do not
    # accumulate open figures.
    save_path = f"{save_directory}/cloud_topic_{topic_id}.png"
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
    plt.close(fig)
    print(f"Word cloud for Topic {topic_id} saved to {save_path}")

In [None]:
# Render and save the word cloud for topic 6

save_directory = "C:/Users/laram/Downloads"
topic_id = 6
create_word_cloud(topic_word_probability_dict, topic_id, save_directory)

In [None]:
# Read the effective-number-of-words score of every topic from a MALLET
# diagnostics XML file and plot it as a bubble chart.

import os
import xml.etree.ElementTree as ET

# Specify the number of topics you're interested in
num_topics = 25 # Example: Choose the number of topics you want to analyze

# Path to your diagnostics directory and the specific diagnostic file for the chosen number of topics
diagnostics_dir = "C:/Users/laram/Downloads/cohmodel"
diagnostic_file = os.path.join(diagnostics_dir, f"diagnostics_{num_topics}.xml")

if not os.path.exists(diagnostic_file):
    print(f"Diagnostic file for {num_topics} topics not found.")
else:
    tree = ET.parse(diagnostic_file)
    root = tree.getroot()

    # Collect (topic id, eff_num_words) pairs from every <topic> element
    topic_scores = []
    for topic in root.findall('.//topic'):
        topic_number = int(topic.get('id'))
        score = float(topic.get('eff_num_words'))
        topic_scores.append((topic_number, score))

    if not topic_scores:
        # An empty list cannot be unzipped into two names; report it instead of crashing
        print(f"No topic entries found in {diagnostic_file}.")
    else:
        # Stored as topic_ids so the `topics` list of topic keys is not overwritten
        topic_ids, scores = zip(*topic_scores)

        # Bubble size and colour both encode the score
        plt.figure(figsize=(10, 6))
        scatter = plt.scatter(topic_ids, scores, s=scores, alpha=0.5, c=scores, cmap='viridis', edgecolors='k')

        # Annotate each circle with its topic number
        for topic_number, score in zip(topic_ids, scores):
            plt.text(topic_number, score, str(topic_number), ha='center', va='center', fontsize=10)

        # The colour scale shows the same eff_num_words values as the y axis
        plt.colorbar(scatter, label='Effective Number of Words Score')
        plt.title(f'Effective Number of Words Score for {num_topics} Topics')
        plt.xlabel('Topic Number')
        plt.ylabel('Effective Number of Words Score')
        plt.grid(True)
        plt.tight_layout()
        save_path = "C:/Users/laram/Downloads/eff_num.png"
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()


In [None]:
# (Re)load the topic key lists from path_to_topic_keys so `topics` holds them for the cells below.
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)


In [None]:
# Quick check: print the key words of the first topic.
print(topics[0])

In [None]:
# Extract the top representative documents for each topic 

def display_top_topics(topic_number, topics, training_data, topic_distributions, training_data_clean_titles, number_of_documents=5):
    """Print and export the most representative documents of one topic.

    Args:
        topic_number: Index of the topic in ``topics``.
        topics: Sequence of key-word lists, one per topic.
        training_data: Processed documents passed to ``get_top_docs``.
        topic_distributions: Per-document topic distributions passed to ``get_top_docs``.
        training_data_clean_titles: Mapping from a processed document to its title.
        number_of_documents: How many top documents to fetch (default 5).

    Side effects:
        Prints the topic keys, each document's probability and title, and the
        resulting table; writes the table to
        ``topic_top_documents<topic_number>.csv`` in the Downloads folder.
    """
    top_documents = little_mallet_wrapper.get_top_docs(
        training_data, topic_distributions, topic_number, n=number_of_documents
    )
    topic_keys = topics[topic_number]

    # Header: topic number and its key words
    print(f"✨Topic {topic_number}✨\n\n{' '.join(topic_keys)}\n")

    # Each entry pairs a probability with the key used to look up the title
    rows = []
    for probability, document in top_documents:
        title = training_data_clean_titles[document]
        rounded_probability = round(probability, 4)
        print(f"Probability: {rounded_probability}")
        print(f"Title: {title}\n")
        rows.append({
            "Topic Number": topic_number,
            "Topic Keys": ", ".join(topic_keys),
            "Probability": rounded_probability,
            "Document Title": title,
        })

    top_docs_df = pd.DataFrame(rows)

    # Show the table and write it to disk
    print(top_docs_df.head(10))
    top_docs_df.to_csv(f"C:/Users/laram/Downloads/topic_top_documents{topic_number}.csv", index=False)



In [None]:
# Set the topic number for analysis 

topic_number_to_analyze = 25 
display_top_topics(topic_number_to_analyze, topics, training_data, topic_distributions, training_data_clean_titles)

In [None]:
# Alias of the currently loaded topic keys.
# NOTE(review): the "_10" suffix suggests a K=10 model, but `topics` is whatever
# was last loaded — confirm it holds the intended model's keys.
topics_10 = topics 

In [None]:
# Make DataFrame for results from K=10 model

def summarize_topics_to_df(topics):
    """Build a table with one row per topic.

    Args:
        topics: Sequence of key-word lists; the position is used as the topic id.

    Returns:
        DataFrame with columns "Topic ID" and "Keywords" (the key words joined
        by ", ").
    """
    rows = [
        {"Topic ID": index, "Keywords": ", ".join(words)}
        for index, words in enumerate(topics)
    ]
    return pd.DataFrame(rows)

# Tabulate the topic keys, print the table, and export it as CSV
summaries_df_10 = summarize_topics_to_df(topics=topics_10)
print(summaries_df_10)
summaries_df_10.to_csv("C:/Users/laram/Downloads/summaries_topics10.csv", index=False)


In [None]:
# Show the five most representative documents for topic 2, using the
# display_top_topics helper defined earlier in this notebook.
display_top_topics(
    2,
    topics,
    training_data,
    topic_distributions,
    training_data_clean_titles,
    number_of_documents=5,
)


In [None]:
# Show the five most representative documents for topic 3, using the
# display_top_topics helper defined earlier in this notebook.
display_top_topics(
    3,
    topics,
    training_data,
    topic_distributions,
    training_data_clean_titles,
    number_of_documents=5,
)


In [None]:
# Create heatmap with tag categories as target labels 

topic_keys_file = r"C:\Users\laram\Downloads\ldamodel\mallet.topic_keys.25"
topic_distributions_file = r"C:\Users\laram\Downloads\ldamodel\mallet.topic_distributions.25"
output_directory_path = r"C:\Users\laram\Downloads"

# Load the topic keys
topics = little_mallet_wrapper.load_topic_keys(topic_keys_file)

# Load the topic distributions
topic_distributions = little_mallet_wrapper.load_topic_distributions(topic_distributions_file)

# Must be assigned before the sanity checks below, which read its length
target_labels = clean_tags

# Sanity checks: sizes of the inputs handed to the heatmap
print(len(topics))
print(len(target_labels))
print(len(topic_distributions))

print("First entry in topic_distributions:", topic_distributions[0])
print("Length of topic_distributions:", len(topic_distributions))

print("First 5 entries in clean_titles:", clean_titles[:5])
print("First 5 entries in target_labels:", target_labels[:5])

# Plot the tag-by-topic heatmap; a failure is reported instead of stopping the notebook
try:
    little_mallet_wrapper.plot_categories_by_topics_heatmap(
        clean_tags,
        topic_distributions,
        topics,
        output_directory_path + r'\categories_by_topics.pdf',
        target_labels=target_labels,
        dim=(18, 8)
    )
except Exception as e:
    print("Error during plotting:", str(e))

In [None]:
# Input files of the 40-topic model and the target image for the topics-over-time plot
output_path = r"C:\Users\laram\Downloads\topics_over_time.png"
topic_distributions_file = r"C:\Users\laram\Downloads\ldamodel\mallet.topic_distributions.40"
topic_keys_file = r"C:\Users\laram\Downloads\ldamodel\mallet.topic_keys.40"

In [None]:
# Load the topic keys from topic_keys_file
topic_keys = little_mallet_wrapper.load_topic_keys(topic_keys_file)

# Load the topic distributions from topic_distributions_file (rebinds topic_distributions)
topic_distributions = little_mallet_wrapper.load_topic_distributions(topic_distributions_file)


In [None]:
# The 'year' column of clean_df, passed as the time values to plot_topics_over_time below.
clean_years = clean_df['year']

In [None]:
# Plot one topic against the values in clean_years and save it to output_path

topic_index = 8  # topic to plot
little_mallet_wrapper.plot_topics_over_time(
    topic_distributions, topic_keys, clean_years, topic_index, output_path=output_path
)

In [None]:
# Plot topic 19 over time and save it under a topic-specific file name
topic_index = 19
output_path = rf"C:\Users\laram\Downloads\topics_over_time_topic_{topic_index}.png"

little_mallet_wrapper.plot_topics_over_time(
    topic_distributions, topic_keys, clean_years, topic_index, output_path=output_path
)

In [None]:
# Plot topic 35 over time and save it under a topic-specific file name
topic_index = 35
output_path = rf"C:\Users\laram\Downloads\topics_over_time_topic_{topic_index}.png"
little_mallet_wrapper.plot_topics_over_time(
    topic_distributions, topic_keys, clean_years, topic_index, output_path=output_path
)

In [None]:
# Extract the topic coherence scores 
import xml.etree.ElementTree as ET

# Load and parse the diagnostics file.
# Forward slashes only: a backslash in front of "Downloads" is an invalid
# escape sequence in a non-raw string literal.
file_path = "C:/Users/laram/Downloads/ldamodel/mallet.diagnostics.15.xml"  # Update with the actual path to your diagnostics file
tree = ET.parse(file_path)
root = tree.getroot()

# Every <topic> element of the diagnostics file.
# NOTE: this rebinds `topics`, which earlier cells use for the topic key lists.
topics = root.findall('.//topic')

# Print the 'coherence' attribute of each topic (None if the attribute is absent)
for topic in topics:
    topic_id = topic.get('id')
    coherence = topic.get('coherence')
    print(f'Topic {topic_id} Coherence: {coherence}')