In [2]:
import pandas as pd
import numpy as np
import pickle
import os
import seaborn as sns
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load your dataset
df = pd.read_csv('combined_data.csv')

# Preprocess text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens



In [4]:
# Apply preprocessing
df['processed_texts'] = df['content_text'].apply(preprocess)
prompt_id = 1
# Separate training data based on prompt
train_df = df[df['prompt_id'] != prompt_id]

# Create Dictionary and Corpus
dictionary = corpora.Dictionary(train_df['processed_texts'])
corpus = [dictionary.doc2bow(text) for text in train_df['processed_texts']]

# Define number of topics for LDA
num_topics = 8
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42, passes=10)



In [5]:
# Directories for saving models
model_dir = "lda_models"
dict_dir = "dictionaries"
os.makedirs(model_dir, exist_ok=True)
os.makedirs(dict_dir, exist_ok=True)

# Save LDA model and dictionary
lda_model.save(os.path.join(model_dir, f"lda_model_{prompt_id}.model"))
dictionary.save(os.path.join(dict_dir, f"dictionary_{prompt_id}.dict"))

# Step 6: Evaluate Topics using Coherence Model (c_npmi)
coherence_model = CoherenceModel(model=lda_model, texts=train_df['processed_texts'], dictionary=dictionary, coherence='c_npmi')
topic_coherence = coherence_model.get_coherence_per_topic()
overall_coherence = coherence_model.get_coherence()

# Print topic coherence scores and overall coherence score
print(f"Overall Coherence Score: {overall_coherence}")
for i, coherence in enumerate(topic_coherence):
    print(f"Topic {i}: Coherence Score: {coherence}")

# Optionally, save coherence scores for further analysis
coherence_df = pd.DataFrame({
    'Topic': range(len(topic_coherence)),
    'Coherence Score': topic_coherence,
})
coherence_df.to_csv(f'topic_coherence_scores_{prompt_id}.csv', index=False)

# Print the top words for each topic
for i, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {i}: {topic}")



Overall Coherence Score: 0.04858702822274931
Topic 0: Coherence Score: -0.3437732479583802
Topic 1: Coherence Score: 0.09384976373931178
Topic 2: Coherence Score: 0.03084036137283422
Topic 3: Coherence Score: 0.09374108183695715
Topic 4: Coherence Score: 0.20973553643740792
Topic 5: Coherence Score: 0.05996625719599532
Topic 6: Coherence Score: 0.1381717879957849
Topic 7: Coherence Score: 0.10616468516208342
Topic 0: 0.033*"planned" + 0.013*"fixed" + 0.009*"brush" + 0.008*"rapid" + 0.007*"thay" + 0.006*"withstand" + 0.006*"peak" + 0.005*"contributed" + 0.005*"competing" + 0.004*"stunt"
Topic 1: 0.044*"book" + 0.026*"people" + 0.022*"library" + 0.015*"thing" + 0.015*"would" + 0.015*"offensive" + 0.015*"movie" + 0.014*"read" + 0.014*"child" + 0.014*"think"
Topic 2: 0.017*"laughter" + 0.016*"laugh" + 0.014*"would" + 0.014*"friend" + 0.012*"one" + 0.012*"laughing" + 0.012*"time" + 0.011*"make" + 0.010*"u" + 0.010*"people"
Topic 3: 0.033*"patient" + 0.026*"time" + 0.023*"get" + 0.022*"mom" 

In [8]:
# Load the saved LDA model and dictionary
lda_model = LdaModel.load(os.path.join(model_dir, f"lda_model_{prompt_id}.model"))
dictionary = corpora.Dictionary.load(os.path.join(dict_dir, f"dictionary_{prompt_id}.dict"))

# Preprocess the test set
test_df = df
test_df['processed_texts'] = test_df['content_text'].apply(preprocess)

# Obtain topic distribution and coherence for new texts
topic_distributions = []

for idx, text in enumerate(test_df['processed_texts']):
    # Convert text to bag-of-words
    new_bow = dictionary.doc2bow(text)

    # Get the topic distribution for the text
    topic_distribution = lda_model.get_document_topics(new_bow)
    topic_distributions.append(topic_distribution)

    # Display topic distribution and coherence score for each document
    print(f"Document {idx + 1}: Topic Distribution: {topic_distribution}")

# Add topic distribution and coherence scores to the DataFrame
test_df['topic_distribution'] = topic_distributions

# find max of topic_distribution
max_topic = []

for topic in topic_distributions:
    max_topic.append(max(topic, key=lambda x: x[1])[1])

test_df['highest_topic'] = max_topic 

# open hand_crafted_v3.csv
data = pd.read_csv('hand_crafted_v3.csv')

# rename essay_id to item_id in test_df
test_df.rename(columns={'essay_id': 'item_id'}, inplace=True)

# Merge the topic distribution and coherence scores on basis of item_id
merged_df = data.merge(test_df[['item_id', 
                                # 'topic_distribution',
                                'highest_topic']], on='item_id')

# save the dataset to a new CSV file
merged_df.to_csv(f'final_{prompt_id}.csv', index=False)


Document 1: Topic Distribution: [(0, 0.011787203), (1, 0.47840592), (2, 0.3221049), (3, 0.11517618), (5, 0.020931534), (6, 0.028791003), (7, 0.021949777)]
Document 2: Topic Distribution: [(1, 0.42514342), (2, 0.38311508), (3, 0.12342213), (5, 0.044364836), (6, 0.014650813)]
Document 3: Topic Distribution: [(1, 0.4971611), (2, 0.40300524), (4, 0.08086408), (5, 0.01499224)]
Document 4: Topic Distribution: [(1, 0.5673475), (2, 0.24184296), (3, 0.14404812), (4, 0.01863053), (5, 0.021115685)]
Document 5: Topic Distribution: [(1, 0.4823426), (2, 0.2972377), (3, 0.045174614), (5, 0.06716977), (7, 0.09862287)]
Document 6: Topic Distribution: [(1, 0.38544875), (2, 0.27874845), (3, 0.223446), (5, 0.07717723), (6, 0.031286176)]
Document 7: Topic Distribution: [(1, 0.4314093), (2, 0.3511182), (4, 0.11806068), (5, 0.019084724), (6, 0.057921935), (7, 0.013157494)]
Document 8: Topic Distribution: [(1, 0.52254015), (2, 0.35258773), (3, 0.10073132), (6, 0.015425046)]
Document 9: Topic Distribution: [(1