In [None]:
import pandas as pd
import numpy as np
import pickle
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os

# Read the combined corpus from disk; the text to model lives in the
# 'content_text' column (used further down).
df = pd.read_csv('combined_data.csv')

# Build the preprocessing resources once so preprocess() can reuse them
# for every document. A set gives constant-time stop-word membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in ``stop_words`` are dropped,
    and the survivors are lemmatized with ``lemmatizer``. Stop-word filtering
    is applied to the surface token, before lemmatization.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as an empty document.

    Returns:
        A list of lemmatized tokens; empty when ``text`` is not a string or
        contains no usable tokens.
    """
    # Missing values would otherwise crash on ``.lower()`` and abort the
    # whole DataFrame.apply; an empty token list keeps row alignment intact.
    if not isinstance(text, str):
        return []

    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Tokenize every document; each row of 'processed_texts' holds the token
# list that preprocess() returns for that row's 'content_text'.
df['processed_texts'] = df['content_text'].apply(preprocess)

# Map tokens to integer ids, then convert each document to its
# bag-of-words representation using that id mapping.
dictionary = corpora.Dictionary(df['processed_texts'])
corpus = list(map(dictionary.doc2bow, df['processed_texts']))

# Step 5: Train the LDA model whose topics will be scored for coherence
num_topics = 8  # Adjust to the desired number of topics
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,  # fixed seed
    passes=10,
)

# Save the LDA model
model_dir = "lda_models"
os.makedirs(model_dir, exist_ok=True)
model_filename = os.path.join(model_dir, "lda_model.joblib")
# pickle.dump() requires a writable binary file object, not a path string;
# passing the path raises TypeError and nothing is written. The context
# manager also guarantees the file is flushed and closed.
# NOTE: despite the ".joblib" extension this is a plain pickle -- load it
# with pickle.load() on a file opened in "rb" mode.
with open(model_filename, "wb") as model_file:
    pickle.dump(lda_model, model_file)

# Step 6: Score each topic of the trained model with the 'c_npmi'
# coherence measure, computed against the tokenized documents.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=df['processed_texts'],
    dictionary=dictionary,
    coherence='c_npmi',
)
topic_coherence = coherence_model.get_coherence_per_topic()

# Report one line per topic, in topic-id order
for topic_id, score in enumerate(topic_coherence):
    print(f'Topic {topic_id}: Coherence Score: {score}')

# Optionally, persist the per-topic scores for further analysis
coherence_df = pd.DataFrame({
    'Topic': range(len(topic_coherence)),
    'Coherence Score': topic_coherence,
})
coherence_df.to_csv('topic_coherence_scores.csv', index=False)


Topic 0: Coherence Score: -0.28769577215415804
Topic 1: Coherence Score: 0.09946800852953795
Topic 2: Coherence Score: 0.13557907349238377
Topic 3: Coherence Score: 0.004558081229387434
Topic 4: Coherence Score: -0.201355145250593
Topic 5: Coherence Score: -0.02752123389310678
Topic 6: Coherence Score: -0.36255221341109334
Topic 7: Coherence Score: -0.35314402354417346
Topic 8: Coherence Score: 0.012117381388664656
Topic 9: Coherence Score: 0.016085606927812062
Topic 10: Coherence Score: 0.1366860509401476
Topic 11: Coherence Score: 0.03392122017448562
Topic 12: Coherence Score: 0.0466676233737671
Topic 13: Coherence Score: 0.08795086216816345
Topic 14: Coherence Score: -0.29969265989988564
Topic 15: Coherence Score: -0.34002942632576566
Topic 16: Coherence Score: 0.12389529994687908
Topic 17: Coherence Score: 0.03544586548600742
Topic 18: Coherence Score: 0.07773925314751409
Topic 19: Coherence Score: -0.29768741181923314
