In [39]:
import pandas as pd
import gensim
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import spacy

In [30]:
# Load the data
data = pd.read_csv('../data/cleaned_pyt_posts.csv')
data.head()

Unnamed: 0,Id,PostTypeId,ParentId,Title,Body,Score,Tags,ViewCount,FavoriteCount
0,34750268,1,,Extracting top k value indices 1 Tensor,p Given 1 tensor Torch containing values compa...,9,python lua pytorch torch,12840 0,0 0
1,38543850,1,,Display Custom Images Tensorboard e g Matplotl...,p href github com tensorflow tensorflow blob m...,40,python tensorflow matplotlib pytorch tensorboard,42232 0,0 0
2,41461670,1,,cudnnRNNForwardTraining seqLength xDesc usage,p Let say N sequences x length seqLength 0 lt ...,7,cudnn,842 0,0 0
3,41767005,1,,Python wheels cp27mu supported,p trying install pytorch href http pytorch org...,11,python linux unicode pytorch,5358 0,0 0
4,41818618,1,,PyTorch import installing Anaconda,p installed PyTorch installing Anaconda run iP...,0,macos python 3 x ipython anaconda pytorch,1593 0,0 0


In [41]:
# Create bigram and trigram models
bigram = Phrases(data['Body'], min_count=5, threshold=100)
trigram = Phrases(bigram[data['Body']], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)


In [40]:
# Function to preprocess text with bigrams, trigrams, lemmatization, and tokenization
def preprocess(text):
    # Tokenize text
    tokens = simple_preprocess(text, deacc=True, min_len=2)
    
    # Apply bigram and trigram models
    tokens = bigram_mod[tokens]
    tokens = trigram_mod[bigram_mod[tokens]]
    
    # Perform lemmatization
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    tokens = [token.lemma_ for token in nlp(" ".join(tokens))]
    
    return tokens

In [42]:
# Preprocess text data
data['preprocessed_text'] = data['Body'].apply(preprocess)

# Create Dictionary
id2word = Dictionary(data['preprocessed_text'])

# Create Corpus
corpus = [id2word.doc2bow(text) for text in data['preprocessed_text']]

# Define a range of topic numbers to try
topic_nums = [5, 10, 15, 20]

In [43]:
# Perform LDA modeling with different topic numbers
results = []
lda_models = []

for num_topics in topic_nums:
    # Build LDA model
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       update_every=1,
                                       chunksize=100,
                                       passes=10,
                                       alpha='auto',
                                       per_word_topics=True)
    lda_models.append(lda_model)  # Store the LDA model for later use

    # Compute Perplexity
    perplexity = lda_model.log_perplexity(corpus)
    print('Number of Topics:', num_topics)
    print('Perplexity:', perplexity)

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data['preprocessed_text'],
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print('Coherence Score:', coherence_score)

    # Save the topics to a results list
    topics = lda_model.show_topics(num_topics=num_topics, num_words=10)
    topic_words = [topic[1] for topic in topics]
    results.append({'Num Topics': num_topics, 'Perplexity': perplexity, 'Coherence Score': coherence_score, 'Topic Words': topic_words})

# Save the results to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv('lda_results.csv', index=False)

# Generate visualization for the best LDA model
best_model = lda_models[results_df['Perplexity'].idxmin()]
vis_data = gensimvis.prepare(best_model, corpus, id2word)
pyLDAvis.save_html(vis_data, 'lda_visualization.html')

Number of Topics: 5
Perplexity: -6.825232996997735
Coherence Score: 0.5114340442808794
Number of Topics: 10
Perplexity: -7.138583534076263
Coherence Score: 0.4533789496994496
Number of Topics: 15
Perplexity: -9.604643100397874
Coherence Score: 0.43103740463052254
Number of Topics: 20
Perplexity: -10.987087707233444
Coherence Score: 0.4431947061613991
