In [None]:
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS

In [None]:
!pip install gensim


In [None]:
# Load the worksheet 'Sheet1' of the input workbook into a DataFrame.
data = pd.read_excel('for_unsupervised.xlsx', sheet_name='Sheet1')
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Preprocessing: tokenize and clean the text
def preprocess(text):
    """Tokenize one document and drop stop words.

    Parameters
    ----------
    text : str, bytes, or scalar cell value
        Raw document text. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as an empty document; any other non-text scalar is converted
        with ``str()`` before tokenizing.

    Returns
    -------
    list of str
        Tokens produced by ``gensim.utils.simple_preprocess`` that are not
        members of gensim's ``STOPWORDS``.
    """
    if not isinstance(text, (str, bytes)):
        # Blank spreadsheet cells typically arrive as NaN (a float), which
        # simple_preprocess cannot decode; map them to an empty token list
        # instead of letting one bad row abort the whole column.
        if text is None or pd.isna(text):
            return []
        text = str(text)
    return [word for word in gensim.utils.simple_preprocess(text) if word not in STOPWORDS]

In [None]:
# Run every short description through preprocess() and keep the resulting
# token lists in a new column.
short_descriptions = data['inc_short_description']
data['processed_text'] = short_descriptions.apply(preprocess)

In [None]:
# Build the gensim Dictionary (the vocabulary) from the tokenized documents
tokenized_docs = data['processed_text']
dictionary = corpora.Dictionary(tokenized_docs)

In [None]:
# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(text) for text in data['processed_text']]
# Preview a few documents only; echoing the bare list would print the
# bag-of-words of every single document into the cell output.
corpus[:5]

In [None]:
# Automatic determination of the number of topics using coherence score.
# One LDA model is trained per candidate topic count and scored with the
# 'c_v' coherence measure; results are collected as (num_topics, score) pairs.
coherence_scores = []
for num_topics in range(15, 60):  # Candidate topic counts: 15 through 59
    # A fixed random_state makes training repeatable, so the scores can be
    # compared between candidates and between notebook runs.
    lda_model = gensim.models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=15,
        random_state=42,
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=data['processed_text'],
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_score = coherence_model_lda.get_coherence()
    coherence_scores.append((num_topics, coherence_score))

In [None]:
# Pick the (num_topics, score) pair with the largest score and keep its
# topic count.
best_entry = max(coherence_scores, key=lambda entry: entry[1])
optimal_num_topics = best_entry[0]

In [None]:
# Train the final LDA model with the optimal number of topics.
# random_state is fixed so that re-running the notebook reproduces the same
# topics instead of a different random initialisation each time.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=optimal_num_topics,
    passes=15,
    random_state=42,
)

In [None]:
# Print the topics and their top words.
# num_topics=-1 asks for every topic; the default limit of 20 would show only
# a subset whenever the selected topic count is larger than that.
topics = lda_model.print_topics(num_topics=-1, num_words=5)
for topic in topics:
    print(topic)

In [None]:
# Query the trained model for the topic distribution of each document and
# store the results, in corpus order, as a new column.
doc_topic_distributions = []
for bow in corpus:
    doc_topic_distributions.append(lda_model.get_document_topics(bow))
data['topic_distribution'] = doc_topic_distributions

In [None]:
# Render each document's (topic, probability) pairs as one readable string,
# e.g. "Topic 3: 0.42, Topic 7: 0.31".
topic_summaries = []
for doc_topics in data['topic_distribution']:
    parts = [f"Topic {topic}: {prob:.2f}" for topic, prob in doc_topics]
    topic_summaries.append(", ".join(parts))
data['topics'] = topic_summaries

In [None]:
# Write the annotated frame to disk as CSV, leaving out the index column.
results_path = 'topic_results.csv'
data.to_csv(results_path, index=False)

In [None]:
import pickle

In [None]:
# Serialize the trained LDA model to disk with pickle.
with open('lda_model.pickle', mode='wb') as model_file:
    pickle.Pickler(model_file).dump(lda_model)