In [1]:
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS

In [2]:
!pip install gensim




In [3]:
data = pd.read_excel('for_unsupervised.xlsx', sheet_name='Sheet1')
data

Unnamed: 0,inc_number,inc_short_description,inc_close_notes,class
0,INC11357102,I can't access hqchvmccapp01.nespresso.com ï¼Œ...,undefined,
1,INC11478672,PC1 - NesSoft Incident Request Form,undefined,
2,INC11406569,EUR RTF PREPROD Service is down.,Closure: We have restarted the server as per t...,
3,INC11546163,[L2] - Incident - Sprinklr - Nestle Global - U...,L3 collaboration_x000D_\n_x000D_\n1. Initial L...,
4,INC11445939,[L2] - Incident - Sprinklr - Nestle Global - ...,Able to reproduce the issue? Yes _x000D_\n_x00...,
...,...,...,...,...
54075,INC12187966,HO - EgyÃ©b kÃ©rÃ©s [Other request],"hello,_x000D_\nvan mÃ¡r rÃ¡ tikett, mert ez eg...",
54076,INC11994052,HO - KÃ¶nyvtÃ¡r hozzÃ¡fÃ©rÃ©s vagy tulajdonos ...,"hello,_x000D_\n_x000D_\nebben nem az IT az ill...",
54077,INC12098344,I don't have access to Spaceman program,"hello,_x000D_\n_x000D_\nEbben nem az IT tud ne...",
54078,INC12033225,CH-QPM-First login,"Dear Arbjana, we have reset your password and ...",


In [4]:
# Preprocessing: tokenize and clean the text
def preprocess(text):
    return [word for word in gensim.utils.simple_preprocess(text) if word not in STOPWORDS]

In [5]:
data['processed_text'] = data['inc_short_description'].apply(preprocess)

In [6]:
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(data['processed_text'])

In [7]:
# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(text) for text in data['processed_text']]
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1)],
 [(7, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 2),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1)],
 [(7, 1), (19, 1), (22, 1), (24, 1), (26, 1), (27, 1), (28, 1), (29, 1)],
 [(4, 1),
  (7, 1),
  (24, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(7, 1),
  (19, 1),
  (22, 1),
  (24, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1)],
 [(7, 1),
  (18, 1),
  (19, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (39, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 2)],
 [(7, 1),
  (19, 1),
  (22, 1),
  (24, 1),
  (25, 1),
  (41, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1)],
 [(55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1)],
 [(62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
 

In [12]:
# Automatic determination of the number of topics using coherence score
coherence_scores = []
for num_topics in range(15, 60):  # Try a range of possible topics
    lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=15)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data['processed_text'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    coherence_scores.append((num_topics, coherence_score))

In [9]:
# Find the number of topics with the highest coherence score
optimal_num_topics = max(coherence_scores, key=lambda x: x[1])[0]

In [10]:
# Train the final LDA model with the optimal number of topics
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=optimal_num_topics, passes=15)

In [11]:
# Print the topics and their top words
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(30, '0.412*"creation" + 0.104*"mx" + 0.083*"roles" + 0.060*"sales" + 0.035*"adm"')
(42, '0.288*"sfat" + 0.123*"wrong" + 0.066*"load" + 0.031*"tax" + 0.025*"rd"')
(19, '0.317*"asset" + 0.149*"tag" + 0.069*"assign" + 0.068*"information" + 0.053*"edit"')
(50, '0.108*"problem" + 0.082*"digital" + 0.079*"bar" + 0.078*"uki" + 0.073*"demandhub"')
(12, '0.242*"nestle" + 0.172*"com" + 0.084*"group" + 0.080*"able" + 0.072*"code"')
(9, '0.098*"orders" + 0.090*"payment" + 0.060*"vnhcm" + 0.046*"aoa" + 0.043*"il"')
(29, '0.232*"error" + 0.195*"ho" + 0.094*"nespresso" + 0.081*"report" + 0.066*"prod"')
(24, '0.275*"windows" + 0.146*"id" + 0.126*"file" + 0.043*"au" + 0.039*"pod"')
(44, '0.400*"application" + 0.122*"shopping" + 0.040*"restore" + 0.039*"blocked" + 0.034*"lost"')
(18, '0.110*"dsp" + 0.105*"app" + 0.103*"demand" + 0.085*"bw" + 0.061*"tp"')
(32, '0.141*"access" + 0.139*"fã" + 0.138*"hozzã" + 0.070*"docusign" + 0.054*"rã"')
(49, '0.161*"ams" + 0.090*"date" + 0.070*"job" + 0.062*"close" + 0

In [13]:
data['topic_distribution'] = [lda_model.get_document_topics(doc) for doc in corpus]

In [14]:
# Convert the topic distribution to a more readable format
data['topics'] = [", ".join([f"Topic {topic}: {prob:.2f}" for topic, prob in doc_topics]) for doc_topics in data['topic_distribution']]

In [15]:
# Save the results to a CSV file
data.to_csv('topic_results.csv', index=False)

In [16]:
import pickle

In [17]:
with open('lda_model.pickle','wb') as file:
    pickle.dump(lda_model, file)