In [None]:
###### Required libraries ######
import pandas as pd
from pprint import pprint
import numpy as np
# Natural language tools
import nltk
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel, LdaModel
from gensim.corpora.dictionary import Dictionary
# Plotting tools
# import pyLDAvis
# import pyLDAvis.gensim  
import matplotlib.pyplot as plt
# %matplotlib inline

In [None]:
###### Read data from the file ######
# Allow up to 800 characters per cell so long entry texts are not cut off when displayed.
pd.options.display.max_colwidth = 800
# NOTE(review): absolute, machine-specific path -- point this at your local copy of the CSV.
path_ee = "/home/clausewitz/Downloads/equal_entries.csv"
# read_csv already returns a DataFrame, so it is not wrapped in pd.DataFrame(...) again.
ddf_EE_LDA = pd.read_csv(path_ee, sep=',')
# Keep only the columns whose name does not start with "Unnamed"
# (presumably leftover index columns from an earlier export -- TODO confirm).
ddf_EE_LDA = ddf_EE_LDA.loc[:, ~ddf_EE_LDA.columns.str.contains('^Unnamed')]

In [None]:
# Strip every standalone word of one to three characters from the entry text.
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would silently leave the text untouched.
ddf_EE_LDA["entry"] = ddf_EE_LDA["entry"].str.replace(r'\b(\w{1,3})\b', '', regex=True)

In [None]:
#ddf_EE_LDA = ddf_EE_LDA.drop(ddf_EE_LDA.columns[[0, 2, 3, 4, 5, 6, 7, 8, 9]], axis=1)
#cols_ee = ddf_EE_LDA.columns.tolist()

In [None]:
# Split each entry on whitespace and keep only the first five characters of every
# token (presumably a crude prefix stemmer -- TODO confirm).
# .map preserves the frame's own index, whereas assigning a freshly indexed
# pd.Series would misalign rows whenever the index is not 0..n-1.
# fillna("") turns missing entries into empty token lists instead of failing on
# a float NaN that has no .split().
ddf_EE_LDA["entry"] = ddf_EE_LDA["entry"].fillna("").map(
    lambda doc: [token[:5] for token in doc.split()]
)

In [None]:
#ddf_EE_LDA = ddf_EE_LDA.drop(ddf_EE_LDA.columns[[8, 9]], axis=1)

In [None]:
##### Inputs for the LDA model: dictionary and bag-of-words corpus. #####
# Tokenised documents, one list of tokens per entry.
texts = ddf_EE_LDA.entry
# Dictionary built from the tokenised entries (token id <-> token).
id2word = corpora.Dictionary(ddf_EE_LDA.entry)
# Bag-of-words corpus: one doc2bow result per document.
corpus = []
for text in ddf_EE_LDA.entry:
    corpus.append(id2word.doc2bow(text))
# Human-readable view of the corpus: (token, frequency) instead of (token id, frequency).
[[(id2word[token_id], freq) for token_id, freq in doc] for doc in corpus]

In [None]:
# MALLET distribution: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# Location of the unpacked MALLET launcher -- update this path for your machine.
mallet_path = '/home/clausewitz/Downloads/mallet-2.0.8/bin/mallet'
# Build a 5-topic model through gensim's MALLET wrapper.
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    num_topics=5,
    id2word=id2word,
)

In [None]:
# Apply the 5-topic model to the corpus and display the result
# (presumably one list of (topic, proportion) pairs per document -- TODO confirm).
ldamallet[corpus]

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various numbers of topics.

    One LdaMallet model is built for every topic count in
    ``range(start, limit, step)`` and scored with a ``c_v`` CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary, used both to build and to score each model
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics

    Notes:
    -----
    Reads the module-level ``mallet_path`` to locate the MALLET launcher.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the `dictionary` argument rather than the notebook-global `id2word`,
        # so the model and its coherence score are based on the same vocabulary.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics,
                                                 id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Slow: builds and scores one model per topic count in range(2, 40, 6).
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=ddf_EE_LDA.entry,
    limit=40,
    start=2,
    step=6,
)

In [None]:
# Show graph
# These values must match the arguments passed to compute_coherence_values,
# otherwise x and coherence_values have different lengths.
limit = 40
start = 2
step = 6
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a real sequence: ("coherence_values") is just a string, so
# matplotlib would iterate over its characters and label the line "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Report the coherence score obtained for each candidate number of topics.
for topic_count, score in zip(x, coherence_values):
    rounded_score = round(score, 8)
    print("Num Topics =", topic_count, " has Coherence Value of", rounded_score)

In [None]:
# Pick the model to keep and show its topics.
# Index 3 is the fourth candidate; with start=2 and step=6 that is the 20-topic model.
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
top_words_per_topic = optimal_model.print_topics(num_words=10)
pprint(top_words_per_topic)

In [None]:
from gensim.test.utils import datapath
# Save model to disk.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
temp_file = datapath("/home/clausewitz/optimal_model")
# Persist the selected model: the target file is named "optimal_model", but the
# previous call saved `ldamallet` (the initial 5-topic model) instead.
optimal_model.save(temp_file)
# Load a potentially pretrained model from disk.
# lda = LdaModel.load(temp_file)

In [None]:
def format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=ddf_EE_LDA.entry):
    """
    Build a table with the dominant topic of every document.

    Parameters:
    ----------
    ldamodel : Topic model; must support ``ldamodel[corpus]`` (yielding, per document,
               (topic_num, proportion) pairs) and ``show_topic(topic_num)``
    corpus : Corpus the model is applied to
    texts : Original documents, appended as the last column of the result

    Returns:
    -------
    sent_topics_df : DataFrame with the columns 'Dominant_Topic', 'Perc_Contribution'
                     (rounded to 4 decimals) and 'Topic_Keywords', followed by the
                     column holding ``texts``
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    # show_topic() depends only on the topic, so look each topic up once.
    keywords_by_topic = {}
    # Collect plain tuples and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and growing a frame row by row is quadratic.
    records = []

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Keep one row per document so the rows stay aligned with `texts`.
            records.append((None, None, None))
            continue
        # Dominant topic = highest proportion; on ties the first such pair wins,
        # exactly as with a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Passing the column names explicitly also yields a valid (empty) frame for an empty corpus.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Build the dominant-topic table for the selected model.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=optimal_model,
    corpus=corpus,
    texts=ddf_EE_LDA.entry,
)

# Turn the index into a document-number column and give every column a readable name.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Entry',
]

# Preview the first five rows.
df_dominant_topic.head(5)

In [None]:
# List the distinct keyword strings in order of first appearance; the topic labels
# assigned in the next cell are matched to this order by position.
df_dominant_topic.Keywords.unique()

In [None]:
# Human-readable label for each topic, listed in the order in which the topics'
# keyword strings first appear in df_dominant_topic['Keywords'].
# NOTE(review): that order depends on the trained model, so these labels are tied
# to one specific training run -- re-check them after retraining.
topic_labels = [
    'Ulaşım', 'X', 'Ekonomi', 'Yemek', 'Y',
    'Siyaset', 'Sosyal Medya', 'Z', 'Haber', 'İlişki',
    'B', 'C', 'Hayvan', 'Türkiye', 'D',
    'Eğitim', 'Sağlık', 'Sanat', 'Futbol', 'G',
]
# unique() is evaluated once instead of once per label. zip() stops at the shorter
# sequence, so fewer than 20 distinct keyword strings no longer raises IndexError.
# Re-running the cell is harmless: labels are then mapped to themselves.
keyword_to_label = dict(zip(df_dominant_topic['Keywords'].unique(), topic_labels))
# Values without a label (more than 20 distinct keyword strings) are left unchanged.
df_dominant_topic['Keywords'] = df_dominant_topic['Keywords'].map(
    lambda keywords: keyword_to_label.get(keywords, keywords)
)

In [None]:
# Display the dominant-topic table after the keyword strings were replaced by labels.
df_dominant_topic

In [None]:
# Keep only the document text and its topic label.
ddf_dominant_topic = df_dominant_topic.loc[:, ['Entry', 'Keywords']]

In [None]:
# Display the two-column (Entry, Keywords) table.
ddf_dominant_topic

In [None]:
# Display the source frame (its entry column now holds token lists).
ddf_EE_LDA

In [None]:
# Place the topic columns next to the source columns, matching rows by index.
df = pd.concat(
    [ddf_EE_LDA, ddf_dominant_topic],
    axis="columns",
    sort=False,
)

In [None]:
# Display the combined frame.
df

In [None]:
# Drop the columns at positions 0, 1, 8 and 9 of the combined frame.
# NOTE(review): positional and not idempotent -- running this cell twice drops further columns.
df = df.drop(columns=df.columns[[0, 1, 8, 9]])

In [None]:
# Display the frame after the positional column drop.
df

In [None]:
# Give the topic and type columns their final, capitalised names in a single call.
df.rename(
    columns={
        'Keywords': 'Topic',
        'type': 'Type',
        'typeClass': 'TypeClass',
    },
    inplace=True,
)

In [None]:
# Display the frame with the renamed columns.
df

In [None]:
# Final column order of the output table.
columnsTitles = ['Entry', 'Topic', 'Type', 'TypeClass', 'E', 'S', 'T', 'J']
df = df.reindex(columnsTitles, axis="columns")

In [None]:
# Display the final table in its reordered column layout.
df