In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%config InlineBackend.figure_format = 'svg'
plt.style.use('fivethirtyeight')
import pickle
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.test.utils import datapath
import pyLDAvis
import pyLDAvis.gensim



In [None]:
# Load the merged DataFrame from its pickle file.
# NOTE(review): pd.read_pickle unpickles arbitrary objects - only load files you trust.
final_merged_df = pd.read_pickle('/Users/kellycoulter/Desktop/PhD_Code_2021/final_merged_df.pkl')

# Mid-cell expression; under IPython's default settings only a cell's last expression is displayed.
final_merged_df

# Tokenised documents, one entry per row
texts = final_merged_df['Cleaned Tokens']

# id2word mapping built from the tokenised documents
vocab = corpora.Dictionary(texts)

# Convert every tokenised document to a bag-of-words vector: doc2bow counts the
# occurrences of each distinct word, maps the word to its integer id and returns
# the result as a sparse vector.
corpus = list(map(vocab.doc2bow, texts))


In [None]:
# Fit one LDA model per topic count k in range(min_k, max_k, intervals), i.e. k = 2..20,
# and store each model's coherence score in the `coherence` dict keyed by k.
min_k, max_k, intervals = 2, 21, 1
coherence = {}

for k in range(min_k, max_k, intervals):
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=vocab,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        per_word_topics=True,
    )

    # No `coherence=` argument is passed, so CoherenceModel's default measure is used.
    coherence_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=vocab).get_coherence()
    coherence[k] = coherence_lda
    print('Coherence Score for {} Topics:'.format(k), coherence_lda)

In [None]:
# Plot the coherence score against the number of topics and save the figure as a PDF.
sm = pd.DataFrame.from_dict(coherence, orient='index', columns=['Coherence'])
sm['Topics'] = sm.index

fig, ax = plt.subplots()
ax.plot(sm['Topics'], sm['Coherence'])
# One tick per evaluated topic count
ax.set_xticks(np.arange(sm['Topics'].min(), sm['Topics'].max() + 1, 1))
# Label the axes so the figure can be read on its own
ax.set_xlabel('Number of topics')
ax.set_ylabel('Coherence')
# bbox_inches='tight' keeps the axis labels inside the saved page
fig.savefig('img/coherence.pdf', bbox_inches='tight')

In [None]:
# Candidate values for alpha: 0.01 up to (but excluding) 1 in steps of 0.3
alphas = list(np.arange(0.01, 1, 0.3))
print('alpha values: ', alphas)

# Candidate values for beta: same grid as alpha
betas = list(np.arange(0.01, 1, 0.3))
# Print the beta grid itself (previously `alphas` was printed under this label)
print('beta values: ', betas)

In [None]:
# 14 topics was selected as the highest coherence score. Fit one 14-topic LDA model for
# every (alpha, beta) pair drawn from `alphas` x `betas` and record its c_v coherence.
n_topics_14 = 14
evaluations_14 = []  # rows of [alpha, beta, coherence score]
for alpha in alphas:
    for beta in betas:
        lda = gensim.models.LdaMulticore(corpus=corpus,
            id2word=vocab,
            num_topics=n_topics_14,
            random_state=100,
            chunksize=100,
            passes=10,
            alpha=alpha,
            eta=beta)
        # Bind the CoherenceModel to its own name: assigning it to `coherence` would
        # overwrite the {num_topics: score} dict that the coherence plot cell reads.
        coherence_model = CoherenceModel(model=lda, texts=texts, dictionary=vocab, coherence='c_v')
        scores = [alpha, beta, coherence_model.get_coherence()]
        evaluations_14.append(scores)
        print("Finished a model, moving onto the next one...")

In [None]:
# Tabulate alpha, beta and coherence for the 14-topic grid search, highest coherence first
evals_14 = pd.DataFrame(data=evaluations_14, columns=['alpha', 'beta', 'coherence'])
ranked_14 = evals_14.sort_values(by='coherence', ascending=False)
print(ranked_14)

In [None]:
# 3D scatter of alpha and beta against coherence score for the 14-topic model, saved as a PDF.
# style.context applies the 'classic' style only inside the block and restores the
# previous rcParams on exit, so no 'classic' settings leak into later cells.
with plt.style.context('classic'):
    fig = plt.figure()
    # Current Matplotlib rejects keyword arguments to Figure.gca();
    # add_subplot(projection='3d') creates the 3D axes instead.
    ax = fig.add_subplot(projection='3d')
    ax.scatter(xs=evals_14['alpha'], ys=evals_14['beta'], zs=evals_14['coherence'],
               marker='o', s=23, color='maroon')
    ax.tick_params(axis='both', which='major', labelsize=8)
    ax.set_xlabel(r'$\alpha$', fontsize=20)
    ax.set_ylabel(r'$\beta$', fontsize=20)
    ax.set_zlabel('Coherence', fontsize=12)
    # Save while the 'classic' style is still active
    plt.savefig('img/parameters14.pdf')

In [None]:
# Refit the 14-topic model with alpha = eta = 0.91 and print every topic.
# NOTE(review): 0.91 / 0.91 is hard-coded; presumably the best pair from the
# 14-topic grid search - confirm against evals_14.
lda = LdaMulticore(
    corpus=corpus,
    id2word=vocab,
    num_topics=n_topics_14,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha=.91,
    eta=.91,
)
for topic_id, topic_terms in lda.print_topics():
    print('TOPIC {} | {}'.format(topic_id, topic_terms), '\n')

In [None]:
# Based on domain knowledge, the 14-topic results above contain junk topics: topics 0, 10, 12 and 13.

In [None]:
# Next, try the second-highest coherence score, which is 11 topics.

In [None]:
# 11 topics was selected as the 2nd highest coherence score. Fit one 11-topic LDA model for
# every (alpha, beta) pair drawn from `alphas` x `betas` and record its c_v coherence.
n_topics_11 = 11
evaluations_11 = []  # rows of [alpha, beta, coherence score]
for alpha in alphas:
    for beta in betas:
        lda = gensim.models.LdaMulticore(corpus=corpus,
            id2word=vocab,
            num_topics=n_topics_11,
            random_state=100,
            chunksize=100,
            passes=10,
            alpha=alpha,
            eta=beta)
        # Bind the CoherenceModel to its own name: assigning it to `coherence` would
        # overwrite the {num_topics: score} dict that the coherence plot cell reads.
        coherence_model = CoherenceModel(model=lda, texts=texts, dictionary=vocab, coherence='c_v')
        scores = [alpha, beta, coherence_model.get_coherence()]
        evaluations_11.append(scores)
        print("Finished a model, moving onto the next one...")

In [None]:
# Tabulate alpha, beta and coherence for the 11-topic grid search, highest coherence first
evals_11 = pd.DataFrame(data=evaluations_11, columns=['alpha', 'beta', 'coherence'])
ranked_11 = evals_11.sort_values(by='coherence', ascending=False)
print(ranked_11)

In [None]:
# 3D scatter of alpha and beta against coherence score for the 11-topic model, saved as a PDF.
# style.context applies the 'classic' style only inside the block and restores the
# previous rcParams on exit, so no 'classic' settings leak into later cells.
with plt.style.context('classic'):
    fig = plt.figure()
    # Current Matplotlib rejects keyword arguments to Figure.gca();
    # add_subplot(projection='3d') creates the 3D axes instead.
    ax = fig.add_subplot(projection='3d')
    ax.scatter(xs=evals_11['alpha'], ys=evals_11['beta'], zs=evals_11['coherence'],
               marker='o', s=23, color='maroon')
    ax.tick_params(axis='both', which='major', labelsize=8)
    ax.set_xlabel(r'$\alpha$', fontsize=20)
    ax.set_ylabel(r'$\beta$', fontsize=20)
    ax.set_zlabel('Coherence', fontsize=12)
    # Save while the 'classic' style is still active
    plt.savefig('img/parameters11.pdf')

In [None]:
# Refit the 11-topic model with alpha = eta = 0.91 and print every topic.
# NOTE(review): 0.91 / 0.91 is hard-coded; presumably the best pair from the
# 11-topic grid search - confirm against evals_11.
lda = LdaMulticore(
    corpus=corpus,
    id2word=vocab,
    num_topics=n_topics_11,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha=.91,
    eta=.91,
)
for topic_id, topic_terms in lda.print_topics():
    print('TOPIC {} | {}'.format(topic_id, topic_terms), '\n')

In [None]:
# Next, try the third-highest coherence score, which is 18 topics.

In [None]:
# 18 topics was selected as the 3rd highest coherence score. Fit one 18-topic LDA model for
# every (alpha, beta) pair drawn from `alphas` x `betas` and record its c_v coherence.
n_topics_18 = 18
evaluations_18 = []  # rows of [alpha, beta, coherence score]
for alpha in alphas:
    for beta in betas:
        lda = gensim.models.LdaMulticore(corpus=corpus,
            id2word=vocab,
            num_topics=n_topics_18,
            random_state=100,
            chunksize=100,
            passes=10,
            alpha=alpha,
            eta=beta)
        # Bind the CoherenceModel to its own name: assigning it to `coherence` would
        # overwrite the {num_topics: score} dict that the coherence plot cell reads.
        coherence_model = CoherenceModel(model=lda, texts=texts, dictionary=vocab, coherence='c_v')
        scores = [alpha, beta, coherence_model.get_coherence()]
        evaluations_18.append(scores)
        print("Finished a model, moving onto the next one...")

In [None]:
# Tabulate alpha, beta and coherence for the 18-topic grid search, highest coherence first
evals_18 = pd.DataFrame(data=evaluations_18, columns=['alpha', 'beta', 'coherence'])
ranked_18 = evals_18.sort_values(by='coherence', ascending=False)
print(ranked_18)

In [None]:
# 3D scatter of alpha and beta against coherence score for the 18-topic model, saved as a PDF.
# style.context applies the 'classic' style only inside the block and restores the
# previous rcParams on exit, so no 'classic' settings leak into later cells.
with plt.style.context('classic'):
    fig = plt.figure()
    # Current Matplotlib rejects keyword arguments to Figure.gca();
    # add_subplot(projection='3d') creates the 3D axes instead.
    ax = fig.add_subplot(projection='3d')
    ax.scatter(xs=evals_18['alpha'], ys=evals_18['beta'], zs=evals_18['coherence'],
               marker='o', s=23, color='maroon')
    ax.tick_params(axis='both', which='major', labelsize=8)
    ax.set_xlabel(r'$\alpha$', fontsize=20)
    ax.set_ylabel(r'$\beta$', fontsize=20)
    ax.set_zlabel('Coherence', fontsize=12)
    # Save while the 'classic' style is still active
    plt.savefig('img/parameters18.pdf')

In [None]:
# Refit the 18-topic model with alpha = eta = 0.91 and print every topic.
# NOTE(review): 0.91 / 0.91 is hard-coded; presumably the best pair from the
# 18-topic grid search - confirm against evals_18.
lda_18 = gensim.models.LdaMulticore(corpus=corpus,
    id2word=vocab,
    num_topics=n_topics_18,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha=.91,
    eta=.91)
# Print the topics of the model fitted just above (lda_18); `lda` still refers to
# the last model left over from the alpha/beta grid-search loop.
for topic in lda_18.print_topics():
    print('TOPIC {} | {}'.format(topic[0], topic[1]), '\n')

In [None]:
# Save the fitted 18-topic model to disk.
# NOTE(review): datapath() comes from gensim.test.utils; with an absolute path it
# presumably returns the path unchanged - confirm, or pass the path string directly.
temp_lda_file = datapath('/Users/kellycoulter/Desktop/PhD_Code_2021/lda_18_model')

# Save lda_18, the model fitted for the final 18-topic run; `lda` is only the last
# model left over from the alpha/beta grid-search loop.
lda_18.save(temp_lda_file)

In [None]:
# Reload the saved 18-topic model from the location it was saved to. The bare relative
# name "lda_18_model" only resolves when the working directory is the save directory.
lda = gensim.models.LdaMulticore.load('/Users/kellycoulter/Desktop/PhD_Code_2021/lda_18_model')

In [None]:
# The 18-topic model was chosen: it seems appropriate based on a qualitative overview of
# how relevant and representative its topics are for crypto.
# Prepare the pyLDAvis visualisation of the model (mds='tsne') and write it to an HTML file.
lda_18_html = 'img/lda_18.html'
vis = pyLDAvis.gensim.prepare(lda, corpus, vocab, mds='tsne')
pyLDAvis.save_html(vis, lda_18_html)

In [None]:
# Alias the bag-of-words corpus under the name used when building the document-topic matrix
data_vectorized = corpus

In [None]:
# Create Document-Topic Matrix
def build_document_topic_matrix(model, bow_corpus):
    """Return a dense (n_docs, model.num_topics) array of per-document topic probabilities.

    model      -- object exposing `num_topics` and
                  `get_document_topics(bow, minimum_probability=...)`
    bow_corpus -- sequence of bag-of-words documents
    Topics that get_document_topics does not return for a document stay at 0.
    """
    matrix = np.zeros((len(bow_corpus), model.num_topics))
    for doc_idx, bow in enumerate(bow_corpus):
        for topic_id, prob in model.get_document_topics(bow, minimum_probability=0.0):
            matrix[doc_idx, topic_id] = prob
    return matrix


# Query the LDA model for each document's topic distribution (the previous
# Series.transform call and the `final` / `data` names did not exist in this notebook).
lda_output = build_document_topic_matrix(lda, data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(lda.num_topics)]
# index names
docnames = ["Doc" + str(i) for i in range(len(data_vectorized))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document (computed before the extra column is added)
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic


# Styling
def color_green(val):
    """Return a CSS colour rule: green for values above 0.1, black otherwise."""
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)


def make_bold(val):
    """Return a CSS font-weight rule: 700 for values above 0.1, 400 otherwise."""
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)


# Apply Style to the first 15 documents
# NOTE(review): Styler.applymap is named Styler.map in pandas >= 2.1 - switch if applymap is unavailable.
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

In [None]:
# Display the merged DataFrame as the cell output
final_merged_df