In [None]:
import pandas as pd
import scipy.sparse as ss
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer
from corextopic import vis_topic as vt
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import MinMaxScaler


%matplotlib inline

In [None]:
def dummy(doc):
    """Identity analyzer: hand the document back exactly as received.

    Supplied as ``analyzer=`` to ``CountVectorizer`` in this notebook, so the
    vectorizer works on each document as given instead of tokenising it.
    Presumably the documents are already token sequences — confirm against
    the 'Cleaned Tokens' column.
    """
    return doc

In [None]:
# Pass-through vectorizer: `dummy` is the analyzer, so each document is used
# as supplied.
# NOTE(review): no later cell in this notebook references `CountV_model`; the
# vectorizer that is actually fitted is `vect` — confirm whether this cell is
# still needed.
CountV_model = CountVectorizer(analyzer=dummy)

In [None]:
# Location of the pickled, pre-processed corpus.
# NOTE(review): absolute path tied to one machine — point this at a shared or
# relative data directory before running the notebook elsewhere.
MERGED_DF_PATH = "/Users/kellycoulter/Desktop/PhD_Code_2021/final_merged_df.pkl"

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
final_merged_df = pd.read_pickle(MERGED_DF_PATH)

# One entry per document; converted to a list and vectorized in later cells.
# (The bare `final_merged_df` expression that used to sit here was removed:
# only the last expression of a cell is displayed, so it did nothing.)
texts = final_merged_df['Cleaned Tokens']

In [None]:
# Display the loaded frame to inspect its columns and contents.
final_merged_df

In [None]:
# Plain Python list of the per-document entries; this is what gets passed to
# `vect.fit_transform` further down.
list_text = texts.to_list()

In [None]:
# binary=True records presence/absence of each term instead of counts;
# analyzer=dummy makes the vectorizer use each document exactly as supplied.
vect = CountVectorizer(binary=True, analyzer=dummy)

In [None]:
# Learn the vocabulary and build the document-term matrix
# (rows: documents, columns: vocabulary terms).
matrix = vect.fit_transform(list_text)

In [None]:
# Number of latent topics to learn, and a fixed RNG seed. CorEx starts from a
# random initialisation, so without a seed the learned topics — and therefore
# the topic numbers hard-coded in `chosen_topics` further down — can change
# from one run to the next.
N_TOPICS = 18
COREX_SEED = 42  # arbitrary; any fixed integer makes runs repeatable
topic_model = ct.Corex(n_hidden=N_TOPICS, seed=COREX_SEED)

In [None]:
# scikit-learn 1.0 deprecated `get_feature_names()` and 1.2 removed it in
# favour of `get_feature_names_out()`; fall back to the old name only on
# installs that predate the new one.
if hasattr(vect, "get_feature_names_out"):
    vocabulary_terms = list(vect.get_feature_names_out())
else:
    vocabulary_terms = list(vect.get_feature_names())

# `words` labels every matrix column so topics can be reported as words.
topic_model.fit(matrix, words=vocabulary_terms)

In [None]:
# Print every topic as "<1-based number>: word,word,...".
topics = topic_model.get_topics()
for topic_n, topic in enumerate(topics):
    # Read the first field of each entry rather than unpacking a fixed-size
    # tuple: `words, mis = zip(*topic)` fails when entries carry more than two
    # fields (newer corextopic releases add a sign) and when a topic is empty.
    words = [str(entry[0]) for entry in topic]
    topic_str = str(topic_n+1)+': '+','.join(words)
    print(topic_str)

In [None]:
# `docs` is otherwise only bound by the loop in a later cell, so on a fresh
# kernel the bare name raised NameError here. Derive it directly from the
# model: split the first topic's (document, score) pairs and keep the documents.
docs, _ = zip(*topic_model.get_top_docs()[0])
type(docs)

In [None]:
# Top documents for every topic, using the library's default arguments.
# NOTE(review): the next cell repeats this exact call before using the result,
# so this cell looks redundant — confirm and drop if so.
top_docs = topic_model.get_top_docs()


In [None]:
# For each topic, print its top documents on one line, numbered from 1.
top_docs = topic_model.get_top_docs()
for topic_n, topic_docs in enumerate(top_docs):
    docs, probs = zip(*topic_docs)  # split the (document, score) pairs
    print(f"{topic_n + 1}: {','.join(str(doc_id) for doc_id in docs)}")

In [None]:
# Collect (document index, topic) pairs for the top documents of a few
# hand-picked topics.
# NOTE(review): topics are printed 1-based earlier in the notebook, while these
# numbers are passed straight to `get_top_docs(topic=...)` — confirm they are
# the intended (presumably zero-based) indices.
chosen_topics = [15, 17, 12]
n_docs = 15
data = [
    (doc_index, topic)
    for topic in chosen_topics
    for doc_index, _score in topic_model.get_top_docs(n_docs=n_docs, topic=topic)
]

In [None]:
# One row per selected (document, topic) pair, indexed by the document index
# collected from `get_top_docs`.
# NOTE(review): a document selected for more than one chosen topic appears once
# per topic, so this index is not necessarily unique.
topic_filter = pd.DataFrame(data, columns = ["index", "topic_number"]).set_index("index")

In [None]:
# Attach the original document rows to each selected (document, topic) pair.
# `reset_index()` gives the right-hand frame a fresh 0..n-1 index (keeping the
# old index as a column); the left join then matches on index values.
# NOTE(review): this assumes the indices returned by `get_top_docs` are row
# positions in `final_merged_df` — confirm.
top_topic_df = topic_filter.merge(final_merged_df.reset_index(), left_index=True, right_index=True, how="left")

In [None]:
# Display the selected top documents together with their topic numbers.
top_topic_df

In [None]:
# Per-document topic labels (a later cell reports the shape as n_docs x k_topics).
topic_model.labels

In [None]:
# Top-document list for the first topic.
top_docs[0]

In [None]:
# Topic TC: the overall total correlation is the sum of the per-topic total
# correlations, which are available through `tcs`.
# For an unsupervised CorEx topic model, topics are always sorted from highest
# to lowest TC.
topic_model.tcs.shape # k_topics

In [None]:
# Show the cluster assignments, then the array's shape.
word_clusters = topic_model.clusters
print(word_clusters)
print(word_clusters.shape)  # m_words

In [None]:
# Top 10 documents for topic 2 (presumably a zero-based index — confirm),
# ranked by log-probability. Despite the earlier wording, this returns
# documents for a topic, not the topic's words.
topic_model.get_top_docs(topic=2, n_docs=10, sort_by='log_prob')

In [None]:
# CorEx is a discriminative model, whereas LDA is a generative model.
# LDA outputs a probability distribution over topics for each document; CorEx
# instead estimates the probability that a document belongs to a topic given
# that document's words.
# As a result, a document's probabilities across topics need not sum to 1.
# The estimated per-document topic probabilities are available through
# `log_p_y_given_x` or `p_y_given_x`.
print(topic_model.p_y_given_x.shape) # n_docs x k_topics


In [None]:
# A softmax can also be used to make a binary determination of which documents
# belong to each topic; those labels are available through `labels`.
print(topic_model.labels.shape) # n_docs x k_topics

In [None]:
# Number of documents labelled with each topic, smallest to largest.
# The chart previously had no axis labels or title and echoed the Axes repr.
docs_per_topic = pd.DataFrame(topic_model.labels).sum().sort_values()
ax = docs_per_topic.plot.bar()
ax.set_xlabel("Topic")
ax.set_ylabel("Number of documents")
ax.set_title("Documents labelled with each topic");  # `;` hides the repr

In [None]:
# Multiplying by 1 turns the (presumably boolean) topic labels into 0/1
# integers, which the co-occurrence dot product below relies on.
label_df = pd.DataFrame(topic_model.labels)*1

In [None]:
# Display the numeric document-by-topic label matrix.
label_df

In [None]:
# Min-max scaler with default settings; used further down to rescale the
# topic co-occurrence matrix.
Scaler = MinMaxScaler()

In [None]:
# Topic-by-topic co-occurrence: entry (i, j) is the dot product of label
# columns i and j, i.e. how many documents carry both labels.
# Work on an explicit NumPy copy. Writing through `DataFrame.values` only works
# while pandas happens to return a writable view; under Copy-on-Write the array
# is read-only and `np.fill_diagonal` raises.
cooc_counts = label_df.T.dot(label_df).to_numpy(copy=True)
np.fill_diagonal(cooc_counts, 0)  # blank out each topic's pairing with itself
cooc = pd.DataFrame(cooc_counts, index=label_df.columns, columns=label_df.columns)
cooc.style.background_gradient(cmap="viridis")

In [None]:
# Same topic co-occurrence matrix as in the previous cell, rescaled with
# `Scaler` before display.
# Work on an explicit NumPy copy. Writing through `DataFrame.values` only works
# while pandas happens to return a writable view; under Copy-on-Write the array
# is read-only and `np.fill_diagonal` raises.
cooc_counts = label_df.T.dot(label_df).to_numpy(copy=True)
np.fill_diagonal(cooc_counts, 0)  # blank out each topic's pairing with itself
# NOTE(review): MinMaxScaler rescales every column independently, so the scaled
# matrix is generally no longer symmetric — confirm that is intended.
cooc = pd.DataFrame(Scaler.fit_transform(cooc_counts))
cooc.style.background_gradient(cmap="viridis")

In [None]:
# Per-document topic probabilities as a DataFrame (rows: documents, columns: topics).
pd.DataFrame(topic_model.p_y_given_x)

In [None]:
# Column means: the average probability of each topic across all documents.
pd.DataFrame(topic_model.p_y_given_x).mean()

In [None]:
# Total correlation is the quantity CorEx maximizes when constructing the
# topic model. It is available through `tc` and is reported in nats.
topic_model.tc

In [None]:
# Print the sum of the per-topic TCs next to the overall TC for comparison.
per_topic_total = topic_model.tcs.sum()
print(per_topic_total)
print(topic_model.tc)

In [None]:
# Bar chart of total correlation per topic, drawn through an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
topic_positions = range(topic_model.tcs.shape[0])
ax.bar(topic_positions, topic_model.tcs, color='#4e79a7', width=0.5)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16);

In [None]:
# Reference — CorEx topic model example notebook:
# https://github.com/gregversteeg/corex_topic/blob/master/corextopic/example/corex_topic_example.ipynb

In [None]:
# The original cell was an unfinished column selection (`top_topic_df[]`),
# which is a syntax error. Preview the frame instead.
# TODO: replace with the intended `top_topic_df[[...]]` column selection.
top_topic_df.head()