In [88]:
import pickle
import pandas as pd
from top2vec import Top2Vec
import numpy as np
import networkx as nx
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from gensim.utils import tokenize
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

from dataframe_generator_1 import d2v_pandas_generator, n2v_df
from clustering_2 import clustering
from topic_representation_3 import ctfidf_f,topic_dictionary,doc2word,ctf_idf_topics
from diversity_metrics_4 import *



In [3]:
#################################
## Read Data and Doc2Vec Model ##
#################################

references_df = pd.read_pickle("pandas_data.pkl")
d2v_model = Top2Vec.load("global_topic_model")

In [4]:
###################################################
### Import citation graph and Node2Vec Model ######
###################################################

#G=nx.read_graphml("citation_graph.gz")

#node2vec = Node2Vec(G, dimensions=100, walk_length=30, num_walks=3000, workers=4)
#n2v_model = node2vec.fit(window=10, min_count=1, batch_words=4)
#node2vec.save("embeddings.model")

n2v_model = Word2Vec.load("embeddings.model")

In [14]:
###############################################################
### Generate Pandas DataFrame with Documents embeddings ######
###############################################################

df=d2v_pandas_generator(d2v_model,references_df)
df=df[["d2v","id"]]
df["n2v"]=df.apply(lambda row: n2v_df(row["id"]), axis=1)
df=df.dropna()
df["c2v"]=df.apply(lambda row: np.concatenate((row['d2v'], row['n2v']), axis=None), axis=1)
df=df.reset_index()
df=df[["id","d2v","n2v","c2v"]]

In [15]:
#SAVING DATAFrame
df.to_pickle("1-embeddings_df.pkl")

################
###  Clustering
################

In [22]:
#####################################
### Prep Data for Clusterings #######
#####################################

#split_c2v_df = pd.DataFrame(df["c2v"].tolist())
#split_c2v_df.to_pickle("2-split_c2v_df.pkl")
split_c2v_df = pd.read_pickle("2-split_c2v_df.pkl")

#split_n2v_df = pd.DataFrame(df["n2v"].tolist())
#split_n2v_df.to_pickle("2-split_n2v_df.pkl")
split_n2v_df = pd.read_pickle("2-split_n2v_df.pkl")

#split_d2v_df = pd.DataFrame(df["d2v"].tolist())
#split_d2v_df.to_pickle("2-split_d2v_df.pkl")
split_d2v_df = pd.read_pickle("2-split_d2v_df.pkl")

In [24]:
#####################################
### Clustering the embeddings #######
#####################################

umap_args = {'n_neighbors': 15,
                         'n_components': 5,
                         'metric': 'cosine'}

hdbscan_args = {'min_cluster_size': 15,
                    'metric': 'euclidean',
                    'cluster_selection_method': 'eom'}

cluster_c2v=clustering(split_c2v_df,umap_args,hdbscan_args)
cluster_n2v=clustering(split_n2v_df,umap_args,hdbscan_args)
cluster_d2v=clustering(split_d2v_df,umap_args,hdbscan_args)

In [26]:
#########################################
### Clustering to Pandas DataFrame ######
#########################################
cluster_c2v_labels=pd.DataFrame(cluster_c2v.labels_)
cluster_n2v_labels=pd.DataFrame(cluster_n2v.labels_)
cluster_d2v_labels=pd.DataFrame(cluster_d2v.labels_)

cluster_c2v_labels.to_pickle("3-clustered_split_c2v_df.pkl")
cluster_n2v_labels.to_pickle("3-clustered_split_n2v_df.pkl")
cluster_d2v_labels.to_pickle("3-clustered_split_d2v_df.pkl")

In [31]:
#PRINTING THE SIZE OF CLUSTERED
print("number of clusters in d2v embeddings are:",cluster_d2v_labels.max()[0])
print("number of clusters in n2v embeddings are:",cluster_n2v_labels.max()[0])
print("number of clusters in proposed embeddings are:",cluster_c2v_labels.max()[0])

number of clusters in d2v embeddings are: 665
number of clusters in n2v embeddings are: 1229
number of clusters in proposed embeddings are: 738


In [39]:
#JOINING and SAVING clustered_embeddings DataFrame
dt = df.join(cluster_c2v_labels).rename(columns={0: "cluster_c2v"})
dt = dt.join(cluster_d2v_labels).rename(columns={0: "cluster_d2v"})
dt = dt.join(cluster_n2v_labels).rename(columns={0: "cluster_n2v"})
dt.to_pickle("3-clustered_embeddings_df.pkl")

#########################################
### TOPIC REPRESENTATION 1: TF-IDF #########
#########################################

In [43]:

docs_c2v=dt[["id","cluster_c2v"]].merge(references_df[["id","abstract"]],how='inner', on="id")
docs_c2v=docs_c2v.dropna()
docs_c2v=docs_c2v[docs_c2v["cluster_c2v"]!=-1]

docs_d2v=dt[["id","cluster_d2v"]].merge(references_df[["id","abstract"]],how='inner', on="id")
docs_d2v=docs_d2v.dropna()
docs_d2v=docs_d2v[docs_d2v["cluster_d2v"]!=-1]

docs_n2v=dt[["id","cluster_n2v"]].merge(references_df[["id","abstract"]],how='inner', on="id")
docs_n2v=docs_n2v.dropna()
docs_n2v=docs_n2v[docs_n2v["cluster_n2v"]!=-1]

c2v_docs_per_class= docs_c2v.groupby(['cluster_c2v'], as_index=False).agg({'abstract': ' '.join})
d2v_docs_per_class= docs_d2v.groupby(['cluster_d2v'], as_index=False).agg({'abstract': ' '.join})
n2v_docs_per_class= docs_n2v.groupby(['cluster_n2v'], as_index=False).agg({'abstract': ' '.join})

In [57]:
words_c2v,ctfidf_c2v=ctfidf_f(c2v_docs_per_class,len(docs_c2v))
words_d2v,ctfidf_d2v=ctfidf_f(d2v_docs_per_class,len(docs_d2v))
words_n2v,ctfidf_n2v=ctfidf_f(n2v_docs_per_class,len(docs_n2v))

In [58]:
topics_ctfidf_c2v=ctf_idf_topics(c2v_docs_per_class.cluster_c2v,words_c2v,ctfidf_c2v,10)
topics_ctfidf_d2v=ctf_idf_topics(d2v_docs_per_class.cluster_d2v,words_d2v,ctfidf_d2v,10)
topics_ctfidf_n2v=ctf_idf_topics(n2v_docs_per_class.cluster_n2v,words_n2v,ctfidf_n2v,10)

In [81]:
tr1=[topics_ctfidf_c2v,topics_ctfidf_d2v,topics_ctfidf_n2v]

In [63]:

with open('4-c2v_ctfidf-topics.pkl', 'wb') as f:
    pickle.dump(topics_ctfidf_c2v, f)
with open('4-d2v_ctfidf-topics.pkl', 'wb') as f:
    pickle.dump(topics_ctfidf_d2v, f)
with open('4-n2v_ctfidf-topics.pkl', 'wb') as f:
    pickle.dump(topics_ctfidf_n2v, f)

In [64]:
# Get similar categories

similarity_c2v_ctfidf = cosine_similarity(ctfidf_c2v, ctfidf_c2v)
similarity_d2v_ctfidf = cosine_similarity(ctfidf_d2v, ctfidf_d2v)
similarity_n2v_ctfidf = cosine_similarity(ctfidf_n2v, ctfidf_n2v)

np.fill_diagonal(similarity_c2v_ctfidf, 0)
np.fill_diagonal(similarity_d2v_ctfidf, 0)
np.fill_diagonal(similarity_n2v_ctfidf, 0)

#result = pd.DataFrame([(index, distances[index].argmax(),distances[index].max()) for index in range(len(docs_per_class))],columns=["From", "To","Similarity"])
#result[result["Similarity"]>0.8].groupby("To")["From"].apply(list)
# result.head(5).values.tolist()


######################################################################################
### TOPIC REPRESENTATION 2: Calculating Centroids in Original Dimensional Space ######
######################################################################################

In [70]:
ids_per_class_c2v= docs_c2v.groupby("cluster_c2v")["id"].apply(list).reset_index()
ids_per_class_d2v= docs_d2v.groupby("cluster_d2v")["id"].apply(list).reset_index()
ids_per_class_n2v= docs_n2v.groupby("cluster_n2v")["id"].apply(list).reset_index()

In [71]:
rev_dic_ids,dic_ids=topic_dictionary(d2v_model,references_df)

In [73]:
ids_per_class_c2v["topic_c2v"]=ids_per_class_c2v.apply(lambda row: doc2word(d2v_model,dic_ids,row["id"],10), axis=1)
ids_per_class_d2v["topic_d2v"]=ids_per_class_d2v.apply(lambda row: doc2word(d2v_model,dic_ids,row["id"],10), axis=1)
ids_per_class_n2v["topic_n2v"]=ids_per_class_n2v.apply(lambda row: doc2word(d2v_model,dic_ids,row["id"],10), axis=1)

In [77]:
list_topics_c2v=list(ids_per_class_c2v["topic_c2v"])
list_topics_d2v=list(ids_per_class_d2v["topic_d2v"])
list_topics_n2v=list(ids_per_class_n2v["topic_n2v"])

In [80]:
tr2=[list_topics_c2v,list_topics_d2v,list_topics_n2v]

######################################################################################
### Validation 1: TOPIC Diversity (TD) ######
######################################################################################

In [84]:
print("proportion_unique_words for CTF-IDF:")
print("proposed:",proportion_unique_words(tr1[0], topk=10),"d2v:",proportion_unique_words(tr1[1], topk=10),"n2v:",proportion_unique_words(tr1[2], topk=10))
print("proportion_unique_words for Centroids:")
print("proposed:",proportion_unique_words(tr2[0], topk=10),"d2v:",proportion_unique_words(tr2[1], topk=10),"n2v:",proportion_unique_words(tr2[2], topk=10))

proportion_unique_words for CTF-IDF:
proposed: 0.7294993234100136 d2v: 0.6734234234234234 n2v: 0.6884552845528455
proportion_unique_words for Centroids:
proposed: 0.5721244925575102 d2v: 0.5213213213213214 n2v: 0.37154471544715445


In [85]:
print("pairwise_jaccard_diversity for CTF-IDF:")
print("proposed:",pairwise_jaccard_diversity(tr1[0], topk=10),"d2v:",pairwise_jaccard_diversity(tr1[1], topk=10),"n2v:",pairwise_jaccard_diversity(tr1[2], topk=10))
print("pairwise_jaccard_diversity for Centroids:")
print("proposed:",pairwise_jaccard_diversity(tr2[0], topk=10),"d2v:",pairwise_jaccard_diversity(tr2[1], topk=10),"n2v:",pairwise_jaccard_diversity(tr2[2], topk=10))

pairwise_jaccard_diversity for CTF-IDF:
proposed: 0.9981726020933082 d2v: 0.9794133801232939 n2v: 0.9990555717013562
pairwise_jaccard_diversity for Centroids:
proposed: 0.9977928426816666 d2v: 0.972345523138109 n2v: 0.9977374871391742


In [87]:
print("Inverted Rank-Biased Overlap (weight=0.5) for CTF-IDF:")
print("proposed:",irbo(tr1[0], weight=0.5, topk=10),"d2v:",irbo(tr1[1], weight=0.5, topk=10),"n2v:",irbo(tr1[2], weight=0.5, topk=10))
print("Inverted Rank-Biased Overlap (weight=0.5)for Centroids:")
print("proposed:",irbo(tr2[0], weight=0.5, topk=10),"d2v:",irbo(tr2[1], weight=0.5, topk=10),"n2v:",irbo(tr2[2], weight=0.5, topk=10))

Inverted Rank-Biased Overlap (weight=0.5) for CTF-IDF:
proposed: 0.9997594199260066 d2v: 0.9920952622757917 n2v: 0.9997987931136171
Inverted Rank-Biased Overlap (weight=0.5)for Centroids:
proposed: 0.9986411755640691 d2v: 0.9730698496829155 n2v: 0.9986577188997542


In [86]:
print("Inverted Rank-Biased Overlap (weight=0.9) for CTF-IDF:")
print("proposed:",irbo(tr1[0],weight=0.9, topk=10),"d2v:",irbo(tr1[1],weight=0.9, topk=10),"n2v:",irbo(tr1[2],weight=0.9, topk=10))
print("Inverted Rank-Biased Overlap (weight=0.9)for Centroids:")
print("proposed:",irbo(tr2[0],weight=0.9, topk=10),"d2v:",irbo(tr2[1],weight=0.9, topk=10),"n2v:",irbo(tr2[2],weight=0.9, topk=10))

Inverted Rank-Biased Overlap (weight=0.9) for CTF-IDF:
proposed: 0.9983027121182894 d2v: 0.9814903422338506 n2v: 0.9990469250798938
Inverted Rank-Biased Overlap (weight=0.9)for Centroids:
proposed: 0.9974352306642534 d2v: 0.9685636404699142 n2v: 0.9974417292610084


In [None]:
#print("we-pd:", pairwise_word_embedding_distance(topics, wv, topk=10))
#print("we-cd:", centroid_distance(topics, wv, topk=10))
#print("we-irbo p=0.5:",word_embedding_irbo(topics,wv, weight=0.5, topk=10))
#print("we-irbo p=0.9:",word_embedding_irbo(topics,wv, weight=0.9, topk=10))

######################################################################################
### Validation 2: Topic Coherence (TC) ######
######################################################################################

In [89]:
def bigrammer(doc):
    sentence_stream = simple_preprocess(strip_tags(doc), deacc=True)
    return bigram_phraser[sentence_stream]

abstract_list=references_df["abstract"]
sentence_stream = [doc.split(" ") for doc in  abstract_list]
bigram = Phrases(sentence_stream, min_count=5, threshold=100, delimiter=b' ')
bigram_phraser = Phraser(bigram)

In [90]:
tokenized = [list(bigrammer(doc)) for doc in abstract_list]
id2word = corpora.Dictionary(tokenized)
corpus = [id2word.doc2bow(text) for text in tokenized]

In [91]:
cm_c2v = CoherenceModel(topics=tr2[0] ,texts=tokenized,
                    corpus=corpus, dictionary=id2word, coherence='c_v')

print("Model Coherence C_V for c2v is:{0}".format(cm_c2v.get_coherence()))

Model Coherence C_V for c2v is:0.6899732083622436


In [92]:
cm_d2v = CoherenceModel(topics=tr2[1] ,texts=tokenized,
                    corpus=corpus, dictionary=id2word, coherence='c_v')

print("Model Coherence C_V for d2v is:{0}".format(cm_d2v.get_coherence()))

Model Coherence C_V for d2v is:0.6226364892136502


In [93]:
cm_n2v = CoherenceModel(topics=tr2[2] ,texts=tokenized,
                    corpus=corpus, dictionary=id2word, coherence='c_v')

print("Model Coherence C_V for n2v is:{0}".format(cm_n2v.get_coherence()))

Model Coherence C_V for n2v is:0.6641412459989379
