In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from gensim.models import LdaModel, CoherenceModel
from gensim import corpora

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [2]:
def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = |A ∩ B| / |A ∪ B|
    - Goal is low Jaccard scores for coverage of the diverse elements

    Parameters:
    - topic_1, topic_2: iterables of hashable keywords; duplicates are ignored
      because both are converted to sets before comparison

    Returns:
    - float between 0.0 (no shared keywords) and 1.0 (identical keyword sets);
      0.0 when both topics are empty, where the ratio itself is undefined
    """
    keywords_1, keywords_2 = set(topic_1), set(topic_2)
    union = keywords_1 | keywords_2
    if not union:
        # Both topics are empty: 0/0 would raise ZeroDivisionError.
        # Two empty topics share no keywords, so report no overlap.
        return 0.0

    return len(keywords_1 & keywords_2) / len(union)

In [10]:
# NOTE(review): absolute, user-specific path - the notebook only runs on a
# machine where this exact file exists.
file = pd.read_table(
    '/home/janneae/cns/steps/featurefiles/discretized_10_0.7_6.features',
    dtype=str,
)
# Remove the 'Sample', 'Chr' and 'NumRepeats' columns; every remaining column
# contributes one string token per row.
df = file.drop(columns=['Sample', 'Chr', 'NumRepeats'])
# Each row becomes one list of tokens, i.e. one "document" for gensim.
listedDf = df.values.tolist()
dirichlet_dict = corpora.Dictionary(listedDf)
# Bag-of-words encoding of every document against the dictionary built above.
bow_corpus = list(map(dirichlet_dict.doc2bow, listedDf))

In [11]:
# Preview the table whose rows are turned into the token lists / bag-of-words corpus.
df

Unnamed: 0,CN,SegSize,Dist2Cent,SegVal,LOH,SizeDipSeg,CpCN,Dist2nCNV,GCcSeg
0,CN_3,SegSize_6,Dist2Cent_4,SegVal_4,LOH_0,SizeDipSeg_2,CpCN_2,Dist2CNV_6,GCcSeg_4
1,CN_3,SegSize_5,Dist2Cent_4,SegVal_6,LOH_0,SizeDipSeg_1,CpCN_5,Dist2CNV_2,GCcSeg_3
2,CN_3,SegSize_5,Dist2Cent_4,SegVal_3,LOH_0,SizeDipSeg_1,CpCN_3,Dist2CNV_4,GCcSeg_3
3,CN_2,SegSize_6,Dist2Cent_2,SegVal_1,LOH_0,SizeDipSeg_3,CpCN_2,Dist2CNV_2,GCcSeg_1
4,CN_2,SegSize_5,Dist2Cent_6,SegVal_2,LOH_0,SizeDipSeg_2,CpCN_5,Dist2CNV_3,GCcSeg_1
...,...,...,...,...,...,...,...,...,...
168405,CN_4,SegSize_6,Dist2Cent_1,SegVal_3,LOH_0,SizeDipSeg_1,CpCN_1,Dist2CNV_6,GCcSeg_3
168406,CN_5,SegSize_5,Dist2Cent_3,SegVal_5,LOH_0,SizeDipSeg_1,CpCN_2,Dist2CNV_5,GCcSeg_1
168407,CN_4,SegSize_6,Dist2Cent_1,SegVal_3,LOH_0,SizeDipSeg_1,CpCN_1,Dist2CNV_6,GCcSeg_6
168408,CN_5,SegSize_4,Dist2Cent_3,SegVal_5,LOH_0,SizeDipSeg_1,CpCN_3,Dist2CNV_1,GCcSeg_1


## Using 1-15 topics

In [12]:
# Candidate topic counts 1..15. The metric cells below slice off the last entry
# (num_topics[:-1]), so metrics end up being reported for 1..14.
num_topics = list(range(1, 16))
num_keywords = 15

LDA_models = {}
LDA_topics = {}
for i in num_topics:
    # One pre-trained model per topic count is read back from disk.
    model = LdaModel.load(f'../results/gensim/lda/lda_t{i}_f10_b6.model')
    LDA_models[i] = model

    # formatted=False yields (topic_id, [(word, weight), ...]) pairs;
    # only the top `num_keywords` words of each topic are kept.
    shown_topics = model.show_topics(
        num_topics=i,
        num_words=num_keywords,
        formatted=False,
    )
    LDA_topics[i] = [
        [word for word, _weight in word_weights]
        for _topic_id, word_weights in shown_topics
    ]

In [13]:
# For each pair of consecutive topic counts, compare every topic of the smaller
# model against every topic of the next model. The result is stored under the
# smaller count as a matrix (rows: its topics, columns: the next model's topics).
LDA_stability = {}
for current_k, next_k in zip(num_topics[:-1], num_topics[1:]):
    LDA_stability[current_k] = [
        [jaccard_similarity(topic_a, topic_b) for topic_b in LDA_topics[next_k]]
        for topic_a in LDA_topics[current_k]
    ]

# Average overlap per topic count; the last count has no successor to compare with.
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

In [14]:
# c_v coherence per candidate topic count. The last count is left out so this
# list has one entry per entry of mean_stabilities.
coherences = []
for i in num_topics[:-1]:
    coherence_model = CoherenceModel(
        model=LDA_models[i],
        corpus=bow_corpus,
        texts=listedDf,
        dictionary=dirichlet_dict,
        coherence='c_v',
        topn=num_keywords,
    )
    coherences.append(coherence_model.get_coherence())

In [15]:
# Score each candidate topic count by coherence minus average topic overlap
# (high coherence and low overlap are both desirable).
# The two lists are paired element by element instead of being indexed through
# range(num_keywords): the keyword count is unrelated to how many topic counts
# were evaluated and only matched by coincidence (both are 15).
assert len(coherences) == len(mean_stabilities), \
    "coherences and mean_stabilities must cover the same topic counts"
coh_sta_diffs = [
    coherence - stability
    for coherence, stability in zip(coherences, mean_stabilities)
]
coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
# Position k in the metric lists corresponds to num_topics[k].
ideal_topic_num = num_topics[ideal_topic_num_index]

In [1]:
# Plot both metrics against the number of topics and mark the selected count.
fig, ax = plt.subplots(figsize=(20, 10))

# Metrics exist for every topic count except the last one.
evaluated_counts = num_topics[:-1]
sns.lineplot(x=evaluated_counts, y=mean_stabilities, label='Average Topic Overlap', ax=ax)
sns.lineplot(x=evaluated_counts, y=coherences, label='Topic Coherence', ax=ax)

# Vertical marker at the chosen topic count, with a shaded band one count wide on each side.
ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

# Leave 10% headroom above the highest value of either curve.
peak = max(max(mean_stabilities), max(coherences))
y_max = peak + (0.10 * peak)
ax.set_ylim([0, y_max])
ax.set_xlim([1, num_topics[-1] - 1])

ax.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
ax.legend(fontsize=20)
fig.savefig('../figures/k_parameter.pdf')
plt.show()

NameError: name 'plt' is not defined