# LDA with gensim

In [53]:
import json

In [60]:
# Generated by mass2lda
lda_dict = json.load(open('METABOLOMICS-SNETS-e9eaf6ec-download_clustered_spectra-main.lda.json'))
corpus = json.load(open('METABOLOMICS-SNETS-e9eaf6ec-download_clustered_spectra-main.corpus.json'))

In [61]:
K = 300
n_its = 1000

## Preprocessing on corpus

Normalize intensities within single document/spectra.
Where max intensity is scaled to 1000.

## Postprocessing on topics

Apply thresholds:
* min_prob_to_keep_beta = 1e-3,
* min_prob_to_keep_phi = 1e-2,
* min_prob_to_keep_theta = 1e-2,

In [62]:
m = []
index2doc = []
normalize = 1000
for doc, words in corpus.items():
    bow = []
    max_score = max(words.values())
    for word, score in words.items():
        bow.append((lda_dict['word_index'][word], int(score * 1000 / max_score)))
    m.append(bow)
    index2doc.append(doc)

In [66]:
from gensim.models.ldamulticore import LdaMulticore

In [70]:
%time lda = LdaMulticore(m, num_topics=K, iterations=n_its, per_word_topics=True)

CPU times: user 5.06 s, sys: 939 ms, total: 6 s
Wall time: 6.15 s


In [71]:
lda.get_topic_terms(3)

[(299, 0.31341827),
 (298, 0.19263276),
 (157, 0.15175839),
 (77, 0.1469286),
 (155, 0.055923313),
 (61, 0.053085882),
 (154, 0.03253929),
 (153, 0.031126818),
 (156, 0.021202547),
 (364, 2.5776837e-06)]

Use  Hierachical Dirichlet Process model to determine number of topics in corpus

In [8]:
from gensim.models import HdpModel

In [9]:
hdp = HdpModel(m, {v:k for k,v in lda_dict['word_index'].items()})

In [12]:
hdp.suggested_lda_model().num_topics

150

In [21]:
lda.get_topic_terms(200)

[(299, 0.5258681),
 (298, 0.47208077),
 (368, 3.7705206e-06),
 (363, 3.7705206e-06),
 (364, 3.7705206e-06),
 (365, 3.7705206e-06),
 (366, 3.7705206e-06),
 (361, 3.7705206e-06),
 (367, 3.7705206e-06),
 (371, 3.7705206e-06)]

In [23]:
index_word = {v:k for k,v in lda_dict['word_index'].items()}

In [26]:
(index_word[299], index_word[298],)

(u'loss_178.0475', u'fragment_81.0425')

In [27]:
lda.get_topic_terms(100)

[(116, 0.31946492),
 (118, 0.31337187),
 (117, 0.3130959),
 (18, 0.053458903),
 (367, 1.1222256e-06),
 (364, 1.1222256e-06),
 (365, 1.1222256e-06),
 (366, 1.1222256e-06),
 (369, 1.1222256e-06),
 (368, 1.1222256e-06)]

In [28]:
(index_word[116], index_word[118],index_word[117],index_word[18],)

(u'fragment_69.0675',
 u'loss_176.0975',
 u'fragment_111.1175',
 u'fragment_55.0525')

In [30]:
lda.get_topics()

array([[1.8315017e-03, 1.8315017e-03, 1.8315017e-03, ..., 1.8315017e-03,
        1.8315017e-03, 1.8315017e-03],
       [1.8315017e-03, 1.8315017e-03, 1.8315017e-03, ..., 1.8315017e-03,
        1.8315017e-03, 1.8315017e-03],
       [1.8315017e-03, 1.8315017e-03, 1.8315017e-03, ..., 1.8315017e-03,
        1.8315017e-03, 1.8315017e-03],
       ...,
       [5.7636848e-06, 5.7636848e-06, 5.7636848e-06, ..., 5.7636848e-06,
        5.7636848e-06, 5.7636848e-06],
       [1.8315017e-03, 1.8315017e-03, 1.8315017e-03, ..., 1.8315017e-03,
        1.8315017e-03, 1.8315017e-03],
       [1.2531301e-06, 7.4958928e-02, 1.2531301e-06, ..., 1.2531301e-06,
        1.2531301e-06, 1.2531301e-06]], dtype=float32)

### Beta

In [81]:
min_prob_to_keep_beta = 1e-3

In [203]:
import numpy as np
beta = {}
doc2index = {v:k for k,v in enumerate(index2doc)}
index2word = {v:k for k,v in lda_dict['word_index'].items()}
for tid, topic in enumerate(lda.get_topics()):
    topic = topic / topic.sum() # normalize to probability distribution
    beta['motif_{0}'.format(tid)] = {index2word[idx]: float(topic[idx]) for idx in np.argsort(-topic) if topic[idx] > min_prob_to_keep_beta}

In [204]:
beta['motif_0']

{u'fragment_109.0675': 0.015312251634895802,
 u'fragment_126.0875': 0.009134874679148197,
 u'fragment_138.0725': 0.7125864028930664,
 u'fragment_144.1025': 0.043076205998659134,
 u'fragment_70.0675': 0.01475980132818222,
 u'fragment_79.0525': 0.009867794811725616,
 u'fragment_81.0725': 0.028436455875635147,
 u'fragment_98.0775': 0.05109007656574249,
 u'loss_17.9975': 0.010833781212568283,
 u'loss_35.0225': 0.009593602269887924,
 u'loss_46.0075': 0.030735855922102928,
 u'loss_63.0175': 0.027180343866348267,
 u'loss_65.0325': 0.010106142610311508,
 u'loss_74.0225': 0.026022739708423615}

### Theta

In [125]:
min_prob_to_keep_theta = 1e-2

In [208]:
theta = {}
for doc_id, bow in enumerate(m):
    topics = lda.get_document_topics(bow, minimum_probability=min_prob_to_keep_theta)
    theta[index2doc[doc_id]] = {'motif_{0}'.format(topic_id): float(prob) for topic_id, prob in topics}

In [209]:
theta['document_252']

{'motif_135': 0.9997212290763855}

### Phi

In [210]:
min_prob_to_keep_phi = 1e-2

In [211]:
_, _, topics_per_word_phi = lda.get_document_topics(m[1], per_word_topics=True, 
                        minimum_probability=min_prob_to_keep_theta, 
                        minimum_phi_value=min_prob_to_keep_phi)

In [212]:
phi = {}
for doc_id, bow in enumerate(m):
    _, _, topics_per_word_phi = lda.get_document_topics(bow, per_word_topics=True, 
                        minimum_probability=min_prob_to_keep_theta, 
                        minimum_phi_value=min_prob_to_keep_phi)
    phi[index2doc[doc_id]] = {index2word[word_id]: {'motif_{0}'.format(topic_id): phi/1000 for topic_id, phi in topics} for word_id, topics in topics_per_word_phi}

In [213]:
phi['document_252']

{u'fragment_123.9625': {'motif_135': 0.787},
 u'fragment_64.9275': {'motif_135': 0.9999999389648437},
 u'loss_23.0175': {'motif_135': 0.7870000610351563},
 u'loss_82.0525': {'motif_135': 1.0}}

In [169]:
phi['document_1']

{u'fragment_86.0625': {'motif_247': 0.5674258422851562,
  'motif_34': 0.43257421875},
 u'loss_60.0175': {'motif_247': 1.0}}

## Construct lda_dict from gensim result

In [214]:
lda_dict_gensim = {}
lda_dict_gensim['corpus'] = corpus
lda_dict_gensim['word_index'] = lda_dict['word_index']
lda_dict_gensim['doc_index'] = lda_dict['doc_index']
lda_dict_gensim['K'] = K
lda_dict_gensim['alpha'] = [float(d) for d in lda.alpha]
lda_dict_gensim['doc_metadata'] = lda_dict['doc_metadata']
lda_dict_gensim['topic_index'] = lda_dict['topic_index']
lda_dict_gensim['topic_metadata'] = lda_dict['topic_metadata']
lda_dict_gensim['features'] = lda_dict['features']
lda_dict_gensim['beta'] = beta
lda_dict_gensim['theta'] = theta
lda_dict_gensim['phi'] = phi
# Missing gamma key, but not used in storing lda_dict into db

In [215]:
sorted(lda_dict_gensim.keys())

['K',
 'alpha',
 'beta',
 'corpus',
 'doc_index',
 'doc_metadata',
 'features',
 'phi',
 'theta',
 'topic_index',
 'topic_metadata',
 'word_index']

In [216]:
with open('METABOLOMICS-SNETS-e9eaf6ec-download_clustered_spectra-main.jensim.lda.json', 'w') as f:
    json.dump(lda_dict_gensim, f)