In [20]:
import multiprocessing as mp
import os
import time

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis
from pyLDAvis import gensim

In [3]:
# Load the pre-tokenized corpus.
# The location can be overridden with the PC_DATASET_PATH environment variable
# so the notebook is not tied to a single machine; when the variable is unset
# the original local path is used.
DATASET_PATH = os.environ.get(
    'PC_DATASET_PATH',
    '/home/jgaviria/Workspace/DataScience/datasets/PC_p.parquet.gzip',
)
df = pd.read_parquet(DATASET_PATH)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,tokens
0,"[abstract, photovoltaics, zno, n-type, window,..."
1,"[abstract, image, registration, task, medical,..."
2,"[abstract, paper, consider, -algebras, equippe..."
3,"[manipulation, pre-mrna, processing, promising..."
4,"[abstract, hiv-1, infection, study, primary, c..."


In [5]:
# Size of the loaded frame as (rows, columns).
df.shape

(7372, 1)

In [6]:
# Build the term dictionary (token -> integer id) from the tokenized documents
token_lists = df['tokens']
dictionary = Dictionary(documents=token_lists)

In [7]:
# Print the dictionary summary (number of unique tokens and a short sample).
print(dictionary)

Dictionary(77209 unique tokens: ['245', '35', 'abstract', 'aim', 'alloy']...)


In [9]:
# Build the document-term matrix: one bag-of-words per document,
# computed in parallel with a process pool.
t0 = time.time()
# cpu_count() - 2 is <= 0 on machines with one or two cores, which makes
# Pool raise ValueError; always keep at least one worker.
n_workers = max(1, mp.cpu_count() - 2)
# The context manager tears the pool down even if map() raises, so worker
# processes are not leaked. map() blocks until every result is available,
# so nothing is lost when the pool is terminated on exit.
with mp.Pool(n_workers) as pool:
    doc_term_matrix = pool.map(dictionary.doc2bow, list(df.tokens))
print(time.time()-t0)

0.9706287384033203


In [11]:
# Train the LDA topic model
t0 = time.time()
# cpu_count() - 2 is <= 0 on machines with one or two cores; always keep at
# least one worker process.
lda_workers = max(1, mp.cpu_count() - 2)
lda_model = LdaMulticore(
    doc_term_matrix,
    num_topics=50,
    id2word=dictionary,
    passes=10,
    workers=lda_workers,
    # Fixed seed so the model is initialised the same way on every run.
    # NOTE(review): multi-process training may still not be bit-for-bit
    # identical across runs - confirm against the gensim documentation.
    random_state=42,
)
print(time.time()-t0)

32.19480776786804


In [12]:
# Example of inspecting the model

## Show the (topic id, weight) pairs the model assigns to the first document
first_doc_bow = doc_term_matrix[0]
first_doc_topics = list(lda_model[first_doc_bow])
print(first_doc_topics)

[(10, 0.29935503), (22, 0.13266362), (34, 0.56044775)]


In [13]:
## Show the most relevant terms of the topics (10 topics, 3 terms each)
topic_summaries = lda_model.print_topics(num_topics=10, num_words=3)
print(topic_summaries)

[(46, '0.007*"abstract" + 0.004*"material" + 0.004*"time"'), (0, '0.009*"patient" + 0.006*"af" + 0.005*"aortic"'), (48, '0.035*"patient" + 0.011*"study" + 0.009*"risk"'), (17, '0.016*"model" + 0.014*"abstract" + 0.010*"energy"'), (30, '0.047*"cell" + 0.011*"expression" + 0.010*"gene"'), (15, '0.007*"abstract" + 0.006*"age" + 0.005*"pulse"'), (14, '0.007*"cell" + 0.006*"abstract" + 0.004*"study"'), (25, '0.008*"mouse" + 0.007*"cell" + 0.007*"study"'), (3, '0.007*"abstract" + 0.006*"data" + 0.003*"complex"'), (34, '0.011*"protein" + 0.008*"abstract" + 0.006*"cell"')]


In [21]:
# Visualization of all topics
t0 = time.time()
pyLDAvis.enable_notebook()
# `gensim` here is the adapter imported from pyLDAvis, not the gensim package.
# sort_topics=False presumably keeps the model's own topic order - TODO confirm.
vis = gensim.prepare(
    lda_model,
    doc_term_matrix,
    dictionary,
    sort_topics=False,
)
elapsed_seconds = time.time() - t0
print(elapsed_seconds)
# Leave the prepared data as the last expression so the notebook renders it.
vis

49.44270873069763


In [22]:
# Export the prepared visualization as an HTML file.
html_output_path = '/tmp/lda_visualization.html'
pyLDAvis.save_html(vis, html_output_path)