In [1]:
from gensim.models import AuthorTopicModel

model = AuthorTopicModel.load('model.atmodel')



In [2]:
model.show_topics(num_topics=100)

[(0,
  '0.195*"laprimeramvs" + 0.047*"alaire_laprimeramvs" + 0.008*"Adrián Franco Barrios" + 0.006*"@JoseRamonMartel FM" + 0.006*"LaPrimeraMVS!" + 0.006*"Ccp @PGR_mx" + 0.006*"primerasplanas" + 0.005*"generacional" + 0.004*"micheel" + 0.004*"Karla Micheel Salas"'),
 (1,
  '0.005*"madrazo" + 0.002*"intervalos" + 0.001*"Luis Madrazo Lajous" + 0.001*"provincia de #Moscú" + 0.001*"Luis Madrazo |" + 0.001*"desgargue" + 0.001*"inter" + 0.001*"lajous" + 0.001*"Policías Federales" + 0.001*"incertidumbres"'),
 (2,
  '0.017*"redesdepodermty" + 0.014*"redesdepoder" + 0.009*"recuerden" + 0.007*"salma" + 0.007*"wixárika" + 0.006*"salidadeemergencia" + 0.004*"reencarna" + 0.004*"@antonio_navalon | Tarde" + 0.004*"Asamblea del @PRI_Nacional" + 0.004*"@ADOMINGUEZMURO | El @Cruz_Azul_FC"'),
 (3,
  '0.025*"encontexto" + 0.024*"paraescuchar" + 0.010*"vuelvealeer" + 0.008*"ojoaldato" + 0.005*"currículums" + 0.004*"poli" + 0.004*"package" + 0.004*"limón" + 0.004*"ParaEscuchar Que" + 0.004*"#EnContexto"'),


In [3]:
aut_top = {}
for key, value in model.id2author.items():
    aut_top[value] = model.get_author_topics(value)

In [4]:
aut_top

{'ADNPolitico': [(32, 0.99969572139918883)],
 'ActualidadRT': [(55, 0.9998982331880929)],
 'Adela_Micha': [(75, 0.99574569917872024)],
 'Amsalazar': [(98, 0.99808341799041)],
 'AnaPOrdorica': [(85, 0.99744023935132309)],
 'AristeguiOnline': [(20, 0.19534426329804927),
  (23, 0.057448561431974479),
  (32, 0.11088119080779231),
  (51, 0.31529625136611228),
  (55, 0.25160461631955078),
  (75, 0.016212010723002546),
  (85, 0.047834792396275541)],
 'AztecaNoticias': [(20, 0.075679808272152746),
  (23, 0.10406778581977091),
  (32, 0.20803267411687826),
  (33, 0.015311983996010133),
  (51, 0.03160429383773939),
  (55, 0.28598953034326147),
  (73, 0.054625396162468032),
  (85, 0.2215334535251216)],
 'Canal22': [(20, 0.044057083069585883),
  (23, 0.05120894192075872),
  (32, 0.04075153766906775),
  (33, 0.023163478228257157),
  (45, 0.38283715833381127),
  (51, 0.088033492958399628),
  (55, 0.21388553642322272),
  (65, 0.13069856404049141),
  (75, 0.02039594279015711)],
 'CanalOnceTV': [(20, 0.

In [5]:
%%time
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
smallest_author = 0  # Ignore authors with documents less than this.
authors = [model.author2id[a] for a in model.author2id.keys() if len(model.author2doc[a]) >= smallest_author]
_ = tsne.fit_transform(model.state.gamma[authors, :])  # Result stored in tsne.embedding_

Wall time: 947 ms


In [6]:

from bokeh.io import output_file
output_file('grafica.html')

In [7]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

x = tsne.embedding_[:, 0]
y = tsne.embedding_[:, 1]
author_names = [model.id2author[a] for a in authors]

scale = 0.01
author_sizes = [len(model.author2doc[a]) for a in author_names]
radii = [size * scale for size in author_sizes]

source = ColumnDataSource(
        data=dict(
            x=x,
            y=y,
            author_names=author_names,
            author_sizes=author_sizes,
            radii=radii,
        )
    )

hover = HoverTool(
        tooltips=[
        ("author", "@author_names"),
        ("size", "@author_sizes"),
        ]
    )

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [8]:
from gensim.similarities import MatrixSimilarity


index = MatrixSimilarity(model[list(model.id2author.values())])



In [9]:

from gensim import matutils
import pandas as pd


author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

def similarity(vec1, vec2):
    dist = matutils.hellinger(matutils.sparse2full(vec1, model.num_topics), \
                              matutils.sparse2full(vec2, model.num_topics))
    sim = 1.0 / (1.0 + dist)
    return sim

def get_sims(vec):
    sims = [similarity(vec, vec2) for vec2 in author_vecs]
    return sims

def get_table(name, top_n=10, smallest_author=1):
    sims = get_sims(model.get_author_topics(name))


    table = []
    for elem in enumerate(sims):
        author_name = model.id2author[elem[0]]
        sim = elem[1]
        author_size = len(model.author2doc[author_name])
        if author_size >= smallest_author:
            table.append((author_name, sim, author_size))
            

    df = pd.DataFrame(table, columns=['Author', 'Score', 'Size'])
    df = df.sort_values('Score', ascending=False)[:top_n]
    
    return df

In [10]:
get_table('Pajaropolitico',top_n=136)

Unnamed: 0,Author,Score,Size
43,Pajaropolitico,1.000000,884
38,NTelevisa_com,0.755383,1746
111,sdpnoticias,0.751607,71
73,diario24horas,0.750833,1429
39,NoticiasMVS,0.749927,1476
101,mileniotv,0.749827,138
47,PublimetroMX,0.749548,999
36,MexicanTimes,0.744413,456
108,revistaproceso,0.743817,557
95,lasillarota,0.743509,1203


In [11]:
import pickle
dictionary = pickle.load(open("dictionary.p", "rb"))

In [12]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [13]:
pyLDAvis.gensim.prepare(model, model.corpus,dictionary)

TypeError: inference() missing 3 required positional arguments: 'author2doc', 'doc2author', and 'rhot'