In [1]:
from gensim.models import AuthorTopicModel

# Load a previously saved author-topic model; the path is relative to the
# notebook's working directory.
# NOTE(review): gensim's load() presumably unpickles the file — only load model
# files from a trusted source.
model = AuthorTopicModel.load('model.atmodel')



In [2]:
# Display up to 100 topics; the output below shows each as a
# (topic_id, 'weight*"word" + ...') pair.
model.show_topics(num_topics=100)

[(0,
  '0.029*"tormentas" + 0.016*"qroo" + 0.011*"existía" + 0.009*"intensas" + 0.008*"torrenciales" + 0.008*"quintanaroo" + 0.008*"principal" + 0.006*"quintanarroenses" + 0.004*"pitahaya" + 0.004*"ingeniera"'),
 (1,
  '0.026*"invitarme" + 0.025*"mexico" + 0.023*"a" + 0.020*"foro" + 0.019*"comida" + 0.017*"regístrate" + 0.016*"y" + 0.012*"pte" + 0.010*"whatsapp" + 0.010*"espacio"'),
 (2,
  '0.006*"elcampoennuestrasmanos" + 0.005*"colimenses" + 0.005*"delgado" + 0.004*"extensionismo" + 0.004*"ecobici" + 0.004*"enacción" + 0.004*"agroalimentario" + 0.004*"orgullocolimense" + 0.004*"Mario Delgado" + 0.003*"congelarán"'),
 (3,
  '0.023*"estamoslistos" + 0.018*"atlixco" + 0.018*"credencialización" + 0.011*"tlalnepantla" + 0.005*"tepeolulco" + 0.005*"Prensa Nacional" + 0.005*"ixtacala" + 0.005*"Jorge Jiménez Cantú" + 0.003*"Mesa Temática Visión de Futuro" + 0.003*"debatimos"'),
 (4,
  '0.021*"elorgullodelnorte" + 0.017*"Nueva Alianza" + 0.015*"monclova" + 0.011*"próxi" + 0.010*"Coahuila #" +

In [3]:
# Map every author name to the topic distribution the model assigns to it.
# Only the names (the values of id2author) are needed, so the ids are skipped.
aut_top = {
    author: model.get_author_topics(author)
    for author in model.id2author.values()
}

In [4]:
# Inspect the author -> [(topic_id, probability), ...] mapping.
aut_top

{'ANTERO_FER': [(66, 0.99755697307290081)],
 'ARLETTEMUNOZZ': [(42, 0.99176375847514187)],
 'ATelloC': [(68, 0.99855815617180588)],
 'A_Encinas_R': [(5, 0.055637966264266189),
  (6, 0.012013512214786132),
  (8, 0.043216004998939003),
  (14, 0.027475841587635386),
  (15, 0.045406074714323813),
  (28, 0.13134664229043358),
  (45, 0.010919078151303366),
  (46, 0.027022780989426561),
  (47, 0.12144110822945289),
  (52, 0.20923663428127887),
  (53, 0.032396566221599497),
  (68, 0.02309769743330168),
  (71, 0.057300910153746568),
  (85, 0.025719297586720112),
  (97, 0.16117671248423432)],
 'AaronIrizar': [(79, 0.99253585054538718)],
 'AccionNacional': [(71, 0.99774927381980982)],
 'AdriElizarraraz': [(85, 0.9951828140078709)],
 'AdrianaDavilaF': [(50, 0.985618315776986)],
 'AispuroDurango': [(8, 0.99893502276490098)],
 'AleGutierrez_mx': [(1, 0.99412395085099192)],
 'AlejandroGonMu': [(40, 0.99866121380549555)],
 'AlvarezMaynez': [(97, 0.99969252629597782)],
 'AlyGamboa': [(30, 0.99672485377

In [5]:
%%time
from sklearn.manifold import TSNE
# Two-dimensional t-SNE with a fixed random_state so the layout is repeatable.
tsne = TSNE(n_components=2, random_state=0)
smallest_author = 0  # Authors with fewer documents than this are left out.
# Collect the ids of every author that has enough documents.
authors = [
    author_id
    for author_name, author_id in model.author2id.items()
    if len(model.author2doc[author_name]) >= smallest_author
]
# Fit on the rows of gamma selected by those ids; the return value is discarded
# because the coordinates are read from tsne.embedding_ afterwards.
_ = tsne.fit_transform(model.state.gamma[authors, :])

Wall time: 2.05 s


In [6]:

from bokeh.io import output_file
# Direct Bokeh's output to the HTML file 'grafica.html' (relative path).
output_file('grafica.html')

In [7]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

# Split the 2-D embedding into its two coordinate columns.
x, y = tsne.embedding_[:, 0], tsne.embedding_[:, 1]
author_names = [model.id2author[author_id] for author_id in authors]

# Each glyph's radius is the author's document count times a fixed factor.
scale = 0.01
author_sizes = [len(model.author2doc[name]) for name in author_names]
radii = [n_docs * scale for n_docs in author_sizes]

# One column per plotted attribute; the hover tool and the scatter call below
# refer to these columns by name.
source = ColumnDataSource(
    data={
        'x': x,
        'y': y,
        'author_names': author_names,
        'author_sizes': author_sizes,
        'radii': radii,
    }
)

# Tooltip showing the author's name and document count.
hover = HoverTool(tooltips=[("author", "@author_names"), ("size", "@author_sizes")])

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [8]:
from gensim.similarities import MatrixSimilarity


# Build a MatrixSimilarity index from the result of indexing the model with the
# full list of author names.
# NOTE(review): `index` is not referenced by any later cell visible here.
index = MatrixSimilarity(model[list(model.id2author.values())])



In [9]:

from gensim import matutils
import pandas as pd


# Topic distribution of every author, in the iteration order of
# model.id2author.values().
author_vecs = list(map(model.get_author_topics, model.id2author.values()))

def similarity(vec1, vec2):
    """Return a similarity score derived from the Hellinger distance.

    Both arguments are sparse vectors; each is expanded to a dense vector of
    length ``model.num_topics`` before the distance is computed.

    Returns:
        ``1 / (1 + distance)``, so a distance of 0 yields 1.0 and larger
        distances yield smaller scores.
    """
    dense1 = matutils.sparse2full(vec1, model.num_topics)
    dense2 = matutils.sparse2full(vec2, model.num_topics)
    return 1.0 / (1.0 + matutils.hellinger(dense1, dense2))

def get_sims(vec):
    """Score ``vec`` against every entry of ``author_vecs``.

    Returns:
        A list of ``similarity(vec, other)`` values, one per entry of
        ``author_vecs`` and in the same order.
    """
    scores = []
    for other_vec in author_vecs:
        scores.append(similarity(vec, other_vec))
    return scores

def get_table(name, top_n=10, smallest_author=1):
    """Build a table of the authors most similar to ``name``.

    Args:
        name: Author whose topic distribution is used as the query.
        top_n: Maximum number of rows to return.
        smallest_author: Authors with fewer documents than this are excluded.

    Returns:
        A pandas DataFrame with columns ``Author``, ``Score`` and ``Size``
        (the author's document count), sorted by ``Score`` in descending
        order and cut to the first ``top_n`` rows.
    """
    sims = get_sims(model.get_author_topics(name))

    # The scores follow the order of author_vecs, which was built by iterating
    # model.id2author.values(). Walk that same view to attach names, instead of
    # assuming the ids are exactly 0..N-1 in iteration order.
    table = []
    for author_name, sim in zip(model.id2author.values(), sims):
        author_size = len(model.author2doc[author_name])
        if author_size >= smallest_author:
            table.append((author_name, sim, author_size))

    df = pd.DataFrame(table, columns=['Author', 'Score', 'Size'])
    return df.sort_values('Score', ascending=False)[:top_n]

In [11]:
# Rank the other authors by similarity to 'EPN', keeping at most 136 rows.
get_table('EPN', top_n=136)

Unnamed: 0,Author,Score,Size
58,EPN,1.000000,16
306,ildefonsogv,0.733251,8
110,JoseAMeadeK,0.730281,79
363,yulmarocha,0.710691,94
354,susanacorellap,0.702413,72
37,CarlosPuenteZAC,0.686700,54
104,JesusCasillas06,0.670347,61
23,Azuletcheverry,0.668421,52
55,DivaGastelum,0.665123,100
151,MariaGloriaHM,0.654396,77
