In [1]:
from gensim.models import AuthorTopicModel

# Load the author-topic model stored in the file 'model.atmodel' (path is
# relative to the notebook's working directory).
# NOTE(review): gensim's load is, as far as I know, pickle-based — only load
# model files that come from a trusted source.
model = AuthorTopicModel.load('model.atmodel')

In [2]:
# Display the weighted top terms of up to 100 topics as (topic_id, terms) pairs.
# The resulting output is long.
model.show_topics(num_topics=100)

[(0,
  '0.056*"precaución" + 0.018*"policía" + 0.018*"tierra" + 0.015*"continúa" + 0.015*"g" + 0.015*"mercados" + 0.015*"paz" + 0.012*"tropical" + 0.012*"río" + 0.012*"tormenta"'),
 (1,
  '0.008*"campechanas" + 0.005*"lerma" + 0.005*"viabilidad" + 0.004*"impuesto" + 0.004*"patrullas" + 0.004*"embarcaciones" + 0.004*"adquirieron" + 0.004*"motores" + 0.004*"colposcopia" + 0.004*"descanso"'),
 (2,
  '0.017*"sipinna" + 0.015*"cns" + 0.012*"ricardobucio" + 0.008*"conase" + 0.008*"misderechos" + 0.008*"cenapred" + 0.008*"presta" + 0.006*"sismo" + 0.006*"Plan Familiar de Protección Civil" + 0.006*"efemeridesmx"'),
 (3,
  '0.008*"apizaco" + 0.004*"deudas" + 0.004*"inmortalizará" + 0.004*"brujo" + 0.004*"espectadores" + 0.004*"brilla" + 0.004*"calpulalpan" + 0.004*"“Y “griega" + 0.004*"griega" + 0.004*"Carlos Rivera"'),
 (4,
  '0.032*"diputadospri" + 0.025*"cumpleaños" + 0.025*"feliz" + 0.023*"mejores" + 0.022*"felicidades" + 0.019*"feliz_cumpleaños" + 0.019*"envío" + 0.017*"deseos" + 0.014*"ch

In [3]:
# Map every author name known to the model to the topic distribution the model
# reports for that author.
aut_top = {
    author: model.get_author_topics(author)
    for author in model.id2author.values()
}

In [4]:
# Display the whole author -> [(topic_id, weight), ...] mapping (large output).
aut_top

{'ANTERO_FER': [(14, 0.062032588357278741),
  (16, 0.039869444589735147),
  (24, 0.023803475395821439),
  (42, 0.014821630039435154),
  (43, 0.060927582941384312),
  (50, 0.14735709116278273),
  (58, 0.022689340093937697),
  (59, 0.11471002023063145),
  (60, 0.043718384415943182),
  (64, 0.017449534774454555),
  (67, 0.058050901142041769),
  (75, 0.10912862296237894),
  (81, 0.2135702241331166),
  (88, 0.020516878466767301)],
 'ARLETTEMUNOZZ': [(4, 0.037824980744994199),
  (9, 0.019997976747055231),
  (14, 0.15246561343858089),
  (16, 0.01622304155403971),
  (21, 0.14201601940726458),
  (38, 0.050522095542727477),
  (42, 0.024744446383184186),
  (43, 0.15550871792204177),
  (50, 0.019783976535073879),
  (51, 0.034419597318763956),
  (59, 0.059236445517350592),
  (60, 0.033013039622626167),
  (67, 0.059627215923621808),
  (73, 0.070194358426389394),
  (77, 0.062814737832888934),
  (86, 0.016105413871789737),
  (93, 0.036642212070690627)],
 'ATelloC': [(4, 0.039653090658899784),
  (14, 0

In [5]:
%%time
from sklearn.manifold import TSNE
# 2-D t-SNE projection of the per-author rows of model.state.gamma.
# random_state is fixed so repeated runs use the same seed.
tsne = TSNE(n_components=2, random_state=0)
smallest_author = 0  # Ignore authors with documents less than this.
authors = [
    author_id
    for author_name, author_id in model.author2id.items()
    if len(model.author2doc[author_name]) >= smallest_author
]
# The return value is discarded; the coordinates are read back later from
# tsne.embedding_.
_ = tsne.fit_transform(model.state.gamma[authors, :])

CPU times: user 1.72 s, sys: 72 ms, total: 1.79 s
Wall time: 1.7 s


In [6]:

from bokeh.io import output_file
# Configure Bokeh to target the file 'grafica.html' (relative path).
# NOTE(review): presumably the file is written when show() is called later — confirm.
output_file('grafica.html')

In [7]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

# Split the 2-D embedding into its two coordinate columns.
x, y = tsne.embedding_[:, 0], tsne.embedding_[:, 1]

# Names of the embedded authors, in the same order as the ids in `authors`.
author_names = [model.id2author[author_id] for author_id in authors]

# Marker radius is proportional to the number of documents per author.
scale = 0.01
author_sizes = [len(model.author2doc[author_name]) for author_name in author_names]
radii = [n_docs * scale for n_docs in author_sizes]

# Column names here are what the glyph and the hover tooltips refer to.
source = ColumnDataSource(data={
    'x': x,
    'y': y,
    'author_names': author_names,
    'author_sizes': author_sizes,
    'radii': radii,
})

# Tooltip shows the author's name and document count for the hovered marker.
hover = HoverTool(tooltips=[
    ("author", "@author_names"),
    ("size", "@author_sizes"),
])

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [8]:
from gensim.similarities import MatrixSimilarity


# Build a similarity index from the model's output for the full list of author
# names (model[...] is called with every value of model.id2author).
# NOTE(review): `index` is not used by any cell visible in this notebook —
# confirm whether it is still needed.
index = MatrixSimilarity(model[list(model.id2author.values())])



In [9]:

from gensim import matutils
import pandas as pd


# One topic vector per author, built explicitly in author-id order so that
# position i in this list belongs to model.id2author[i]. get_table maps list
# positions back to author names through model.id2author, so the alignment must
# not depend on the iteration order of the id2author dict.
# NOTE(review): this assumes author ids are the contiguous range 0..N-1; a gap
# would raise KeyError here instead of silently mislabelling rows.
author_vecs = [
    model.get_author_topics(model.id2author[author_id])
    for author_id in range(len(model.id2author))
]

def similarity(vec1, vec2):
    """Return a similarity score for two sparse topic vectors.

    Both vectors are expanded to dense form of length ``model.num_topics``,
    their Hellinger distance is computed, and the distance is turned into a
    similarity as ``1 / (1 + distance)``; a distance of 0 therefore gives 1.0.
    """
    dense1 = matutils.sparse2full(vec1, model.num_topics)
    dense2 = matutils.sparse2full(vec2, model.num_topics)
    return 1.0 / (1.0 + matutils.hellinger(dense1, dense2))

def get_sims(vec):
    """Score ``vec`` against every vector in the module-level ``author_vecs``.

    Returns a list of similarity values in the same order as ``author_vecs``.
    """
    return [similarity(vec, other_vec) for other_vec in author_vecs]

def get_table(name, top_n=10, smallest_author=1):
    """Rank authors by topic similarity to the author ``name``.

    Args:
        name: Author whose topic distribution is used as the query.
        top_n: Maximum number of rows kept after sorting.
        smallest_author: Authors with fewer documents than this are left out.

    Returns:
        A DataFrame with columns ``Author``, ``Score`` and ``Size`` (the number
        of documents of that author), sorted by ``Score`` in descending order.
    """
    query_vec = model.get_author_topics(name)

    rows = []
    # Position i in the similarity list is looked up as author id i.
    for author_id, score in enumerate(get_sims(query_vec)):
        candidate = model.id2author[author_id]
        n_docs = len(model.author2doc[candidate])
        if n_docs >= smallest_author:
            rows.append((candidate, score, n_docs))

    ranked = pd.DataFrame(rows, columns=['Author', 'Score', 'Size'])
    return ranked.sort_values('Score', ascending=False)[:top_n]

In [11]:
# Show the authors most similar to 'EPN', keeping at most 136 rows.
get_table('EPN',top_n=136)

Unnamed: 0,Author,Score,Size
58,EPN,1.000000,16
247,VictorSilvaMich,0.661811,40
110,JoseAMeadeK,0.658799,67
167,NallelyGtzR,0.655272,21
259,alfredodelmazo,0.648670,9
37,CarlosPuenteZAC,0.646813,49
306,ildefonsogv,0.646709,6
344,ralbores,0.644565,31
354,susanacorellap,0.642351,69
363,yulmarocha,0.638911,86
