In [1]:
from gensim.models import AuthorTopicModel

# Load a saved AuthorTopicModel from 'model.atmodel' (path is relative to the working directory).
# NOTE(review): gensim's load presumably unpickles the file — only load model files from a trusted source; confirm.
model = AuthorTopicModel.load('model.atmodel')

In [2]:
# Display up to 100 topics; each entry is a (topic_id, 'weight*"word" + ...') pair.
model.show_topics(num_topics=100)

[(0,
  '0.016*"sección" + 0.014*"pepe" + 0.014*"radiactivo" + 0.011*"gilberto" + 0.010*"ip" + 0.010*"grillo" + 0.008*"portilla" + 0.008*"león_portilla" + 0.008*"lenguas" + 0.008*"traducir"'),
 (1,
  '0.000*"conducen" + 0.000*"recibes" + 0.000*"achiquini" + 0.000*"mascotas" + 0.000*"vacunar" + 0.000*"agencias" + 0.000*"pañales" + 0.000*"tania" + 0.000*"martibatres" + 0.000*"automtico"'),
 (2,
  '0.143*"lomsyi" + 0.067*"notieress" + 0.022*"LoMsYI COLUMNA" + 0.016*"padremachorro" + 0.013*"libertadreligiosa" + 0.013*"Mxico Entrate" + 0.013*"TU VOZ" + 0.013*"mazapereda" + 0.011*"COLUMNA   " + 0.011*"LoMsYI CRISIS"'),
 (3,
  '0.055*"ms_leaks" + 0.030*"PRI Asamblea" + 0.020*"serpienyescaler" + 0.018*"presidenciables" + 0.016*"jmcartoon" + 0.012*"dictaminar" + 0.012*"visióndefuturo" + 0.012*"candado" + 0.009*"Ms leaks:" + 0.008*"pontifica"'),
 (4,
  '0.024*"arnemx" + 0.018*"monicamateosv" + 0.012*"nimio" + 0.012*"esrfmx" + 0.012*"endilguemos" + 0.011*"gobernante" + 0.008*"amplias" + 0.008*"con

In [3]:
# Map every author name known to the model to its list of (topic_id, weight) pairs.
aut_top = {
    author: model.get_author_topics(author)
    for author in model.id2author.values()
}

In [4]:
# Display the full author -> [(topic_id, weight), ...] mapping built in the previous cell.
aut_top

{'ADNPolitico': [(7, 0.1542607438243552),
  (33, 0.033540125669744167),
  (35, 0.14533255777544643),
  (38, 0.16859404786902896),
  (41, 0.15468233766521852),
  (49, 0.083716288519736892),
  (97, 0.25515041426746332)],
 'ActualidadRT': [(7, 0.11080162819908797),
  (33, 0.096946560779406346),
  (35, 0.18907856757464397),
  (38, 0.037500571735262334),
  (49, 0.020446242909530267),
  (94, 0.28699248942392264),
  (97, 0.25755704121376799)],
 'Adela_Micha': [(27, 0.34995192278475434),
  (33, 0.14357488320089262),
  (35, 0.16000733319165841),
  (38, 0.047825471006611213),
  (49, 0.21186936610281701),
  (97, 0.082278960598233211)],
 'Amsalazar': [(7, 0.13681675320173425),
  (8, 0.061917423833204893),
  (24, 0.010700341514218016),
  (33, 0.028421630797237345),
  (35, 0.051523491221393977),
  (38, 0.17201701003197681),
  (44, 0.046439955338640003),
  (49, 0.32875691556351078),
  (96, 0.077275640135725965),
  (97, 0.074142529401211008)],
 'AnaPOrdorica': [(7, 0.12306570560569173),
  (33, 0.02194

In [5]:
%%time
from sklearn.manifold import TSNE

# Authors with fewer documents than this are left out of the embedding.
smallest_author = 0

# Row indices into model.state.gamma for every author that passes the size filter.
authors = [
    author_id
    for author_name, author_id in model.author2id.items()
    if len(model.author2doc[author_name]) >= smallest_author
]

# Fixed random_state so the 2-D layout is repeatable between runs.
tsne = TSNE(n_components=2, random_state=0)

# The return value is discarded; later cells read the coordinates from tsne.embedding_.
_ = tsne.fit_transform(model.state.gamma[authors, :])

CPU times: user 972 ms, sys: 40 ms, total: 1.01 s
Wall time: 836 ms


In [6]:

from bokeh.io import output_file
# Direct bokeh output to 'grafica.html' (relative path) for the plot shown in the next cell.
output_file('grafica.html')

In [7]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

# Split the 2-D t-SNE embedding into its two coordinate columns.
x, y = tsne.embedding_[:, 0], tsne.embedding_[:, 1]

# Names in the same order as the rows that were fed to t-SNE.
author_names = [model.id2author[author_id] for author_id in authors]

# Marker radius is the author's document count multiplied by `scale`.
scale = 0.01
author_sizes = [len(model.author2doc[author_name]) for author_name in author_names]
radii = [n_docs * scale for n_docs in author_sizes]

# Column names here are referenced by string in the hover tooltips and the glyph call.
source = ColumnDataSource(data={
    'x': x,
    'y': y,
    'author_names': author_names,
    'author_sizes': author_sizes,
    'radii': radii,
})

# Hovering a marker shows the author's name and document count.
hover = HoverTool(tooltips=[
    ("author", "@author_names"),
    ("size", "@author_sizes"),
])

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [8]:
from gensim.similarities import MatrixSimilarity


# Build a similarity index from model[...] evaluated on every author name in model.id2author.
# NOTE(review): `index` is not referenced by the cells visible below — confirm whether it is still needed.
index = MatrixSimilarity(model[list(model.id2author.values())])



In [9]:

from gensim import matutils
import pandas as pd


# Topic vector of every author, in model.id2author.values() iteration order.
# get_sims() scores against this list, so its results follow the same order.
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

def similarity(vec1, vec2):
    """Return a similarity score for two sparse topic vectors.

    Both vectors are expanded to dense form of length ``model.num_topics``,
    their Hellinger distance ``d`` is computed, and the result is
    ``1 / (1 + d)``.
    """
    dense1 = matutils.sparse2full(vec1, model.num_topics)
    dense2 = matutils.sparse2full(vec2, model.num_topics)
    return 1.0 / (1.0 + matutils.hellinger(dense1, dense2))

def get_sims(vec):
    """Score ``vec`` against each entry of ``author_vecs``, preserving that list's order."""
    scores = []
    for other in author_vecs:
        scores.append(similarity(vec, other))
    return scores

def get_table(name, top_n=10, smallest_author=1):
    """Build a table of the authors most similar to ``name``.

    Parameters
    ----------
    name
        Author name; passed to ``model.get_author_topics``.
    top_n
        Number of rows kept after sorting (applied as a slice ``[:top_n]``).
    smallest_author
        Authors with fewer documents than this are excluded from the table.

    Returns
    -------
    pandas.DataFrame
        Columns ``Author``, ``Score`` and ``Size``, sorted by descending
        ``Score``. The queried author is included when it passes the size filter.
    """
    sims = get_sims(model.get_author_topics(name))

    # get_sims() scores against author_vecs, which is built by iterating
    # model.id2author.values(). Pairing the scores with that same iteration
    # keeps names and scores aligned without assuming author ids are the
    # contiguous positions 0..n-1 in iteration order.
    table = []
    for author_name, sim in zip(model.id2author.values(), sims):
        author_size = len(model.author2doc[author_name])
        if author_size >= smallest_author:
            table.append((author_name, sim, author_size))

    df = pd.DataFrame(table, columns=['Author', 'Score', 'Size'])
    return df.sort_values('Score', ascending=False)[:top_n]

In [10]:
# Rank authors by similarity to 'Pajaropolitico'.
# NOTE(review): top_n=136 is presumably meant to cover every author — confirm against the number of authors in the model.
get_table('Pajaropolitico',top_n=136)

Unnamed: 0,Author,Score,Size
43,Pajaropolitico,1.000000,726
111,sdpnoticias,0.776769,53
77,elsolde_mexico,0.723255,721
93,lacronicadehoy,0.719705,699
96,lopezdoriga,0.719275,658
6,AztecaNoticias,0.718762,197
35,MarioBeteta,0.717402,439
39,NoticiasMVS,0.714790,1040
108,revistaproceso,0.712646,386
49,Reforma,0.704470,827
