In [1]:
from gensim.models import AuthorTopicModel

# Load an AuthorTopicModel from the file 'model.atmodel' (path relative to the working directory).
# NOTE(review): gensim's load presumably unpickles the file — only load model files from a trusted source.
model = AuthorTopicModel.load('model.atmodel')



In [2]:
# Display the weighted top words for up to 20 of the model's topics.
model.show_topics(num_topics=20)

[(33,
  '0.006*"report" + 0.005*"made" + 0.003*"vocesendirecto" + 0.003*"fmi" + 0.003*"lipton" + 0.003*"weapons" + 0.003*"tech" + 0.002*"forest" + 0.002*"groundhog" + 0.002*"killing"'),
 (34,
  '0.008*"ElFinanciero_Mx Vacación" + 0.008*"Ernesto_MDiaz ManceraMiguelMX No va a decir" + 0.004*"nandorejas" + 0.004*"O sea" + 0.004*"enterado" + 0.004*"vacación" + 0.001*"CPEUM Menos" + 0.001*"cpeum" + 0.001*"JMarquezP Todo" + 0.001*"jmarquezp"'),
 (42,
  '0.089*"lomásyi" + 0.042*"notieress" + 0.014*"LoMásYI COLUMNA" + 0.010*"padremachorro" + 0.008*"México Entérate" + 0.008*"TU VOZ" + 0.008*"mazapereda" + 0.008*"libertadreligiosa" + 0.007*"siredingv" + 0.007*"COLUMNA   "'),
 (84,
  '0.007*"aclaró" + 0.005*"undato" + 0.005*"james" + 0.005*"engomado" + 0.005*"dm" + 0.005*"hoynocircula" + 0.005*"documentoíndigo" + 0.005*"‘Días de" + 0.004*"placas" + 0.004*"Opus …"'),
 (51,
  '0.052*"bbcmundo" + 0.018*"plumaje" + 0.013*"hoyenanimal" + 0.010*"eedienteanimal" + 0.007*"uanl" + 0.006*"pamelarogue" + 0.

In [None]:
# Map every author name in model.id2author to the result of
# model.get_author_topics for that author.
aut_top = {
    author: model.get_author_topics(author)
    for author in model.id2author.values()
}

In [None]:
# Display the author -> topics mapping (one entry per author in the model).
aut_top

In [3]:
%%time
from sklearn.manifold import TSNE
# Authors with fewer documents than this threshold are left out of the embedding.
smallest_author = 0
authors = [
    author_id
    for name, author_id in model.author2id.items()
    if len(model.author2doc[name]) >= smallest_author
]

# Reduce the selected rows of the model's gamma matrix to two dimensions.
tsne = TSNE(n_components=2, random_state=0)
_ = tsne.fit_transform(model.state.gamma[authors, :])  # The embedding is read back from tsne.embedding_

Wall time: 1.33 s


In [4]:

from bokeh.io import output_file
# Direct Bokeh output to the HTML file 'grafica.html' (path relative to the working directory).
output_file('grafica.html')

In [5]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

# Coordinates from the t-SNE fit: first column is x, second column is y.
x, y = tsne.embedding_[:, 0], tsne.embedding_[:, 1]

# Names of the embedded authors, in the same order as the ids in `authors`.
author_names = [model.id2author[author_id] for author_id in authors]

# Each marker's radius is the author's document count multiplied by `scale`.
scale = 0.01
author_sizes = [len(model.author2doc[name]) for name in author_names]
radii = [doc_count * scale for doc_count in author_sizes]

# Column-oriented data backing the scatter plot and the hover tooltips.
source = ColumnDataSource(data={
    'x': x,
    'y': y,
    'author_names': author_names,
    'author_sizes': author_sizes,
    'radii': radii,
})

# Tooltip fields: the author's name and document count.
hover = HoverTool(tooltips=[
    ("author", "@author_names"),
    ("size", "@author_sizes"),
])

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter(
    'x',
    'y',
    source=source,
    radius='radii',
    fill_alpha=0.6,
    line_color=None,
)
show(p)

In [6]:
from gensim.similarities import MatrixSimilarity


# Build a MatrixSimilarity index over the model's output for the list of all author names.
# NOTE(review): `index` is not referenced by any other cell visible in this notebook.
index = MatrixSimilarity(model[list(model.id2author.values())])



In [7]:

from gensim import matutils
import pandas as pd


# Topic vector of every author, in the iteration order of model.id2author.values().
author_vecs = list(map(model.get_author_topics, model.id2author.values()))

def similarity(vec1, vec2):
    """Return a similarity score for two sparse topic vectors.

    Both vectors are expanded to dense vectors of length ``model.num_topics``;
    the score is ``1 / (1 + d)``, where ``d`` is the value returned by
    ``matutils.hellinger`` for the two dense vectors.
    """
    dense1 = matutils.sparse2full(vec1, model.num_topics)
    dense2 = matutils.sparse2full(vec2, model.num_topics)
    return 1.0 / (1.0 + matutils.hellinger(dense1, dense2))

def get_sims(vec):
    """Return the similarity of `vec` to each entry of `author_vecs`, in that order."""
    return [similarity(vec, other) for other in author_vecs]

def get_table(name, top_n=10, smallest_author=1):
    """Rank the model's authors by topic similarity to `name`.

    Args:
        name: Author whose topic vector is compared against every author.
        top_n: Maximum number of rows to return.
        smallest_author: Authors with fewer documents than this are omitted.

    Returns:
        A pandas DataFrame with columns 'Author', 'Score' and 'Size'
        (the author's document count), sorted by 'Score' in descending
        order and cut to the first `top_n` rows.
    """
    sims = get_sims(model.get_author_topics(name))

    # The scores come back in the iteration order of model.id2author.values()
    # (the order author_vecs is built in), so pair names and scores by that
    # order instead of assuming author ids are exactly 0..n-1.
    table = []
    for author_name, sim in zip(model.id2author.values(), sims):
        author_size = len(model.author2doc[author_name])
        if author_size >= smallest_author:
            table.append((author_name, sim, author_size))

    df = pd.DataFrame(table, columns=['Author', 'Score', 'Size'])
    return df.sort_values('Score', ascending=False)[:top_n]

In [8]:
# Show up to 136 authors ranked by similarity to 'Pajaropolitico'.
get_table('Pajaropolitico',top_n=136)

Unnamed: 0,Author,Score,Size
43,Pajaropolitico,1.000000,726
111,sdpnoticias,0.732948,53
108,revistaproceso,0.723680,386
31,LaRazon_mx,0.718128,624
39,NoticiasMVS,0.716460,1040
49,Reforma,0.715041,827
73,diario24horas,0.713279,1099
93,lacronicadehoy,0.713079,699
40,Notimex,0.711929,865
47,PublimetroMX,0.711838,749
