In [1]:
import os
import time
import numpy as np
import pandas as pd
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.models.wrappers import DtmModel

# Data preprocessing

In [2]:
data = pd.read_json(
    "../data/raw/210112 - evp_integrum.json",
    orient="records",
    convert_dates=False
)

In [3]:
data["date"] = pd.to_datetime(data["date"])
data = data.loc[data["date"].notnull() & data["fulltext"].notnull()].copy()
data = data.loc[data["date"] <= "2021-01-01"].copy()
data = data.sort_values("date").reset_index(drop=True)
data = data.assign(year=data["date"].dt.year).drop("date", 1)
data.shape

(13062, 21)

In [4]:
# I'm using sample of 10 docs because I can't run the whole corpus
np.random.seed(1)
data = data.sample(100).reset_index(drop=True)

In [5]:
#now we count the number of full texts in each years
dates_count = data.groupby("year").agg(n_paragraphs = ("fulltext", "size"))
dates_count = dates_count.loc[dates_count['n_paragraphs'].ne(0)].reset_index()
dates_count["n_paragraphs"].sum()

100

In [6]:
time_seq = dates_count["n_paragraphs"].to_list()
sum(time_seq)

100

# Dictionary and BOWs

In [7]:
# create a dictionary
dictionary = corpora.Dictionary(data['fulltext'].str.split())  
print(f'{len(dictionary)} tokens overall') 

56987 tokens overall


In [8]:
# now we'll filter the tokens that appear too frequently or are too rare
dictionary.filter_extremes(no_below = 2, no_above = 0.99, keep_n=200000)
dictionary.compactify()  # make token IDs sequential

In [9]:
data['bows'] = data['fulltext'].str.split().apply(dictionary.doc2bow)  # convert documents (list of tokens) to BOWs
# print(data['bows'][0]) #what we have

# DTM model training

In [10]:
start = time.time()
model = DtmModel(
    dtm_path="../models/bin/dtm-win32.exe",
    corpus=data["bows"].values,
    time_slices=time_seq,
    num_topics=20,
    id2word=dictionary,
    initialize_lda=True,
    top_chain_var=0.05
)
finish = time.time()

In [11]:
finish - start

1677.0011222362518

In [12]:
model.save("../models/dtm_sample.model")

---

# topic weights over time

In [13]:
model = DtmModel.load("../models/dtm_sample.model")

In [14]:
time_seq_d = dates_count.set_index(dates_count["year"])["n_paragraphs"].to_dict()

topics = {}
for idx, year in enumerate(time_seq_d.keys()):
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(time=idx,corpus=data['bows'])
    topics[year] = doc_topic

In [15]:
topics.keys()

dict_keys([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [16]:
len(topics[2011])

100

In [17]:
topics[2011][0]

array([4.82625483e-05, 4.82625483e-05, 4.82625483e-05, 1.71586801e-01,
       5.22611085e-02, 4.82625483e-05, 4.82625483e-05, 4.82625483e-05,
       4.82625483e-05, 4.82625483e-05, 4.82625483e-05, 4.82625483e-05,
       4.82625483e-05, 4.82625483e-05, 4.82625483e-05, 4.82625483e-05,
       7.75331628e-01, 4.82625483e-05, 4.82625483e-05, 4.82625483e-05])

In [18]:
# first_doc_index = 0
# for year, time in zip(time_seq_d.keys(), time_seq):
#     last_doc_index = first_doc_index + time
#     topics[f"{year}_seq"] = topics[year][first_doc_index:last_doc_index]
#     first_doc_index = first_doc_index + year_slice

In [19]:
doc_topic_matrix = pd.DataFrame() # create a placeholder df
for k, v in topics.items(): #iterate over keys and values in our dictionary with matrices
    x = pd.DataFrame.from_records(v) #create a df from the current slice matrix
    x['year_pub'] = k #create a column for year and assign the key value to it
    doc_topic_matrix = pd.concat([doc_topic_matrix, x]) #now append to our placeholder df
#let's see how it looks
doc_topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year_pub
0,0.000048,0.000048,0.000048,0.171587,0.052261,0.000048,0.000048,0.000048,0.000048,0.000048,...,0.000048,0.000048,0.000048,0.000048,0.000048,0.775332,0.000048,0.000048,0.000048,2010
1,0.000002,0.001710,0.000429,0.000890,0.000002,0.002634,0.769571,0.013820,0.005218,0.017500,...,0.016004,0.001014,0.000002,0.048252,0.029377,0.000002,0.034951,0.047092,0.010518,2010
2,0.007918,0.005394,0.000008,0.000008,0.302420,0.000008,0.000008,0.000008,0.000008,0.000008,...,0.000008,0.542693,0.000008,0.086973,0.000008,0.022146,0.000008,0.000008,0.032349,2010
3,0.000053,0.060981,0.000053,0.000053,0.114130,0.000053,0.000053,0.000053,0.000053,0.022166,...,0.000053,0.000053,0.000053,0.000053,0.000053,0.000053,0.801877,0.000053,0.000053,2010
4,0.032896,0.002751,0.000005,0.040469,0.055769,0.479930,0.000005,0.000005,0.000005,0.001697,...,0.043482,0.044202,0.126990,0.063672,0.008880,0.000005,0.025197,0.000005,0.035455,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000004,0.000004,0.018528,0.023858,0.000004,0.013311,0.004325,0.050127,0.027945,0.018574,...,0.044915,0.034445,0.000004,0.649708,0.000004,0.034004,0.000326,0.000004,0.000004,2020
96,0.000006,0.027301,0.000006,0.162765,0.023639,0.000006,0.155819,0.011485,0.091825,0.233160,...,0.021139,0.006041,0.000006,0.042297,0.038262,0.000006,0.040627,0.046293,0.055777,2020
97,0.038127,0.035607,0.001195,0.010232,0.025267,0.014871,0.012045,0.013009,0.012937,0.012700,...,0.027114,0.037850,0.021245,0.025708,0.000002,0.027185,0.027677,0.002455,0.628411,2020
98,0.000010,0.007723,0.000010,0.134999,0.000010,0.000010,0.006537,0.053835,0.000010,0.000010,...,0.000010,0.000010,0.000010,0.000010,0.027748,0.310955,0.458073,0.000010,0.000010,2020


In [20]:
doc_topic_matrix.columns = [f"topic_{c}" if isinstance(c, int) else c for c in doc_topic_matrix.columns]

In [21]:
doc_topic_matrix

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,year_pub
0,0.000048,0.000048,0.000048,0.171587,0.052261,0.000048,0.000048,0.000048,0.000048,0.000048,...,0.000048,0.000048,0.000048,0.000048,0.000048,0.775332,0.000048,0.000048,0.000048,2010
1,0.000002,0.001710,0.000429,0.000890,0.000002,0.002634,0.769571,0.013820,0.005218,0.017500,...,0.016004,0.001014,0.000002,0.048252,0.029377,0.000002,0.034951,0.047092,0.010518,2010
2,0.007918,0.005394,0.000008,0.000008,0.302420,0.000008,0.000008,0.000008,0.000008,0.000008,...,0.000008,0.542693,0.000008,0.086973,0.000008,0.022146,0.000008,0.000008,0.032349,2010
3,0.000053,0.060981,0.000053,0.000053,0.114130,0.000053,0.000053,0.000053,0.000053,0.022166,...,0.000053,0.000053,0.000053,0.000053,0.000053,0.000053,0.801877,0.000053,0.000053,2010
4,0.032896,0.002751,0.000005,0.040469,0.055769,0.479930,0.000005,0.000005,0.000005,0.001697,...,0.043482,0.044202,0.126990,0.063672,0.008880,0.000005,0.025197,0.000005,0.035455,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000004,0.000004,0.018528,0.023858,0.000004,0.013311,0.004325,0.050127,0.027945,0.018574,...,0.044915,0.034445,0.000004,0.649708,0.000004,0.034004,0.000326,0.000004,0.000004,2020
96,0.000006,0.027301,0.000006,0.162765,0.023639,0.000006,0.155819,0.011485,0.091825,0.233160,...,0.021139,0.006041,0.000006,0.042297,0.038262,0.000006,0.040627,0.046293,0.055777,2020
97,0.038127,0.035607,0.001195,0.010232,0.025267,0.014871,0.012045,0.013009,0.012937,0.012700,...,0.027114,0.037850,0.021245,0.025708,0.000002,0.027185,0.027677,0.002455,0.628411,2020
98,0.000010,0.007723,0.000010,0.134999,0.000010,0.000010,0.006537,0.053835,0.000010,0.000010,...,0.000010,0.000010,0.000010,0.000010,0.027748,0.310955,0.458073,0.000010,0.000010,2020


In [22]:
topics_over_time = pd.concat([data, doc_topic_matrix.reset_index(drop = True)], axis = 1)
topics_over_time.drop(columns = ['filename', 'year_pub'], inplace = True)

# topics_over_time['year'] = pd.to_datetime(topics_over_time['year'], format = '%Y')
topics_over_time.head()

Unnamed: 0,uid,name,author,pubtitle,words,score,title,pages,url,fulltext,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,4317.0,«Тополь» с экспериментальной боевой частью пор...,,,,,,,,Источник Официальный сайт Екатеринбурга ekburg...,...,4.8e-05,4.8e-05,4.8e-05,4.8e-05,4.8e-05,4.8e-05,0.775332,4.8e-05,4.8e-05,4.8e-05
1,6632.0,,Шкварун М. А.,Социально-политические науки,,,ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН\r\nБЛИЖНЕВОСТОЧНОГ...,,,Шкварун М А ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН БЛИЖНЕ...,...,0.001012,0.016004,0.001014,2e-06,0.048252,0.029377,2e-06,0.034951,0.047092,0.010518
2,4042.0,Александр Гольц: Владимир Владимирович сердится…,,,,,,,,Источник Владимир Рыжков сайт депутата Госдум...,...,8e-06,8e-06,0.542693,8e-06,0.086973,8e-06,0.022146,8e-06,8e-06,0.032349
3,1757.0,Румынские министры иностранных дел и обороны в...,,,,,,,,Источник ТАСС Мировые новости Дата выпуска ...,...,5.3e-05,5.3e-05,5.3e-05,5.3e-05,5.3e-05,5.3e-05,5.3e-05,0.801877,5.3e-05,5.3e-05
4,1586.0,Полоса 3 .,,,,,,,,Источник Независимая газета Дата выпуска Ном...,...,0.038581,0.043482,0.044202,0.12699,0.063672,0.00888,5e-06,0.025197,5e-06,0.035455


In [23]:
topics_over_time = pd.melt(topics_over_time, id_vars = ['author', 'database', 'doi', 
                                                    'fulltext', 'place','pubtitle', 'title', 'url', 
                                                    'year', 'bows'], 
            value_vars = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 
                          'topic_7', 'topic_8', 'topic_9', 'topic_10', 'topic_11', 'topic_12', 
                          'topic_13', 'topic_14', 'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19', ],
            var_name = 'topic_num',
            value_name = 'topic_weight')
topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight
0,,Информация официальных учреждений,,Источник Официальный сайт Екатеринбурга ekburg...,,,,,2011.0,"[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,4.8e-05
1,Шкварун М. А.,UDB_EDU,,Шкварун М А ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН БЛИЖНЕ...,,Социально-политические науки,ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН\r\nБЛИЖНЕВОСТОЧНОГ...,,2018.0,"[(2, 36), (11, 24), (18, 3), (24, 1), (35, 4),...",topic_0,2e-06
2,,Федеральные интернет-издания,,Источник Владимир Рыжков сайт депутата Госдум...,,,,,2012.0,"[(2, 1), (4, 1), (6, 1), (7, 1), (35, 2), (40,...",topic_0,0.007918


In [24]:
years = [*topics.keys()]

In [25]:
term_topics_by_time = {}
for t, year in enumerate(years):
    # since we have 20 topics - range(20)
    topics = []
    for n in range(20): 
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        topics.append(
            # round probability of each word to three values 1.111
            [(np.around(prob, 3), word) for prob, word in current_topic]
        )
    term_topics_by_time[year] = topics

In [26]:
topic_term_matrix = pd.DataFrame(term_topics_by_time) # create a df from the dictionary 
topic_term_matrix.head(5)

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"[(0.036, не), (0.02, на), (0.019, что), (0.019...","[(0.037, не), (0.02, на), (0.019, что), (0.018...","[(0.039, не), (0.02, на), (0.019, что), (0.017...","[(0.044, не), (0.023, что), (0.022, на), (0.01...","[(0.051, не), (0.026, что), (0.023, на), (0.02...","[(0.053, не), (0.028, что), (0.024, на), (0.02...","[(0.052, не), (0.026, что), (0.024, на), (0.01...","[(0.05, не), (0.024, что), (0.023, на), (0.019...","[(0.048, не), (0.023, на), (0.022, что), (0.02...","[(0.046, не), (0.024, но), (0.023, на), (0.02,...","[(0.045, не), (0.025, но), (0.023, на), (0.02,..."
1,"[(0.03, США), (0.014, для), (0.013, с), (0.013...","[(0.028, США), (0.014, для), (0.013, с), (0.01...","[(0.02, США), (0.014, с), (0.014, для), (0.013...","[(0.019, США), (0.013, с), (0.013, для), (0.01...","[(0.018, США), (0.013, для), (0.013, с), (0.01...","[(0.02, США), (0.012, по), (0.012, для), (0.01...","[(0.02, США), (0.014, безопасности), (0.013, п...","[(0.019, США), (0.017, безопасности), (0.014, ...","[(0.021, безопасности), (0.018, США), (0.014, ...","[(0.019, США), (0.015, безопасности), (0.013, ...","[(0.018, США), (0.015, безопасности), (0.013, ..."
2,"[(0.023, что), (0.022, не), (0.016, его), (0.0...","[(0.023, что), (0.022, не), (0.017, его), (0.0...","[(0.023, что), (0.022, не), (0.017, его), (0.0...","[(0.023, что), (0.021, не), (0.021, –), (0.018...","[(0.036, –), (0.023, что), (0.022, не), (0.018...","[(0.068, –), (0.022, не), (0.022, что), (0.018...","[(0.117, –), (0.02, не), (0.019, что), (0.016,...","[(0.157, –), (0.019, не), (0.017, что), (0.016...","[(0.145, –), (0.019, не), (0.017, что), (0.017...","[(0.152, –), (0.018, не), (0.017, его), (0.016...","[(0.122, –), (0.019, не), (0.018, его), (0.016..."
3,"[(0.059, на), (0.014, с), (0.012, года), (0.01...","[(0.051, на), (0.014, с), (0.012, В), (0.011, ...","[(0.044, на), (0.015, с), (0.012, В), (0.011, ...","[(0.036, на), (0.015, с), (0.013, В), (0.011, ...","[(0.036, на), (0.016, с), (0.013, В), (0.011, ...","[(0.034, на), (0.016, с), (0.013, В), (0.011, ...","[(0.034, на), (0.016, с), (0.012, В), (0.011, ...","[(0.034, на), (0.015, с), (0.013, по), (0.01, ...","[(0.035, на), (0.015, с), (0.014, по), (0.01, ...","[(0.033, на), (0.017, с), (0.014, по), (0.01, ...","[(0.033, на), (0.019, с), (0.014, по), (0.011,..."
4,"[(0.021, что), (0.02, на), (0.02, США), (0.012...","[(0.023, США), (0.019, на), (0.019, что), (0.0...","[(0.022, на), (0.021, США), (0.017, что), (0.0...","[(0.024, на), (0.02, США), (0.015, что), (0.01...","[(0.028, на), (0.019, США), (0.014, что), (0.0...","[(0.027, на), (0.024, США), (0.015, России), (...","[(0.029, на), (0.026, США), (0.016, России), (...","[(0.029, США), (0.026, на), (0.016, России), (...","[(0.034, США), (0.024, на), (0.019, России), (...","[(0.037, США), (0.023, на), (0.018, что), (0.0...","[(0.035, США), (0.023, на), (0.018, что), (0.0..."


In [27]:
topic_term_table = topic_term_matrix.T # we need to turn it
topic_term_table['year'] = topic_term_table.index.astype(int) # create an index (now indexed by years)
topic_term_table.reset_index(inplace = True, drop = True)
topic_term_table.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year
0,"[(0.036, не), (0.02, на), (0.019, что), (0.019...","[(0.03, США), (0.014, для), (0.013, с), (0.013...","[(0.023, что), (0.022, не), (0.016, его), (0.0...","[(0.059, на), (0.014, с), (0.012, года), (0.01...","[(0.021, что), (0.02, на), (0.02, США), (0.012...","[(0.024, –), (0.017, не), (0.017, на), (0.013,...","[(0.024, на), (0.017, с), (0.011, –), (0.01, t...","[(0.025, НАТО), (0.019, на), (0.013, –), (0.01...","[(0.03, не), (0.019, с), (0.018, что), (0.015,...","[(0.023, на), (0.014, по), (0.013, с), (0.011,...",...,"[(0.018, не), (0.017, что), (0.015, бы), (0.01...","[(0.039, не), (0.03, что), (0.016, на), (0.012...","[(0.019, на), (0.019, что), (0.014, по), (0.01...","[(0.017, на), (0.017, США), (0.016, России), (...","[(0.015, на), (0.013, В), (0.012, –), (0.011, ...","[(0.02, на), (0.013, с), (0.01, по), (0.009, В...","[(0.026, на), (0.019, с), (0.016, В), (0.015, ...","[(0.015, с), (0.013, о), (0.011, по), (0.009, ...","[(0.021, не), (0.017, что), (0.014, с), (0.013...",2010
1,"[(0.037, не), (0.02, на), (0.019, что), (0.018...","[(0.028, США), (0.014, для), (0.013, с), (0.01...","[(0.023, что), (0.022, не), (0.017, его), (0.0...","[(0.051, на), (0.014, с), (0.012, В), (0.011, ...","[(0.023, США), (0.019, на), (0.019, что), (0.0...","[(0.018, не), (0.017, –), (0.016, на), (0.013,...","[(0.023, на), (0.017, с), (0.009, –), (0.009, ...","[(0.026, НАТО), (0.02, на), (0.014, –), (0.01,...","[(0.031, не), (0.019, с), (0.018, что), (0.015...","[(0.024, на), (0.014, по), (0.013, с), (0.012,...",...,"[(0.018, не), (0.016, что), (0.016, бы), (0.01...","[(0.044, не), (0.026, что), (0.016, на), (0.01...","[(0.019, что), (0.018, на), (0.013, по), (0.01...","[(0.017, на), (0.016, России), (0.013, США), (...","[(0.015, на), (0.013, В), (0.012, РАН), (0.012...","[(0.021, на), (0.012, с), (0.011, по), (0.009,...","[(0.026, на), (0.017, с), (0.015, В), (0.015, ...","[(0.017, с), (0.011, по), (0.011, о), (0.009, ...","[(0.022, не), (0.016, что), (0.014, с), (0.013...",2011
2,"[(0.039, не), (0.02, на), (0.019, что), (0.017...","[(0.02, США), (0.014, с), (0.014, для), (0.013...","[(0.023, что), (0.022, не), (0.017, его), (0.0...","[(0.044, на), (0.015, с), (0.012, В), (0.011, ...","[(0.022, на), (0.021, США), (0.017, что), (0.0...","[(0.018, не), (0.016, на), (0.013, а), (0.012,...","[(0.023, на), (0.018, с), (0.009, В), (0.008, ...","[(0.024, НАТО), (0.021, на), (0.01, не), (0.01...","[(0.028, не), (0.019, с), (0.019, что), (0.015...","[(0.023, на), (0.015, с), (0.013, по), (0.011,...",...,"[(0.018, не), (0.017, что), (0.015, бы), (0.01...","[(0.046, не), (0.025, что), (0.016, на), (0.01...","[(0.019, что), (0.018, на), (0.013, по), (0.01...","[(0.016, на), (0.014, России), (0.009, США), (...","[(0.015, на), (0.013, В), (0.012, РАН), (0.012...","[(0.023, на), (0.014, с), (0.011, В), (0.011, ...","[(0.024, на), (0.017, с), (0.016, В), (0.015, ...","[(0.021, с), (0.011, по), (0.01, о), (0.009, н...","[(0.022, не), (0.019, что), (0.015, с), (0.013...",2012


In [28]:
topic_term_table.columns = [f"topic_{c}" if isinstance(c, int) else c for c in topic_term_table.columns]

In [29]:
topic_term_table = pd.melt(topic_term_table, id_vars = 'year', 
                           value_vars = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 
                                         'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 
                                         'topic_10', 'topic_11', 'topic_12', 'topic_13', 'topic_14', 
                                         'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19'], 
                           var_name = 'topic_num', value_name = 'terms')

In [30]:
topics_over_time = topics_over_time.merge(topic_term_table, how = 'left', on = ['year', 'topic_num'])

topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight,terms
0,,Информация официальных учреждений,,Источник Официальный сайт Екатеринбурга ekburg...,,,,,2011.0,"[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,4.8e-05,"[(0.037, не), (0.02, на), (0.019, что), (0.018..."
1,Шкварун М. А.,UDB_EDU,,Шкварун М А ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН БЛИЖНЕ...,,Социально-политические науки,ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН\r\nБЛИЖНЕВОСТОЧНОГ...,,2018.0,"[(2, 36), (11, 24), (18, 3), (24, 1), (35, 4),...",topic_0,2e-06,"[(0.048, не), (0.023, на), (0.022, что), (0.02..."
2,,Федеральные интернет-издания,,Источник Владимир Рыжков сайт депутата Госдум...,,,,,2012.0,"[(2, 1), (4, 1), (6, 1), (7, 1), (35, 2), (40,...",topic_0,0.007918,"[(0.039, не), (0.02, на), (0.019, что), (0.017..."


In [31]:
topics_over_time.drop(columns = ['bows', 'doi'], inplace = True)

In [32]:
topics_over_time = topics_over_time.loc[topics_over_time["fulltext"].notnull()].reset_index(drop=True).copy()

In [33]:
def simplify(l):
    return [f"{str(prob)} {word}" for prob, word in l]

In [34]:
topics_over_time["terms"] = topics_over_time["terms"].apply(simplify)

In [35]:
topics_over_time.to_json(
    "../data/processed/topics_over_time.json", 
    lines=True,
    orient="records", 
    force_ascii=False, 
    date_format="iso"
)

# term weights over time

In [36]:
topics = []
for t, year in enumerate(years):
    for n in range(20):
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        topics.extend(
            [list(term) + [year, n] for term in current_topic]
        )

In [37]:
topics[0]

[0.03571875269541267, 'не', 2010, 0]

In [38]:
terms_by_time = pd.DataFrame(topics, columns = ['weight', 'term', 'year', 'topic n'])
terms_by_time.head()

Unnamed: 0,weight,term,year,topic n
0,0.035719,не,2010,0
1,0.019504,на,2010,0
2,0.018748,что,2010,0
3,0.018746,он,2010,0
4,0.014905,с,2010,0


In [39]:
terms_by_time.to_csv("../data/processed/terms_by_time.csv", index=False)