In [1]:
import os
import time
import numpy as np
import pandas as pd
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.models.wrappers import DtmModel

# Data preprocessing

In [2]:
data = pd.read_json(
    "../data/raw/210112 - evp_integrum.json",
    orient="records",
    convert_dates=False
)

In [3]:
data["date"] = pd.to_datetime(data["date"])
data = data.loc[data["date"].notnull() & data["fulltext"].notnull()].copy()
data = data.loc[data["date"] <= "2021-01-01"].copy()
data = data.sort_values("date").reset_index(drop=True)
data = data.assign(year=data["date"].dt.year).drop("date", 1)
data.shape

(13062, 21)

In [4]:
# I'm using sample of 10 docs because I can't run the whole corpus
np.random.seed(1)
data = data.sample(10).reset_index(drop=True)

In [5]:
#now we count the number of full texts in each years
dates_count = data.groupby("year").agg(n_paragraphs = ("fulltext", "size"))
dates_count = dates_count.loc[dates_count['n_paragraphs'].ne(0)].reset_index()
dates_count["n_paragraphs"].sum()

10

In [6]:
time_seq = dates_count["n_paragraphs"].to_list()
sum(time_seq)

10

# Dictionary and BOWs

In [7]:
# create a dictionary
dictionary = corpora.Dictionary(data['fulltext'].str.split())  
print(f'{len(dictionary)} tokens overall') 

7233 tokens overall


In [8]:
data['bows'] = data['fulltext'].str.split().apply(dictionary.doc2bow)  # convert documents (list of tokens) to BOWs
# print(data['bows'][0]) #what we have

In [9]:
data.head()

Unnamed: 0,uid,name,author,pubtitle,words,score,title,pages,url,fulltext,...,article_title,source_site,rubric,place,size,database,doi,FILE_SOURCE,year,bows
0,4317,«Тополь» с экспериментальной боевой частью пор...,,,,,,,,Источник Официальный сайт Екатеринбурга ekburg...,...,,Официальный сайт Екатеринбурга (ekburg.ru),,,,Информация официальных учреждений,,INTEGRUM,2011,"[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1..."
1,6632,,Шкварун М. А.,Социально-политические науки,,,ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН\r\nБЛИЖНЕВОСТОЧНОГ...,,,Шкварун М А ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН БЛИЖНЕ...,...,Социально-политические науки,,,,,UDB_EDU,,EVP,2018,"[(4, 36), (16, 24), (24, 3), (31, 1), (43, 4),..."
2,4042,Александр Гольц: Владимир Владимирович сердится…,,,,,,,,Источник Владимир Рыжков сайт депутата Госдум...,...,,Владимир Рыжков - сайт депутата Госдумы (ryzko...,,,,Федеральные интернет-издания,,INTEGRUM,2012,"[(4, 1), (6, 1), (9, 1), (11, 1), (43, 2), (45..."
3,1757,Румынские министры иностранных дел и обороны в...,,,,,,,,Источник ТАСС Мировые новости Дата выпуска ...,...,,ТАСС - Мировые новости (архив - 2019),,,,Федеральные информагентства,,INTEGRUM,2016,"[(4, 3), (6, 1), (9, 1), (11, 1), (44, 1), (45..."
4,1586,Полоса 3 .,,,,,,,,Источник Независимая газета Дата выпуска Ном...,...,,Независимая газета (PDF-версия),,,,Федеральная пресса,,INTEGRUM,2014,"[(4, 17), (6, 1), (9, 1), (11, 1), (24, 2), (3..."


# DTM model training

In [10]:
start = time.time()
model = DtmModel(
    dtm_path="../models/bin/dtm-win32.exe",
    corpus=data["bows"],
    time_slices=time_seq,
    num_topics=20,
    id2word=dictionary,
    initialize_lda=True,
    top_chain_var=0.05
)
finish = time.time()

In [11]:
finish - start

In [12]:
model.save("../models/dtm_sample.model")

---

# topic weights over time

In [13]:
model = DtmModel.load("../models/dtm_sample.model")

In [14]:
time_seq_d = dates_count.set_index(dates_count["year"])["n_paragraphs"].to_dict()

topics = {}
for idx, year in enumerate(time_seq_d.keys()):
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(time=idx,corpus=data['bows'])
    topics[year] = doc_topic

In [15]:
topics.keys()

dict_keys([2011, 2012, 2013, 2014, 2016, 2018, 2019, 2020])

In [16]:
len(topics[2011])

10

In [17]:
topics[2011][0]

array([4.16319734e-05, 4.16319734e-05, 4.16319734e-05, 4.16319734e-05,
       4.16319734e-05, 4.16319734e-05, 4.16319734e-05, 4.16319734e-05,
       4.16319734e-05, 1.11893943e-01, 4.16319734e-05, 4.16319734e-05,
       4.16319734e-05, 4.16319734e-05, 6.09015182e-01, 4.16319734e-05,
       1.80074424e-01, 9.83503407e-02, 4.16319734e-05, 4.16319734e-05])

In [18]:
# first_doc_index = 0
# for year, time in zip(time_seq_d.keys(), time_seq):
#     last_doc_index = first_doc_index + time
#     topics[f"{year}_seq"] = topics[year][first_doc_index:last_doc_index]
#     first_doc_index = first_doc_index + year_slice

In [19]:
doc_topic_matrix = pd.DataFrame() # create a placeholder df
for k, v in topics.items(): #iterate over keys and values in our dictionary with matrices
    x = pd.DataFrame.from_records(v) #create a df from the current slice matrix
    x['year_pub'] = k #create a column for year and assign the key value to it
    doc_topic_matrix = pd.concat([doc_topic_matrix, x]) #now append to our placeholder df
#let's see how it looks
doc_topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year_pub
0,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.111894,...,0.000042,0.000042,0.000042,0.609015,0.000042,0.180074,0.098350,0.000042,0.000042,2011
1,0.023420,0.016873,0.007826,0.028084,0.023420,0.017563,0.023420,0.013384,0.013011,0.009268,...,0.007865,0.023420,0.019625,0.009839,0.028670,0.066194,0.047022,0.023420,0.590917,2011
2,0.000007,0.000007,0.000007,0.016136,0.000007,0.000007,0.000007,0.754111,0.000007,0.028970,...,0.000007,0.000007,0.000007,0.000007,0.068599,0.093438,0.006113,0.000007,0.000007,2011
3,0.000040,0.000040,0.617341,0.000040,0.000040,0.000040,0.000040,0.000040,0.009005,0.000040,...,0.000040,0.000040,0.000040,0.008281,0.000040,0.000040,0.315285,0.000040,0.000040,2011
4,0.018828,0.010156,0.011769,0.013123,0.018828,0.015030,0.018828,0.060929,0.003780,0.040497,...,0.020616,0.018828,0.025244,0.004867,0.560443,0.070086,0.053207,0.018828,0.003750,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,0.000008,0.005390,0.045475,0.023700,0.000008,0.365472,0.000008,0.035167,0.005433,0.064751,...,0.000008,0.000008,0.000008,0.000008,0.121453,0.117423,0.123841,0.000008,0.028750,2020
6,0.000006,0.015011,0.003427,0.000006,0.000006,0.000006,0.000006,0.035803,0.034683,0.011235,...,0.769021,0.000006,0.000006,0.000006,0.069443,0.014580,0.014905,0.000006,0.031837,2020
7,0.000017,0.000017,0.000017,0.000017,0.000017,0.018066,0.000017,0.050827,0.000017,0.751726,...,0.000017,0.000017,0.000017,0.000017,0.091805,0.067039,0.000017,0.000017,0.000017,2020
8,0.000007,0.031336,0.000007,0.509075,0.000007,0.010050,0.000007,0.023093,0.020153,0.021618,...,0.048350,0.000007,0.003787,0.000007,0.027118,0.141133,0.078234,0.000007,0.079093,2020


In [20]:
doc_topic_matrix.columns = [f"topic_{c}" if isinstance(c, int) else c for c in doc_topic_matrix.columns]

In [21]:
doc_topic_matrix

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,year_pub
0,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.111894,...,0.000042,0.000042,0.000042,0.609015,0.000042,0.180074,0.098350,0.000042,0.000042,2011
1,0.023420,0.016873,0.007826,0.028084,0.023420,0.017563,0.023420,0.013384,0.013011,0.009268,...,0.007865,0.023420,0.019625,0.009839,0.028670,0.066194,0.047022,0.023420,0.590917,2011
2,0.000007,0.000007,0.000007,0.016136,0.000007,0.000007,0.000007,0.754111,0.000007,0.028970,...,0.000007,0.000007,0.000007,0.000007,0.068599,0.093438,0.006113,0.000007,0.000007,2011
3,0.000040,0.000040,0.617341,0.000040,0.000040,0.000040,0.000040,0.000040,0.009005,0.000040,...,0.000040,0.000040,0.000040,0.008281,0.000040,0.000040,0.315285,0.000040,0.000040,2011
4,0.018828,0.010156,0.011769,0.013123,0.018828,0.015030,0.018828,0.060929,0.003780,0.040497,...,0.020616,0.018828,0.025244,0.004867,0.560443,0.070086,0.053207,0.018828,0.003750,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,0.000008,0.005390,0.045475,0.023700,0.000008,0.365472,0.000008,0.035167,0.005433,0.064751,...,0.000008,0.000008,0.000008,0.000008,0.121453,0.117423,0.123841,0.000008,0.028750,2020
6,0.000006,0.015011,0.003427,0.000006,0.000006,0.000006,0.000006,0.035803,0.034683,0.011235,...,0.769021,0.000006,0.000006,0.000006,0.069443,0.014580,0.014905,0.000006,0.031837,2020
7,0.000017,0.000017,0.000017,0.000017,0.000017,0.018066,0.000017,0.050827,0.000017,0.751726,...,0.000017,0.000017,0.000017,0.000017,0.091805,0.067039,0.000017,0.000017,0.000017,2020
8,0.000007,0.031336,0.000007,0.509075,0.000007,0.010050,0.000007,0.023093,0.020153,0.021618,...,0.048350,0.000007,0.003787,0.000007,0.027118,0.141133,0.078234,0.000007,0.079093,2020


In [22]:
topics_over_time = pd.concat([data, doc_topic_matrix.reset_index(drop = True)], axis = 1)
topics_over_time.drop(columns = ['filename', 'year_pub'], inplace = True)

# topics_over_time['year'] = pd.to_datetime(topics_over_time['year'], format = '%Y')
topics_over_time.head()

Unnamed: 0,uid,name,author,pubtitle,words,score,title,pages,url,fulltext,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,4317.0,«Тополь» с экспериментальной боевой частью пор...,,,,,,,,Источник Официальный сайт Екатеринбурга ekburg...,...,4.2e-05,4.2e-05,4.2e-05,4.2e-05,0.609015,4.2e-05,0.180074,0.09835,4.2e-05,4.2e-05
1,6632.0,,Шкварун М. А.,Социально-политические науки,,,ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН\r\nБЛИЖНЕВОСТОЧНОГ...,,,Шкварун М А ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН БЛИЖНЕ...,...,0.006761,0.007865,0.02342,0.019625,0.009839,0.02867,0.066194,0.047022,0.02342,0.590917
2,4042.0,Александр Гольц: Владимир Владимирович сердится…,,,,,,,,Источник Владимир Рыжков сайт депутата Госдум...,...,0.032545,7e-06,7e-06,7e-06,7e-06,0.068599,0.093438,0.006113,7e-06,7e-06
3,1757.0,Румынские министры иностранных дел и обороны в...,,,,,,,,Источник ТАСС Мировые новости Дата выпуска ...,...,0.049491,4e-05,4e-05,4e-05,0.008281,4e-05,4e-05,0.315285,4e-05,4e-05
4,1586.0,Полоса 3 .,,,,,,,,Источник Независимая газета Дата выпуска Ном...,...,0.012363,0.020616,0.018828,0.025244,0.004867,0.560443,0.070086,0.053207,0.018828,0.00375


In [23]:
topics_over_time = pd.melt(topics_over_time, id_vars = ['author', 'database', 'doi', 
                                                    'fulltext', 'place','pubtitle', 'title', 'url', 
                                                    'year', 'bows'], 
            value_vars = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 
                          'topic_7', 'topic_8', 'topic_9', 'topic_10', 'topic_11', 'topic_12', 
                          'topic_13', 'topic_14', 'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19', ],
            var_name = 'topic_num',
            value_name = 'topic_weight')
topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight
0,,Информация официальных учреждений,,Источник Официальный сайт Екатеринбурга ekburg...,,,,,2011.0,"[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1...",topic_0,4.2e-05
1,Шкварун М. А.,UDB_EDU,,Шкварун М А ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН БЛИЖНЕ...,,Социально-политические науки,ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН\r\nБЛИЖНЕВОСТОЧНОГ...,,2018.0,"[(4, 36), (16, 24), (24, 3), (31, 1), (43, 4),...",topic_0,0.02342
2,,Федеральные интернет-издания,,Источник Владимир Рыжков сайт депутата Госдум...,,,,,2012.0,"[(4, 1), (6, 1), (9, 1), (11, 1), (43, 2), (45...",topic_0,7e-06


In [26]:
years = [*topics.keys()]

In [27]:
term_topics_by_time = {}
for t, year in enumerate(years):
    # since we have 20 topics - range(20)
    topics = []
    for n in range(20): 
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        topics.append(
            # round probability of each word to three values 1.111
            [(np.around(prob, 3), word) for prob, word in current_topic]
        )
    term_topics_by_time[year] = topics

In [28]:
topic_term_matrix = pd.DataFrame(term_topics_by_time) # create a df from the dictionary 
topic_term_matrix.head(5)

Unnamed: 0,2011,2012,2013,2014,2016,2018,2019,2020
0,"[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С..."
1,"[(0.035, мира), (0.0, Поднебесной), (0.0, Суще...","[(0.035, мира), (0.0, Поднебесной), (0.0, Суще...","[(0.04, мира), (0.0, Поднебесной), (0.0, Сущес...","[(0.045, мира), (0.0, Поднебесной), (0.0, Суще...","[(0.05, мира), (0.0, Поднебесной), (0.0, Сущес...","[(0.057, мира), (0.0, Поднебесной), (0.0, Суще...","[(0.06, мира), (0.0, Поднебесной), (0.0, Сущес...","[(0.061, мира), (0.0, Поднебесной), (0.0, Суще..."
2,"[(0.047, НАТО), (0.044, в), (0.033, и), (0.028...","[(0.047, НАТО), (0.044, в), (0.034, и), (0.029...","[(0.045, НАТО), (0.044, в), (0.034, и), (0.03,...","[(0.044, в), (0.043, НАТО), (0.034, и), (0.032...","[(0.043, в), (0.041, НАТО), (0.04, безопасност...","[(0.044, безопасности), (0.043, в), (0.04, НАТ...","[(0.045, безопасности), (0.043, в), (0.04, НАТ...","[(0.047, безопасности), (0.043, в), (0.04, НАТ..."
3,"[(0.049, и), (0.031, в), (0.012, По), (0.011, ...","[(0.049, и), (0.031, в), (0.013, По), (0.011, ...","[(0.05, и), (0.031, в), (0.012, По), (0.011, с...","[(0.05, и), (0.032, в), (0.012, По), (0.011, с...","[(0.051, и), (0.032, в), (0.012, По), (0.011, ...","[(0.052, и), (0.032, в), (0.012, терроризма), ...","[(0.052, и), (0.032, в), (0.013, терроризма), ...","[(0.052, и), (0.032, в), (0.013, терроризма), ..."
4,"[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С..."


In [29]:
topic_term_table = topic_term_matrix.T # we need to turn it
topic_term_table['year'] = topic_term_table.index.astype(int) # create an index (now indexed by years)
topic_term_table.reset_index(inplace = True, drop = True)
topic_term_table.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year
0,"[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.035, мира), (0.0, Поднебесной), (0.0, Суще...","[(0.047, НАТО), (0.044, в), (0.033, и), (0.028...","[(0.049, и), (0.031, в), (0.012, По), (0.011, ...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.042, и), (0.036, США), (0.018, для), (0.01...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.032, в), (0.028, что), (0.022, не), (0.014...","[(0.058, без), (0.055, страны), (0.028, место)...","[(0.048, в), (0.028, этом), (0.021, и), (0.018...",...,"[(0.049, и), (0.034, в), (0.016, о), (0.01, по...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.028, было), (0.0, Политики), (0.0, Существ...","[(0.031, была), (0.025, с), (0.021, этого), (0...","[(0.04, и), (0.037, в), (0.02, на), (0.019, чт...","[(0.04, в), (0.035, и), (0.015, на), (0.015, г...","[(0.069, и), (0.067, в), (0.043, на), (0.023, ...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.053, в), (0.042, и), (0.021, на), (0.015, ...",2011
1,"[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.035, мира), (0.0, Поднебесной), (0.0, Суще...","[(0.047, НАТО), (0.044, в), (0.034, и), (0.029...","[(0.049, и), (0.031, в), (0.013, По), (0.011, ...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.042, и), (0.036, США), (0.018, для), (0.01...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.032, в), (0.028, что), (0.022, не), (0.014...","[(0.059, без), (0.055, страны), (0.028, место)...","[(0.047, в), (0.028, этом), (0.021, и), (0.018...",...,"[(0.049, и), (0.034, в), (0.016, о), (0.01, по...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.028, было), (0.0, Политики), (0.0, Существ...","[(0.036, была), (0.025, с), (0.022, этого), (0...","[(0.041, и), (0.037, в), (0.02, на), (0.019, ч...","[(0.039, в), (0.035, и), (0.015, на), (0.014, ...","[(0.069, и), (0.065, в), (0.043, на), (0.023, ...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.054, в), (0.042, и), (0.021, на), (0.015, ...",2012
2,"[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.04, мира), (0.0, Поднебесной), (0.0, Сущес...","[(0.045, НАТО), (0.044, в), (0.034, и), (0.03,...","[(0.05, и), (0.031, в), (0.012, По), (0.011, с...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.042, и), (0.036, США), (0.018, для), (0.01...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.032, в), (0.028, что), (0.023, не), (0.015...","[(0.061, без), (0.056, страны), (0.03, место),...","[(0.047, в), (0.025, этом), (0.021, и), (0.019...",...,"[(0.049, и), (0.034, в), (0.016, о), (0.01, по...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.03, было), (0.0, Политики), (0.0, Существу...","[(0.039, была), (0.025, с), (0.023, этого), (0...","[(0.041, и), (0.037, в), (0.02, на), (0.019, ч...","[(0.039, в), (0.035, и), (0.015, на), (0.015, ...","[(0.07, и), (0.066, в), (0.044, на), (0.024, п...","[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С...","[(0.055, в), (0.043, и), (0.021, на), (0.015, ...",2013


In [30]:
topic_term_table.columns = [f"topic_{c}" if isinstance(c, int) else c for c in topic_term_table.columns]

In [31]:
topic_term_table = pd.melt(topic_term_table, id_vars = 'year', 
                           value_vars = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 
                                         'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 
                                         'topic_10', 'topic_11', 'topic_12', 'topic_13', 'topic_14', 
                                         'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19'], 
                           var_name = 'topic_num', value_name = 'terms')

In [32]:
topics_over_time = topics_over_time.merge(topic_term_table, how = 'left', on = ['year', 'topic_num'])

topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight,terms
0,,Информация официальных учреждений,,Источник Официальный сайт Екатеринбурга ekburg...,,,,,2011.0,"[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1...",topic_0,4.2e-05,"[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С..."
1,Шкварун М. А.,UDB_EDU,,Шкварун М А ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН БЛИЖНЕ...,,Социально-политические науки,ПОЛИТИЧЕСКИЕ ПРОБЛЕМЫ СТРАН\r\nБЛИЖНЕВОСТОЧНОГ...,,2018.0,"[(4, 36), (16, 24), (24, 3), (31, 1), (43, 4),...",topic_0,0.02342,"[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С..."
2,,Федеральные интернет-издания,,Источник Владимир Рыжков сайт депутата Госдум...,,,,,2012.0,"[(4, 1), (6, 1), (9, 1), (11, 1), (43, 2), (45...",topic_0,7e-06,"[(0.0, Поднебесной), (0.0, Пентагоне), (0.0, С..."


In [33]:
topics_over_time.drop(columns = ['bows', 'doi'], inplace = True)

In [34]:
topics_over_time = topics_over_time.loc[topics_over_time["fulltext"].notnull()].reset_index(drop=True).copy()

In [36]:
def simplify(l):
    return [f"{str(prob)} {word}" for prob, word in l]

In [37]:
topics_over_time["terms"] = topics_over_time["terms"].apply(simplify)

In [38]:
topics_over_time.to_json(
    "../data/processed/topics_over_time.json", 
    lines=True,
    orient="records", 
    force_ascii=False, 
    date_format="iso"
)

# term weights over time

In [39]:
topics = []
for t, year in enumerate(years):
    for n in range(20):
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        topics.extend(
            [list(term) + [year, n] for term in current_topic]
        )

In [40]:
topics[0]

[0.00013825521913452235, 'Поднебесной', 2011, 0]

In [41]:
terms_by_time = pd.DataFrame(topics, columns = ['weight', 'term', 'year', 'topic n'])
terms_by_time.head()

Unnamed: 0,weight,term,year,topic n
0,0.000138,Поднебесной,2011,0
1,0.000138,Пентагоне,2011,0
2,0.000138,Стратегию,2011,0
3,0.000138,Сегодня,2011,0
4,0.000138,Самую,2011,0


In [42]:
terms_by_time.to_csv("../data/processed/terms_by_time.csv", index=False)