In [1]:
import os
import time
import logging
import numpy as np
import pandas as pd
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.models.wrappers import DtmModel

In [2]:
# Configure the root logger so that DEBUG-and-above records from every library are shown.
logging.basicConfig(level=logging.DEBUG)

# Data preprocessing

In [3]:
# Load the pre-processed corpus; every 'fulltext' cell is passed through eval().
# NOTE(review): eval() executes whatever Python expression the CSV cell contains.
# If the cells are plain list literals (presumably lists of tokens — confirm),
# ast.literal_eval would parse them without executing code.
data = pd.read_csv("./../data/processed/210202 - evp_integrum_ngrams.csv", converters={'fulltext': eval})

In [4]:
# (rows, columns) of the table as loaded.
data.shape

(11836, 24)

In [5]:
# Cast the stored 'year' column to int.
# NOTE(review): the following cell overwrites 'year' with the year taken from 'date'.
data["year"] = data["year"].astype(int)

In [6]:
# Parse the publication dates.
data["date"] = pd.to_datetime(data["date"])

# Keep rows that have both a date and a text, dated no later than 2021-01-01.
has_date_and_text = data["date"].notnull() & data["fulltext"].notnull()
within_range = data["date"] <= "2021-01-01"

# Sort chronologically: the per-year document counts computed later are passed to
# the model as consecutive time slices of this corpus.
data = data.loc[has_date_and_text & within_range].sort_values("date").reset_index(drop=True)

# Derive 'year' from the parsed date, then drop 'date'. `columns=` replaces the
# positional axis argument of `drop("date", 1)`, which is deprecated and no longer
# accepted by pandas 2.x.
data = data.assign(year=data["date"].dt.year).drop(columns="date")
data.shape

(11834, 23)

In [7]:
# # testing
# np.random.seed(1)
# data = data.sample(100).reset_index(drop=True)

In [8]:
# Number of excerpts per publication year, ordered chronologically.
dates_cnt = (
    data['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='excerpt_count')
    .sort_values('year', ascending = True)
)
# Per-year document counts in ascending year order.
time_seq = dates_cnt['excerpt_count'].tolist()

In [9]:
# # now we count the number of full texts in each year
# dates_count = data.groupby("year").agg(n_paragraphs = ("fulltext", "size"))
# dates_count = dates_count.loc[dates_count['n_paragraphs'].ne(0)].reset_index()
# dates_count["n_paragraphs"].sum()

In [10]:
# time_seq = dates_count["n_paragraphs"].to_list()
# Sanity check: the per-year counts should add up to the number of rows in data.
sum(time_seq)

11834

# Dictionary and BOWs

In [11]:
# Build the gensim token dictionary from the tokenised documents in 'fulltext'
# and report how many distinct tokens it holds.
dictionary = corpora.Dictionary(data['fulltext'])  
print(f'{len(dictionary)} tokens overall') 

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary(57704 unique tokens: ['глава', 'достижение', 'запугивание', 'империалист', 'карибский']...)
INFO:gensim.corpora.dictionary:built Dictionary(64835 unique tokens: ['глава', 'достижение', 'запугивание', 'империалист', 'карибский']...) from 11834 documents (total 1504794 corpus positions)


64835 tokens overall


In [12]:
# Prune the vocabulary: drop tokens that occur in fewer than 2 documents or in more
# than 99% of the documents, and keep at most 200000 of the remaining tokens.
dictionary.filter_extremes(no_below = 2, no_above = 0.99, keep_n=200000)
# NOTE(review): the log shows the dictionary already being rebuilt ("shrinking gaps")
# during filter_extremes, so this extra call looks redundant — confirm before removing.
dictionary.compactify()  # make token IDs sequential

INFO:gensim.corpora.dictionary:discarding 23551 tokens: [('катастрофичность', 1), ('империаль', 1), ('стический', 1), ('воеын', 1), ('канск', 1), ('перенестись', 1), ('байбак', 1), ('антимилитаристический', 1), ('бом', 1), ('внепартийность', 1)]...
INFO:gensim.corpora.dictionary:keeping 41284 tokens which were in no less than 2 and no more than 11715 (=99.0%) documents
DEBUG:gensim.corpora.dictionary:rebuilding dictionary, shrinking gaps
INFO:gensim.corpora.dictionary:resulting dictionary: Dictionary(41284 unique tokens: ['глава', 'достижение', 'запугивание', 'империалист', 'карибский']...)
DEBUG:gensim.corpora.dictionary:rebuilding dictionary, shrinking gaps


In [13]:
# Store a bag-of-words representation, a list of (token_id, count) pairs, for every document.
data['bows'] = data['fulltext'].apply(dictionary.doc2bow)  # convert documents (list of tokens) to BOWs
# print(data['bows'][0]) #what we have

# DTM model training

In [14]:
# Location of the DTM executable. Set the DTM_PATH environment variable to point
# at a local binary; without it the original hard-coded Windows path is used.
dtm_binary = os.environ.get("DTM_PATH", "C:/github/tt-weights/models/bin/dtm-win64.exe")

# Train the dynamic topic model and record wall-clock timestamps around the run.
start = time.time()
model = DtmModel(
    dtm_path=dtm_binary,
    corpus=data["bows"].values,
    time_slices=time_seq,  # documents per year, in ascending year order
    num_topics=20,
    id2word=dictionary,
    initialize_lda=True,
    top_chain_var=0.05
)
finish = time.time()

INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to C:\Users\hryho\AppData\Local\Temp\8c1a76_train-mult.dat
INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus
INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into C:\Users\hryho\AppData\Local\Temp\8c1a76_train-mult.dat
DEBUG:smart_open.smart_open_lib:{'uri': 'C:\\Users\\hryho\\AppData\\Local\\Temp\\8c1a76_train-mult.dat', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
INFO:gensim.corpora.bleicorpus:saving vocabulary of 41284 words to C:\Users\hryho\AppData\Local\Temp\8c1a76_train-mult.dat.vocab
DEBUG:smart_open.smart_open_lib:{'uri': 'C:\\Users\\hryho\\AppData\\Local\\Temp\\8c1a76_train-mult.dat.vocab', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params

In [15]:
# Training duration in seconds.
finish - start

12163.771802186966

In [16]:
# Persist the trained model; per the log, the large arrays go to .npy side files next to it.
model.save("../models/dtm_sample.model")

INFO:gensim.utils:saving DtmModel object under ../models/dtm_sample.model, separately None
INFO:gensim.utils:storing np array 'lambda_' to ../models/dtm_sample.model.lambda_.npy
INFO:gensim.utils:storing np array 'obs_' to ../models/dtm_sample.model.obs_.npy
DEBUG:smart_open.smart_open_lib:{'uri': '../models/dtm_sample.model', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
INFO:gensim.utils:saved ../models/dtm_sample.model


---

# Topic weights over time

In [26]:
# Load the previously saved model from disk.
model = DtmModel.load("../models/dtm_sample.model")

INFO:gensim.utils:loading DtmModel object from ../models/dtm_sample.model
DEBUG:smart_open.smart_open_lib:{'uri': '../models/dtm_sample.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
INFO:gensim.utils:loading id2word recursively from ../models/dtm_sample.model.id2word.* with mmap=None
INFO:gensim.utils:loading lambda_ from ../models/dtm_sample.model.lambda_.npy with mmap=None
INFO:gensim.utils:loading obs_ from ../models/dtm_sample.model.obs_.npy with mmap=None
INFO:gensim.utils:loaded ../models/dtm_sample.model


In [27]:
# Year -> number of excerpts, in ascending year order (the row order of dates_cnt).
time_seq_d = dates_cnt.set_index(dates_cnt["year"])["excerpt_count"].to_dict()

# dtm_vis returns the model's document-topic matrix for the whole corpus; only
# topic_term depends on `time`. One call therefore replaces the former
# call-per-year loop, which re-scanned the corpus each time for the same matrix.
# The last slice index is used so the unpacked names hold the values the final
# loop iteration used to leave behind.
last_slice = len(time_seq_d) - 1
doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(time=last_slice, corpus=data['bows'])

# NOTE(review): every year key maps to the same full-corpus matrix (one row per
# document), not to the documents published in that year.
topics = {year: doc_topic for year in time_seq_d}

In [28]:
# Years for which a document-topic matrix was stored.
topics.keys()

dict_keys([1972, 1983, 1984, 1985, 1997, 2000, 2001, 2002, 2003, 2008, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [29]:
# Number of rows (documents) in the matrix stored under 2010.
len(topics[2010])

11834

In [30]:
# Number of entries (topics) in a single document row.
len(topics[2010][0])

20

In [31]:
# first_doc_index = 0
# for year, time in zip(time_seq_d.keys(), time_seq):
#     last_doc_index = first_doc_index + time
#     topics[f"{year}_seq"] = topics[year][first_doc_index:last_doc_index]
#     first_doc_index = first_doc_index + year_slice

In [32]:
# Stack the matrices stored per year into one frame, tagging every row with the
# year key it was stored under.
year_frames = []
for k, v in topics.items():  # k: year, v: document-topic matrix stored for that year
    x = pd.DataFrame.from_records(v)  # one row per document, one column per topic
    x['year_pub'] = k
    year_frames.append(x)
# Concatenate once: growing the frame inside the loop re-copies all previously
# accumulated rows on every iteration. An empty `topics` still gives an empty frame.
doc_topic_matrix = pd.concat(year_frames) if year_frames else pd.DataFrame()
# let's see how it looks
doc_topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year_pub
0,0.000331,0.000331,0.000331,0.252784,0.000331,0.578683,0.000331,0.000331,0.000331,0.000331,...,0.000331,0.000331,0.000331,0.000331,0.069186,0.000331,0.094048,0.000331,0.000331,1972
1,0.000549,0.000549,0.000549,0.171786,0.000549,0.818324,0.000549,0.000549,0.000549,0.000549,...,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,1972
2,0.000158,0.000158,0.000158,0.000158,0.000158,0.466169,0.000158,0.000158,0.123148,0.000158,...,0.000158,0.000158,0.000158,0.000158,0.039042,0.000158,0.311915,0.057353,0.000158,1972
3,0.000382,0.000382,0.000382,0.000382,0.000382,0.934142,0.000382,0.000382,0.000382,0.000382,...,0.000382,0.000382,0.000382,0.000382,0.058988,0.000382,0.000382,0.000382,0.000382,1972
4,0.000704,0.000704,0.000704,0.000704,0.000704,0.000704,0.000704,0.000704,0.846648,0.000704,...,0.000704,0.000704,0.000704,0.140676,0.000704,0.000704,0.000704,0.000704,0.000704,1972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11829,0.464519,0.000010,0.000010,0.015251,0.000010,0.100445,0.000010,0.056124,0.276976,0.028272,...,0.000010,0.000010,0.000010,0.036800,0.007817,0.013682,0.000010,0.000010,0.000010,2020
11830,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.428072,0.549977,0.001220,...,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,2020
11831,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,0.339408,0.000521,0.436167,0.215571,...,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,2020
11832,0.000083,0.000083,0.000083,0.013042,0.000083,0.000083,0.177878,0.000083,0.346696,0.000083,...,0.000083,0.000083,0.057151,0.000083,0.000083,0.403995,0.000083,0.000083,0.000083,2020


In [33]:
# Prefix the integer topic columns with "topic_"; non-integer labels such as 'year_pub' are kept as they are.
doc_topic_matrix.columns = [f"topic_{c}" if isinstance(c, int) else c for c in doc_topic_matrix.columns]

In [34]:
# Display the frame with its renamed columns.
doc_topic_matrix

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,year_pub
0,0.000331,0.000331,0.000331,0.252784,0.000331,0.578683,0.000331,0.000331,0.000331,0.000331,...,0.000331,0.000331,0.000331,0.000331,0.069186,0.000331,0.094048,0.000331,0.000331,1972
1,0.000549,0.000549,0.000549,0.171786,0.000549,0.818324,0.000549,0.000549,0.000549,0.000549,...,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,1972
2,0.000158,0.000158,0.000158,0.000158,0.000158,0.466169,0.000158,0.000158,0.123148,0.000158,...,0.000158,0.000158,0.000158,0.000158,0.039042,0.000158,0.311915,0.057353,0.000158,1972
3,0.000382,0.000382,0.000382,0.000382,0.000382,0.934142,0.000382,0.000382,0.000382,0.000382,...,0.000382,0.000382,0.000382,0.000382,0.058988,0.000382,0.000382,0.000382,0.000382,1972
4,0.000704,0.000704,0.000704,0.000704,0.000704,0.000704,0.000704,0.000704,0.846648,0.000704,...,0.000704,0.000704,0.000704,0.140676,0.000704,0.000704,0.000704,0.000704,0.000704,1972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11829,0.464519,0.000010,0.000010,0.015251,0.000010,0.100445,0.000010,0.056124,0.276976,0.028272,...,0.000010,0.000010,0.000010,0.036800,0.007817,0.013682,0.000010,0.000010,0.000010,2020
11830,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.428072,0.549977,0.001220,...,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,2020
11831,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,0.339408,0.000521,0.436167,0.215571,...,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,0.000521,2020
11832,0.000083,0.000083,0.000083,0.013042,0.000083,0.000083,0.177878,0.000083,0.346696,0.000083,...,0.000083,0.000083,0.057151,0.000083,0.000083,0.403995,0.000083,0.000083,0.000083,2020


In [35]:
# Place the topic weights next to the document metadata (aligned on the row
# index), then discard the two columns that are not needed afterwards.
topics_over_time = (
    pd.concat([data, doc_topic_matrix.reset_index(drop = True)], axis = 1)
    .drop(columns = ['filename', 'year_pub'])
)

# topics_over_time['year'] = pd.to_datetime(topics_over_time['year'], format = '%Y')
topics_over_time.head()

Unnamed: 0,uid,name,author,pubtitle,words,score,title,pages,url,fulltext,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,6635.0,,"Быков, О.Н.",Международные конфликты,,,Международные конфликты и империалистическое п...,,,"[результат, политика, запугивание, ядерный, пр...",...,0.000331,0.000331,0.000331,0.000331,0.000331,0.069186,0.000331,0.094048,0.000331,0.000331
1,6629.0,,"Богданов, Р.Г.",Наука,,,США: военная машина и политика,,http://militera.lib.ru/research/bogdanov_rg01/...,"[находить, отражение, общий, ориентация, амери...",...,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549
2,6604.0,,"Шрагин, В.",Изд-во полит. лит-ры,,,Терроризм - государственная политика США: Ваши...,,https://www.libex.ru/detail/book724081.html,"[многие, ярко_выраженный, агрессивный_действие...",...,0.000158,0.000158,0.000158,0.000158,0.000158,0.039042,0.000158,0.311915,0.057353,0.000158
3,6637.0,,"Серебрянников, В.В.; Рыбкин, И.Е.; Сливин, И.П...",Воениздат,,,США: курс на военное превосходство,,,"[смысл, ныне, американский, концепция, основа,...",...,0.000382,0.000382,0.000382,0.000382,0.000382,0.058988,0.000382,0.000382,0.000382,0.000382
4,6607.0,,"Попов, Ю.",Военный университет,,,Предисловие,,,"[предвидение, де_голль, будущий, изменение, ев...",...,0.000704,0.000704,0.000704,0.000704,0.140676,0.000704,0.000704,0.000704,0.000704,0.000704


In [36]:
# Reshape to long format: one row per (document, topic) pair with its weight.
meta_columns = ['author', 'database', 'doi', 'fulltext', 'place',
                'pubtitle', 'title', 'url', 'year', 'bows']
topic_columns = [f'topic_{n}' for n in range(20)]  # 'topic_0' ... 'topic_19'
topics_over_time = topics_over_time.melt(
    id_vars = meta_columns,
    value_vars = topic_columns,
    var_name = 'topic_num',
    value_name = 'topic_weight',
)
topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight
0,"Быков, О.Н.",,,"[результат, политика, запугивание, ядерный, пр...","Москва, СССР",Международные конфликты,Международные конфликты и империалистическое п...,,1972.0,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,0.000331
1,"Богданов, Р.Г.",,,"[находить, отражение, общий, ориентация, амери...","Москва, СССР",Наука,США: военная машина и политика,http://militera.lib.ru/research/bogdanov_rg01/...,1983.0,"[(2, 1), (25, 1), (30, 1), (31, 1), (32, 1), (...",topic_0,0.000549
2,"Шрагин, В.",,,"[многие, ярко_выраженный, агрессивный_действие...","Москва, СССР",Изд-во полит. лит-ры,Терроризм - государственная политика США: Ваши...,https://www.libex.ru/detail/book724081.html,1984.0,"[(16, 1), (27, 1), (28, 1), (30, 1), (46, 1), ...",topic_0,0.000158


In [37]:
# Years in the insertion order of the `topics` dict.
years = list(topics.keys())

In [38]:
# For every year (time slice) collect the top terms of every topic:
# year -> list (one entry per topic) of [(rounded probability, term), ...].
term_topics_by_time = {}
for t, year in enumerate(years):
    # Dedicated name: this loop used to rebind `topics`, replacing the
    # year -> document-topic dict that earlier cells read and breaking their re-run.
    year_topics = []
    # Iterate over the model's own topic count (it was trained with 20 topics).
    for n in range(model.num_topics):
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        year_topics.append(
            # round each term probability to three decimal places
            [(np.around(prob, 3), word) for prob, word in current_topic]
        )
    term_topics_by_time[year] = year_topics

In [39]:
# One column per year, one row per topic; each cell holds that topic's list of
# (probability, term) pairs for the year.
topic_term_matrix = pd.DataFrame(term_topics_by_time) # create a df from the dictionary 
topic_term_matrix.head(5)

Unnamed: 0,1972,1983,1984,1985,1997,2000,2001,2002,2003,2008,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"[(0.018, ядерный), (0.015, сдерживание), (0.01...","[(0.019, ядерный), (0.016, сдерживание), (0.01...","[(0.019, ядерный), (0.016, сдерживание), (0.01...","[(0.02, ядерный), (0.016, сдерживание), (0.013...","[(0.021, ядерный), (0.016, сдерживание), (0.01...","[(0.022, ядерный), (0.017, сдерживание), (0.01...","[(0.023, ядерный), (0.017, сдерживание), (0.01...","[(0.026, ядерный), (0.018, сдерживание), (0.01...","[(0.029, ядерный), (0.019, сдерживание), (0.01...","[(0.032, ядерный), (0.02, сдерживание), (0.017...",...,"[(0.026, ядерный), (0.017, сдерживание), (0.01...","[(0.021, ядерный), (0.017, стратегический), (0...","[(0.021, ядерный), (0.02, стратегический), (0....","[(0.024, ядерный), (0.017, стратегический), (0...","[(0.02, ядерный), (0.017, стратегический), (0....","[(0.019, ядерный), (0.012, стратегический), (0...","[(0.019, ядерный), (0.011, стратегический), (0...","[(0.022, ядерный), (0.011, система), (0.01, бо...","[(0.018, ядерный), (0.012, система), (0.011, р...","[(0.021, ядерный), (0.013, стратегический), (0..."
1,"[(0.172, грузия), (0.026, южный_осетия), (0.02...","[(0.176, грузия), (0.027, южный_осетия), (0.02...","[(0.182, грузия), (0.027, южный_осетия), (0.02...","[(0.19, грузия), (0.027, южный_осетия), (0.023...","[(0.201, грузия), (0.027, южный_осетия), (0.02...","[(0.213, грузия), (0.027, южный_осетия), (0.02...","[(0.229, грузия), (0.028, южный_осетия), (0.02...","[(0.248, грузия), (0.028, южный_осетия), (0.02...","[(0.274, грузия), (0.028, южный_осетия), (0.02...","[(0.308, грузия), (0.028, южный_осетия), (0.02...",...,"[(0.414, грузия), (0.022, южный_осетия), (0.02...","[(0.456, грузия), (0.019, южный_осетия), (0.01...","[(0.379, грузия), (0.022, украина), (0.02, южн...","[(0.25, грузия), (0.03, украина), (0.021, южны...","[(0.127, грузия), (0.039, украина), (0.02, южн...","[(0.055, грузия), (0.043, украина), (0.018, ющ...","[(0.037, украина), (0.027, грузия), (0.014, ющ...","[(0.031, украина), (0.014, грузия), (0.01, юще...","[(0.028, украина), (0.009, грузия), (0.007, бе...","[(0.026, украина), (0.008, грузия), (0.007, пр..."
2,"[(0.346, поддержка), (0.008, план), (0.005, ян...","[(0.349, поддержка), (0.008, план), (0.005, ян...","[(0.353, поддержка), (0.008, план), (0.005, ян...","[(0.357, поддержка), (0.008, план), (0.005, ян...","[(0.362, поддержка), (0.008, план), (0.005, ян...","[(0.367, поддержка), (0.008, план), (0.005, ян...","[(0.373, поддержка), (0.008, план), (0.005, ян...","[(0.381, поддержка), (0.008, план), (0.005, ян...","[(0.392, поддержка), (0.008, план), (0.005, ян...","[(0.407, поддержка), (0.008, план), (0.006, ян...",...,"[(0.381, поддержка), (0.008, план), (0.006, ян...","[(0.258, поддержка), (0.01, план), (0.008, укр...","[(0.128, поддержка), (0.012, план), (0.011, ук...","[(0.057, поддержка), (0.014, украина), (0.013,...","[(0.02, поддержка), (0.019, украина), (0.013, ...","[(0.024, украина), (0.015, янукович), (0.015, ...","[(0.025, украина), (0.012, тимошенко), (0.012,...","[(0.025, украина), (0.009, тимошенко), (0.009,...","[(0.027, украина), (0.008, тимошенко), (0.008,...","[(0.027, украина), (0.008, москва), (0.008, ти..."
3,"[(0.014, запугивание), (0.007, устрашение), (0...","[(0.014, запугивание), (0.007, устрашение), (0...","[(0.013, запугивание), (0.008, устрашение), (0...","[(0.012, запугивание), (0.008, устрашение), (0...","[(0.011, запугивание), (0.008, устрашение), (0...","[(0.01, запугивание), (0.008, устрашение), (0....","[(0.009, запугивание), (0.008, устрашение), (0...","[(0.009, запугивание), (0.008, устрашение), (0...","[(0.008, запугивание), (0.008, устрашение), (0...","[(0.008, устрашение), (0.008, запугивание), (0...",...,"[(0.009, советский), (0.009, устрашение), (0.0...","[(0.011, советский), (0.008, устрашение), (0.0...","[(0.013, советский), (0.006, ссср), (0.006, ус...","[(0.011, советский), (0.008, ссср), (0.006, ар...","[(0.012, советский), (0.01, ссср), (0.007, арм...","[(0.011, советский), (0.009, ссср), (0.006, ар...","[(0.013, советский), (0.009, ссср), (0.007, ар...","[(0.012, советский), (0.009, армия), (0.008, с...","[(0.011, армия), (0.01, советский), (0.008, сс...","[(0.013, советский), (0.009, ссср), (0.008, ар..."
4,"[(0.023, президент), (0.015, партия), (0.012, ...","[(0.023, президент), (0.015, партия), (0.012, ...","[(0.023, президент), (0.015, партия), (0.012, ...","[(0.024, президент), (0.015, партия), (0.012, ...","[(0.025, президент), (0.016, партия), (0.012, ...","[(0.025, президент), (0.016, партия), (0.012, ...","[(0.026, президент), (0.016, партия), (0.013, ...","[(0.027, президент), (0.016, партия), (0.013, ...","[(0.028, президент), (0.016, партия), (0.013, ...","[(0.03, президент), (0.016, партия), (0.014, в...",...,"[(0.043, президент), (0.022, путин), (0.021, в...","[(0.055, президент), (0.035, путин), (0.032, в...","[(0.057, президент), (0.041, путин), (0.037, в...","[(0.043, президент), (0.034, путин), (0.021, в...","[(0.037, президент), (0.029, путин), (0.015, у...","[(0.03, президент), (0.021, путин), (0.017, ук...","[(0.032, президент), (0.018, украина), (0.018,...","[(0.031, президент), (0.021, украина), (0.016,...","[(0.026, украина), (0.025, президент), (0.014,...","[(0.026, президент), (0.024, украина), (0.014,..."


In [40]:
# Transpose so that each row is a year and each column a topic, copy the year
# labels from the index into a 'year' column, then replace the index with 0..n-1.
topic_term_table = (
    topic_term_matrix.T
    .assign(year = lambda frame: frame.index.astype(int))
    .reset_index(drop = True)
)
topic_term_table.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year
0,"[(0.018, ядерный), (0.015, сдерживание), (0.01...","[(0.172, грузия), (0.026, южный_осетия), (0.02...","[(0.346, поддержка), (0.008, план), (0.005, ян...","[(0.014, запугивание), (0.007, устрашение), (0...","[(0.023, президент), (0.015, партия), (0.012, ...","[(0.023, сша), (0.018, ядерный), (0.013, средс...","[(0.015, российский_федерация), (0.01, государ...","[(0.025, ядерный), (0.02, сдерживание), (0.016...","[(0.037, военный), (0.018, сдерживание), (0.01...","[(0.016, сша), (0.011, иран), (0.01, американс...",...,"[(0.015, средство_массовый_информация), (0.009...","[(0.011, армения), (0.008, конфликт), (0.008, ...","[(0.015, военный), (0.012, развитие), (0.012, ...","[(0.045, альянс), (0.042, нато), (0.04, европа...","[(0.025, военный), (0.019, сдерживание), (0.01...","[(0.037, армия), (0.018, стратегический), (0.0...","[(0.007, мировой), (0.006, обеспечить), (0.005...","[(0.007, президент), (0.006, стр), (0.004, пос...","[(0.031, украина), (0.016, украинский), (0.015...",1972
1,"[(0.019, ядерный), (0.016, сдерживание), (0.01...","[(0.176, грузия), (0.027, южный_осетия), (0.02...","[(0.349, поддержка), (0.008, план), (0.005, ян...","[(0.014, запугивание), (0.007, устрашение), (0...","[(0.023, президент), (0.015, партия), (0.012, ...","[(0.023, сша), (0.018, ядерный), (0.012, сдерж...","[(0.015, российский_федерация), (0.01, государ...","[(0.025, ядерный), (0.02, сдерживание), (0.016...","[(0.038, военный), (0.019, сдерживание), (0.01...","[(0.016, сша), (0.011, иран), (0.01, американс...",...,"[(0.015, средство_массовый_информация), (0.009...","[(0.011, армения), (0.008, конфликт), (0.008, ...","[(0.015, военный), (0.012, развитие), (0.012, ...","[(0.045, альянс), (0.044, нато), (0.04, европа...","[(0.025, военный), (0.019, сдерживание), (0.01...","[(0.037, армия), (0.018, стратегический), (0.0...","[(0.007, мировой), (0.006, обеспечить), (0.005...","[(0.007, президент), (0.006, стр), (0.004, пос...","[(0.032, украина), (0.016, украинский), (0.015...",1983
2,"[(0.019, ядерный), (0.016, сдерживание), (0.01...","[(0.182, грузия), (0.027, южный_осетия), (0.02...","[(0.353, поддержка), (0.008, план), (0.005, ян...","[(0.013, запугивание), (0.008, устрашение), (0...","[(0.023, президент), (0.015, партия), (0.012, ...","[(0.024, сша), (0.018, ядерный), (0.013, сдерж...","[(0.015, российский_федерация), (0.01, государ...","[(0.026, ядерный), (0.02, сдерживание), (0.016...","[(0.039, военный), (0.019, сдерживание), (0.01...","[(0.017, сша), (0.011, иран), (0.01, американс...",...,"[(0.015, средство_массовый_информация), (0.009...","[(0.011, армения), (0.008, зона), (0.008, конф...","[(0.016, военный), (0.012, развитие), (0.012, ...","[(0.047, нато), (0.046, альянс), (0.041, европ...","[(0.025, военный), (0.02, сдерживание), (0.014...","[(0.038, армия), (0.018, стратегический), (0.0...","[(0.007, мировой), (0.006, обеспечить), (0.005...","[(0.007, президент), (0.006, стр), (0.004, пос...","[(0.033, украина), (0.016, украинский), (0.015...",1984


In [41]:
# Prefix the integer topic columns with "topic_"; the 'year' column keeps its name.
topic_term_table.columns = [f"topic_{c}" if isinstance(c, int) else c for c in topic_term_table.columns]

In [42]:
# Reshape to long format: one row per (year, topic) pair with that pair's term list.
topic_columns = [f'topic_{n}' for n in range(20)]  # 'topic_0' ... 'topic_19'
topic_term_table = topic_term_table.melt(
    id_vars = 'year',
    value_vars = topic_columns,
    var_name = 'topic_num',
    value_name = 'terms',
)

In [52]:
# Display the long-format (year, topic_num, terms) table.
topic_term_table

Unnamed: 0,year,topic_num,terms
0,1972,topic_0,"[(0.018, ядерный), (0.015, сдерживание), (0.01..."
1,1983,topic_0,"[(0.019, ядерный), (0.016, сдерживание), (0.01..."
2,1984,topic_0,"[(0.019, ядерный), (0.016, сдерживание), (0.01..."
3,1985,topic_0,"[(0.02, ядерный), (0.016, сдерживание), (0.013..."
4,1997,topic_0,"[(0.021, ядерный), (0.016, сдерживание), (0.01..."
...,...,...,...
415,2016,topic_19,"[(0.085, украина), (0.019, крым), (0.016, укра..."
416,2017,topic_19,"[(0.098, украина), (0.017, крым), (0.015, киев..."
417,2018,topic_19,"[(0.095, украина), (0.015, киев), (0.015, крым..."
418,2019,topic_19,"[(0.081, украина), (0.013, киев), (0.013, бело..."


In [43]:
# Attach each (year, topic) pair's term list to the matching document rows; the
# left join keeps every document row even when no term list matches.
topics_over_time = pd.merge(
    topics_over_time,
    topic_term_table,
    on = ['year', 'topic_num'],
    how = 'left',
)

topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight,terms
0,"Быков, О.Н.",,,"[результат, политика, запугивание, ядерный, пр...","Москва, СССР",Международные конфликты,Международные конфликты и империалистическое п...,,1972.0,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,0.000331,"[(0.018, ядерный), (0.015, сдерживание), (0.01..."
1,"Богданов, Р.Г.",,,"[находить, отражение, общий, ориентация, амери...","Москва, СССР",Наука,США: военная машина и политика,http://militera.lib.ru/research/bogdanov_rg01/...,1983.0,"[(2, 1), (25, 1), (30, 1), (31, 1), (32, 1), (...",topic_0,0.000549,"[(0.019, ядерный), (0.016, сдерживание), (0.01..."
2,"Шрагин, В.",,,"[многие, ярко_выраженный, агрессивный_действие...","Москва, СССР",Изд-во полит. лит-ры,Терроризм - государственная политика США: Ваши...,https://www.libex.ru/detail/book724081.html,1984.0,"[(16, 1), (27, 1), (28, 1), (30, 1), (46, 1), ...",topic_0,0.000158,"[(0.019, ядерный), (0.016, сдерживание), (0.01..."


In [44]:
# The bag-of-words and DOI columns are not needed in the exported table.
topics_over_time = topics_over_time.drop(columns = ['bows', 'doi'])

In [45]:
# Keep only the rows that still carry a document text and renumber them.
# NOTE(review): presumably this removes rows that received NaN metadata when a
# doc_topic_matrix longer than `data` was concatenated to it — verify.
topics_over_time = topics_over_time.loc[topics_over_time["fulltext"].notnull()].reset_index(drop=True).copy()

In [46]:
def simplify(l):
    """Turn (probability, word) pairs into "<probability> <word>" strings.

    Args:
        l: iterable of two-item (probability, word) pairs.

    Returns:
        A list with one string per pair, in the same order.
    """
    rendered = []
    for prob, word in l:
        rendered.append("{} {}".format(str(prob), word))
    return rendered

In [47]:
# Replace each list of (probability, word) pairs with a list of "probability word" strings.
topics_over_time["terms"] = topics_over_time["terms"].apply(simplify)

In [48]:
# Export as JSON Lines: one record per row, non-ASCII characters written unescaped.
topics_over_time.to_json(
    "../data/processed/topics_over_time.json", 
    lines=True,
    orient="records", 
    force_ascii=False, 
    date_format="iso"
)

# Term weights over time

In [49]:
# Flat list of [weight, term, year, topic] rows: the top 15 terms of every topic
# in every time slice, ordered by year, then topic, then term rank.
# NOTE(review): this rebinds `topics`, which earlier held the year -> document-topic
# dict; the name is kept because the following cell builds its frame from it.
topics = [
    list(term) + [year, n]
    for t, year in enumerate(years)
    for n in range(model.num_topics)  # topic count taken from the model (trained with 20)
    for term in model.show_topic(topicid=n, time=t, topn=15)
]

In [50]:
# One row per (year, topic, term) with the term's weight; preview the first rows.
terms_by_time = pd.DataFrame(topics, columns = ['weight', 'term', 'year', 'topic n'])
terms_by_time.head()

Unnamed: 0,weight,term,year,topic n
0,0.018438,ядерный,1972,0
1,0.015479,сдерживание,1972,0
2,0.012266,стратегический,1972,0
3,0.009637,ракета,1972,0
4,0.00799,сша,1972,0


In [51]:
# Write the term-weight table to CSV without the row index.
terms_by_time.to_csv("../data/processed/terms_over_time.csv", index=False)