In [1]:
import os
import time
import numpy as np
import pandas as pd
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.models.wrappers import DtmModel

# Data preprocessing

In [2]:
data = pd.read_csv("./../data/processed/210202 - evp_integrum_ngrams.csv", converters={'fulltext': eval})

In [3]:
data.shape

(11836, 24)

In [4]:
data["year"] = data["year"].astype(int)

In [5]:
data["date"] = pd.to_datetime(data["date"])
data = data.loc[data["date"].notnull() & data["fulltext"].notnull()].copy()
data = data.loc[data["date"] <= "2021-01-01"].copy()
data = data.sort_values("date").reset_index(drop=True)
data = data.assign(year=data["date"].dt.year).drop("date", 1)
data.shape

(11834, 23)

In [6]:
# testing
np.random.seed(1)
data = data.sample(100).reset_index(drop=True)

In [7]:
dates_cnt = data['year'].value_counts().rename_axis('year').reset_index(name='excerpt_count')
dates_cnt.sort_values('year', inplace = True, ascending = True)
time_seq = dates_cnt['excerpt_count'].to_list()

In [8]:
# #now we count the number of full texts in each years
# dates_count = data.groupby("year").agg(n_paragraphs = ("fulltext", "size"))
# dates_count = dates_count.loc[dates_count['n_paragraphs'].ne(0)].reset_index()
# dates_count["n_paragraphs"].sum()

In [9]:
# time_seq = dates_count["n_paragraphs"].to_list()
sum(time_seq)

100

# Dictionary and BOWs

In [10]:
# create a dictionary
dictionary = corpora.Dictionary(data['fulltext'])  
print(f'{len(dictionary)} tokens overall') 

3439 tokens overall


In [11]:
# now we'll filter the tokens that appear too frequently or are too rare
dictionary.filter_extremes(no_below = 2, no_above = 0.99, keep_n=200000)
dictionary.compactify()  # make token IDs sequential

In [12]:
data['bows'] = data['fulltext'].apply(dictionary.doc2bow)  # convert documents (list of tokens) to BOWs
# print(data['bows'][0]) #what we have

# DTM model training

In [13]:
start = time.time()
model = DtmModel(
    dtm_path="../models/bin/dtm-win32.exe",
    corpus=data["bows"].values,
    time_slices=time_seq,
    num_topics=20,
    id2word=dictionary,
    initialize_lda=True,
    top_chain_var=0.05
)
finish = time.time()

In [14]:
finish - start

95.16438865661621

In [15]:
model.save("../models/dtm_sample.model")

---

# topic weights over time

In [16]:
model = DtmModel.load("../models/dtm_sample.model")

In [17]:
time_seq_d = dates_cnt.set_index(dates_cnt["year"])["excerpt_count"].to_dict()

topics = {}
for idx, year in enumerate(time_seq_d.keys()):
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(time=idx,corpus=data['bows'])
    topics[year] = doc_topic

In [18]:
topics.keys()

dict_keys([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [19]:
len(topics[2010])

100

In [20]:
len(topics[2011][0])

20

In [21]:
# first_doc_index = 0
# for year, time in zip(time_seq_d.keys(), time_seq):
#     last_doc_index = first_doc_index + time
#     topics[f"{year}_seq"] = topics[year][first_doc_index:last_doc_index]
#     first_doc_index = first_doc_index + year_slice

In [22]:
doc_topic_matrix = pd.DataFrame() # create a placeholder df
for k, v in topics.items(): #iterate over keys and values in our dictionary with matrices
    x = pd.DataFrame.from_records(v) #create a df from the current slice matrix
    x['year_pub'] = k #create a column for year and assign the key value to it
    doc_topic_matrix = pd.concat([doc_topic_matrix, x]) #now append to our placeholder df
#let's see how it looks
doc_topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year_pub
0,0.000472,0.000472,0.170214,0.000472,0.288755,0.218396,0.000472,0.000472,0.000472,0.000472,...,0.000472,0.315088,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,2010
1,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,...,0.000192,0.000192,0.000192,0.747520,0.000192,0.249032,0.000192,0.000192,0.000192,2010
2,0.000255,0.000255,0.078338,0.000255,0.000255,0.000255,0.000255,0.367720,0.000255,0.343290,...,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.206571,0.000255,2010
3,0.000549,0.000549,0.000549,0.378381,0.000549,0.366475,0.000549,0.000549,0.000549,0.000549,...,0.000549,0.000549,0.084866,0.000549,0.000549,0.000549,0.000549,0.000549,0.161486,2010
4,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,...,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.983036,0.000893,0.000893,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000221,0.000221,0.240123,0.060229,0.000221,0.000221,0.000221,0.000221,0.548497,0.000221,...,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,2020
96,0.000301,0.000301,0.000301,0.152542,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,...,0.000301,0.000301,0.000301,0.000301,0.522755,0.131959,0.146338,0.000301,0.041888,2020
97,0.000226,0.401738,0.000226,0.000226,0.000226,0.000226,0.000226,0.000226,0.179029,0.000226,...,0.000226,0.230891,0.000226,0.102612,0.000226,0.000226,0.000226,0.082337,0.000226,2020
98,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,...,0.000269,0.402396,0.000269,0.000269,0.140924,0.000269,0.000269,0.452111,0.000269,2020


In [23]:
doc_topic_matrix.columns = [f"topic_{c}" if isinstance(c, int) else c for c in doc_topic_matrix.columns]

In [24]:
doc_topic_matrix

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,year_pub
0,0.000472,0.000472,0.170214,0.000472,0.288755,0.218396,0.000472,0.000472,0.000472,0.000472,...,0.000472,0.315088,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,2010
1,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,...,0.000192,0.000192,0.000192,0.747520,0.000192,0.249032,0.000192,0.000192,0.000192,2010
2,0.000255,0.000255,0.078338,0.000255,0.000255,0.000255,0.000255,0.367720,0.000255,0.343290,...,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.206571,0.000255,2010
3,0.000549,0.000549,0.000549,0.378381,0.000549,0.366475,0.000549,0.000549,0.000549,0.000549,...,0.000549,0.000549,0.084866,0.000549,0.000549,0.000549,0.000549,0.000549,0.161486,2010
4,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,...,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.983036,0.000893,0.000893,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000221,0.000221,0.240123,0.060229,0.000221,0.000221,0.000221,0.000221,0.548497,0.000221,...,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,2020
96,0.000301,0.000301,0.000301,0.152542,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,...,0.000301,0.000301,0.000301,0.000301,0.522755,0.131959,0.146338,0.000301,0.041888,2020
97,0.000226,0.401738,0.000226,0.000226,0.000226,0.000226,0.000226,0.000226,0.179029,0.000226,...,0.000226,0.230891,0.000226,0.102612,0.000226,0.000226,0.000226,0.082337,0.000226,2020
98,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,...,0.000269,0.402396,0.000269,0.000269,0.140924,0.000269,0.000269,0.452111,0.000269,2020


In [25]:
topics_over_time = pd.concat([data, doc_topic_matrix.reset_index(drop = True)], axis = 1)
topics_over_time.drop(columns = ['filename', 'year_pub'], inplace = True)

# topics_over_time['year'] = pd.to_datetime(topics_over_time['year'], format = '%Y')
topics_over_time.head()

Unnamed: 0,uid,name,author,pubtitle,words,score,title,pages,url,fulltext,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,3881.0,Полоса 02 .,,,,,,,,"[объединить_судостроительный_компания, оск, из...",...,0.000472,0.000472,0.315088,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472
1,186.0,"Россия, не верь, не бойся, не проси",,,,,,,,"[союзник, современный, концепция, гибридный, с...",...,0.000192,0.000192,0.000192,0.000192,0.74752,0.000192,0.249032,0.000192,0.000192,0.000192
2,3091.0,,IVAN DEM'IaNOV,Nash sovremennik,,,KAK BRAT'EV DELAIuT VRAGAMI,,https://dlib.eastview.com/browse/doc/45951056,"[разорвать, экономический, связь, европа, силь...",...,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.206571,0.000255
3,1499.0,,,Krasnaia zvezda,,,POSTINDUSTRIAL'NYI PROTIVNIK,,https://dlib.eastview.com/browse/doc/26505876,"[фактор, устойчивость, руководство, любой, сит...",...,0.000549,0.000549,0.000549,0.084866,0.000549,0.000549,0.000549,0.000549,0.000549,0.161486
4,3436.0,,Vladimir Dvorkin,Nezavisimoe voennoe obozrenie,,,"""TIaZhELAIa"" RAKETA DLIa STRATEGIChESKIKh IaDE...",,https://dlib.eastview.com/browse/doc/24447533,"[стационарный, группировка, мбр, середина_х, п...",...,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.983036,0.000893,0.000893


In [26]:
topics_over_time = pd.melt(topics_over_time, id_vars = ['author', 'database', 'doi', 
                                                    'fulltext', 'place','pubtitle', 'title', 'url', 
                                                    'year', 'bows'], 
            value_vars = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 
                          'topic_7', 'topic_8', 'topic_9', 'topic_10', 'topic_11', 'topic_12', 
                          'topic_13', 'topic_14', 'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19', ],
            var_name = 'topic_num',
            value_name = 'topic_weight')
topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight
0,,Федеральная пресса,,"[объединить_судостроительный_компания, оск, из...",,,,,2012.0,"[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,0.000472
1,,Федеральные интернет-издания,,"[союзник, современный, концепция, гибридный, с...",,,,,2015.0,"[(1, 2), (6, 2), (12, 1), (16, 1), (19, 1), (2...",topic_0,0.000192
2,IVAN DEM'IaNOV,UDB_EDU,,"[разорвать, экономический, связь, европа, силь...","Moscow,\n ...",Nash sovremennik,KAK BRAT'EV DELAIuT VRAGAMI,https://dlib.eastview.com/browse/doc/45951056,2015.0,"[(13, 2), (30, 1), (34, 1), (53, 1), (55, 1), ...",topic_0,0.000255


In [27]:
years = [*topics.keys()]

In [28]:
term_topics_by_time = {}
for t, year in enumerate(years):
    # since we have 20 topics - range(20)
    topics = []
    for n in range(20): 
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        topics.append(
            # round probability of each word to three values 1.111
            [(np.around(prob, 3), word) for prob, word in current_topic]
        )
    term_topics_by_time[year] = topics

In [29]:
topic_term_matrix = pd.DataFrame(term_topics_by_time) # create a df from the dictionary 
topic_term_matrix.head(5)

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"[(0.11, ядерный), (0.054, область), (0.052, во...","[(0.11, ядерный), (0.055, область), (0.053, ст...","[(0.109, ядерный), (0.057, область), (0.054, с...","[(0.105, ядерный), (0.062, область), (0.053, в...","[(0.101, ядерный), (0.067, область), (0.054, в...","[(0.095, ядерный), (0.069, область), (0.056, в...","[(0.092, ядерный), (0.072, область), (0.058, в...","[(0.09, ядерный), (0.076, область), (0.059, во...","[(0.089, ядерный), (0.08, область), (0.059, во...","[(0.088, ядерный), (0.082, область), (0.06, во...","[(0.088, ядерный), (0.082, область), (0.06, во..."
1,"[(0.059, сша), (0.045, рф), (0.043, сдерживани...","[(0.059, сша), (0.045, рф), (0.043, сдерживани...","[(0.06, сша), (0.045, рф), (0.043, сдерживание...","[(0.061, сша), (0.044, рф), (0.043, сдерживани...","[(0.058, сша), (0.04, сдерживание), (0.04, рф)...","[(0.057, сша), (0.038, сдерживание), (0.036, р...","[(0.052, сша), (0.038, сдерживание), (0.031, р...","[(0.048, сша), (0.039, сдерживание), (0.031, р...","[(0.046, сша), (0.039, сдерживание), (0.033, о...","[(0.044, сша), (0.04, сдерживание), (0.036, от...","[(0.042, сша), (0.04, сдерживание), (0.039, от..."
2,"[(0.096, американский), (0.071, ядерный), (0.0...","[(0.095, американский), (0.073, ядерный), (0.0...","[(0.098, американский), (0.075, ядерный), (0.0...","[(0.098, американский), (0.077, ядерный), (0.0...","[(0.097, американский), (0.079, ядерный), (0.0...","[(0.095, американский), (0.079, ядерный), (0.0...","[(0.085, ядерный), (0.085, американский), (0.0...","[(0.084, ядерный), (0.069, американский), (0.0...","[(0.077, ядерный), (0.06, американский), (0.05...","[(0.078, ядерный), (0.062, американский), (0.0...","[(0.075, ядерный), (0.063, американский), (0.0..."
3,"[(0.139, система), (0.077, сдерживание), (0.06...","[(0.135, система), (0.08, сдерживание), (0.064...","[(0.135, система), (0.082, сдерживание), (0.06...","[(0.142, система), (0.085, сдерживание), (0.06...","[(0.136, система), (0.087, сдерживание), (0.07...","[(0.128, система), (0.091, сдерживание), (0.08...","[(0.12, система), (0.095, стратегический), (0....","[(0.115, система), (0.101, стратегический), (0...","[(0.116, система), (0.098, стратегический), (0...","[(0.114, система), (0.1, стратегический), (0.0...","[(0.111, система), (0.102, стратегический), (0..."
4,"[(0.153, военный), (0.06, сдерживание), (0.055...","[(0.149, военный), (0.059, сдерживание), (0.05...","[(0.153, военный), (0.063, сдерживание), (0.05...","[(0.144, военный), (0.064, сдерживание), (0.05...","[(0.135, военный), (0.065, сдерживание), (0.06...","[(0.141, военный), (0.063, угроза), (0.063, сд...","[(0.15, военный), (0.064, угроза), (0.061, сде...","[(0.149, военный), (0.064, угроза), (0.061, сд...","[(0.149, военный), (0.064, угроза), (0.061, сд...","[(0.149, военный), (0.064, угроза), (0.061, сд...","[(0.149, военный), (0.064, угроза), (0.061, сд..."


In [30]:
topic_term_table = topic_term_matrix.T # we need to turn it
topic_term_table['year'] = topic_term_table.index.astype(int) # create an index (now indexed by years)
topic_term_table.reset_index(inplace = True, drop = True)
topic_term_table.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year
0,"[(0.11, ядерный), (0.054, область), (0.052, во...","[(0.059, сша), (0.045, рф), (0.043, сдерживани...","[(0.096, американский), (0.071, ядерный), (0.0...","[(0.139, система), (0.077, сдерживание), (0.06...","[(0.153, военный), (0.06, сдерживание), (0.055...","[(0.052, сдерживание), (0.05, военный), (0.041...","[(0.081, президент), (0.049, франция), (0.032,...","[(0.185, украина), (0.045, праздник), (0.038, ...","[(0.069, военный), (0.056, советский), (0.052,...","[(0.093, европа), (0.046, военный), (0.032, ев...",...,"[(0.044, ракетный), (0.042, ядерный), (0.037, ...","[(0.001, ракетный_войско), (0.001, рассматрива...","[(0.074, российский), (0.062, сдерживание), (0...","[(0.069, средство), (0.058, военный), (0.042, ...","[(0.054, система), (0.041, вс), (0.036, сдержи...","[(0.043, военный), (0.03, средство), (0.03, со...","[(0.122, сдерживание), (0.08, ядерный), (0.047...","[(0.041, государство), (0.039, российский_феде...","[(0.162, создать), (0.087, ядерный_оружие), (0...",2010
1,"[(0.11, ядерный), (0.055, область), (0.053, ст...","[(0.059, сша), (0.045, рф), (0.043, сдерживани...","[(0.095, американский), (0.073, ядерный), (0.0...","[(0.135, система), (0.08, сдерживание), (0.064...","[(0.149, военный), (0.059, сдерживание), (0.05...","[(0.052, сдерживание), (0.05, военный), (0.042...","[(0.084, президент), (0.05, франция), (0.031, ...","[(0.186, украина), (0.049, праздник), (0.035, ...","[(0.068, военный), (0.055, советский), (0.048,...","[(0.107, европа), (0.045, военный), (0.031, ев...",...,"[(0.044, ракетный), (0.042, ядерный), (0.037, ...","[(0.001, ракетный_войско), (0.001, рассматрива...","[(0.077, российский), (0.067, сдерживание), (0...","[(0.069, средство), (0.06, военный), (0.043, с...","[(0.054, система), (0.041, вс), (0.036, сдержи...","[(0.042, военный), (0.029, современный), (0.02...","[(0.118, сдерживание), (0.081, ядерный), (0.05...","[(0.04, государство), (0.04, российский_федера...","[(0.18, создать), (0.09, ядерный_оружие), (0.0...",2011
2,"[(0.109, ядерный), (0.057, область), (0.054, с...","[(0.06, сша), (0.045, рф), (0.043, сдерживание...","[(0.098, американский), (0.075, ядерный), (0.0...","[(0.135, система), (0.082, сдерживание), (0.06...","[(0.153, военный), (0.063, сдерживание), (0.05...","[(0.052, военный), (0.05, сдерживание), (0.043...","[(0.086, президент), (0.051, франция), (0.031,...","[(0.178, украина), (0.054, праздник), (0.033, ...","[(0.066, военный), (0.055, советский), (0.047,...","[(0.122, европа), (0.043, военный), (0.031, ев...",...,"[(0.044, ракетный), (0.042, ядерный), (0.037, ...","[(0.001, ракетный_войско), (0.001, рассматрива...","[(0.07, сдерживание), (0.067, ядерный), (0.066...","[(0.072, средство), (0.06, военный), (0.043, у...","[(0.055, система), (0.041, вс), (0.036, сдержи...","[(0.042, военный), (0.029, современный), (0.02...","[(0.116, сдерживание), (0.079, ядерный), (0.05...","[(0.041, российский_федерация), (0.04, государ...","[(0.198, создать), (0.092, ядерный_оружие), (0...",2012


In [31]:
topic_term_table.columns = [f"topic_{c}" if isinstance(c, int) else c for c in topic_term_table.columns]

In [32]:
topic_term_table = pd.melt(topic_term_table, id_vars = 'year', 
                           value_vars = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 
                                         'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 
                                         'topic_10', 'topic_11', 'topic_12', 'topic_13', 'topic_14', 
                                         'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19'], 
                           var_name = 'topic_num', value_name = 'terms')

In [33]:
topics_over_time = topics_over_time.merge(topic_term_table, how = 'left', on = ['year', 'topic_num'])

topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight,terms
0,,Федеральная пресса,,"[объединить_судостроительный_компания, оск, из...",,,,,2012.0,"[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,0.000472,"[(0.109, ядерный), (0.057, область), (0.054, с..."
1,,Федеральные интернет-издания,,"[союзник, современный, концепция, гибридный, с...",,,,,2015.0,"[(1, 2), (6, 2), (12, 1), (16, 1), (19, 1), (2...",topic_0,0.000192,"[(0.095, ядерный), (0.069, область), (0.056, в..."
2,IVAN DEM'IaNOV,UDB_EDU,,"[разорвать, экономический, связь, европа, силь...","Moscow,\n ...",Nash sovremennik,KAK BRAT'EV DELAIuT VRAGAMI,https://dlib.eastview.com/browse/doc/45951056,2015.0,"[(13, 2), (30, 1), (34, 1), (53, 1), (55, 1), ...",topic_0,0.000255,"[(0.095, ядерный), (0.069, область), (0.056, в..."


In [34]:
topics_over_time.drop(columns = ['bows', 'doi'], inplace = True)

In [35]:
topics_over_time = topics_over_time.loc[topics_over_time["fulltext"].notnull()].reset_index(drop=True).copy()

In [36]:
def simplify(l):
    return [f"{str(prob)} {word}" for prob, word in l]

In [37]:
topics_over_time["terms"] = topics_over_time["terms"].apply(simplify)

In [38]:
topics_over_time.to_json(
    "../data/processed/topics_over_time.json", 
    lines=True,
    orient="records", 
    force_ascii=False, 
    date_format="iso"
)

# term weights over time

In [39]:
topics = []
for t, year in enumerate(years):
    for n in range(20):
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        topics.extend(
            [list(term) + [year, n] for term in current_topic]
        )

In [40]:
topics[0]

[0.10996942255953009, 'ядерный', 2010, 0]

In [41]:
terms_by_time = pd.DataFrame(topics, columns = ['weight', 'term', 'year', 'topic n'])
terms_by_time.head()

Unnamed: 0,weight,term,year,topic n
0,0.109969,ядерный,2010,0
1,0.054034,область,2010,0
2,0.052469,военный,2010,0
3,0.051954,стратегический,2010,0
4,0.051585,сдерживание,2010,0


In [42]:
terms_by_time.to_csv("../data/processed/terms_by_time.csv", index=False)