In [1]:
import os
import time
import numpy as np
import pandas as pd
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.models.wrappers import DtmModel

# Data preprocessing

In [2]:
# data = pd.read_json(
#     "../data/raw/210202_ru_deter_preprocessed.json",
#     orient="records",
#     convert_dates=False
# )

data = pd.read_csv("./../data/processed/data.csv", converters={'fulltext': eval})

In [3]:
data.shape

(11836, 24)

In [4]:
data["year"] = data["year"].astype(int)

In [5]:
data["date"] = pd.to_datetime(data["date"])
data = data.loc[data["date"].notnull() & data["fulltext"].notnull()].copy()
data = data.loc[data["date"] <= "2021-01-01"].copy()
data = data.sort_values("date").reset_index(drop=True)
data = data.assign(year=data["date"].dt.year).drop("date", 1)
data.shape

(11834, 23)

In [6]:
# I'm using sample of 10 docs because I can't run the whole corpus
# np.random.seed(1)
# data = data.sample(100).reset_index(drop=True)

In [7]:
dates_cnt = data['year'].value_counts().rename_axis('year').reset_index(name='excerpt_count')
dates_cnt.sort_values('year', inplace = True, ascending = True)
time_seq = dates_cnt['excerpt_count'].to_list()

In [8]:
# #now we count the number of full texts in each years
# dates_count = data.groupby("year").agg(n_paragraphs = ("fulltext", "size"))
# dates_count = dates_count.loc[dates_count['n_paragraphs'].ne(0)].reset_index()
# dates_count["n_paragraphs"].sum()

In [9]:
# time_seq = dates_count["n_paragraphs"].to_list()
sum(time_seq)

100

# Dictionary and BOWs

In [10]:
# create a dictionary
dictionary = corpora.Dictionary(data['fulltext'])  
print(f'{len(dictionary)} tokens overall') 

3439 tokens overall


In [11]:
# now we'll filter the tokens that appear too frequently or are too rare
dictionary.filter_extremes(no_below = 2, no_above = 0.99, keep_n=200000)
dictionary.compactify()  # make token IDs sequential

In [12]:
data['bows'] = data['fulltext'].apply(dictionary.doc2bow)  # convert documents (list of tokens) to BOWs
# print(data['bows'][0]) #what we have

# DTM model training

In [13]:
start = time.time()
model = DtmModel(
    dtm_path="../models/bin/dtm-win32.exe",
    corpus=data["bows"].values,
    time_slices=time_seq,
    num_topics=20,
    id2word=dictionary,
    initialize_lda=True,
    top_chain_var=0.05
)
finish = time.time()

In [14]:
finish - start

93.15988159179688

In [15]:
model.save("../models/dtm_sample.model")

---

# topic weights over time

In [18]:
model = DtmModel.load("../models/dtm_sample.model")

In [22]:
time_seq_d = dates_cnt.set_index(dates_cnt["year"])["excerpt_count"].to_dict()

topics = {}
for idx, year in enumerate(time_seq_d.keys()):
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(time=idx,corpus=data['bows'])
    topics[year] = doc_topic

In [23]:
topics.keys()

dict_keys([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [26]:
len(topics[2010])

100

In [28]:
len(topics[2011][0])

20

In [29]:
# first_doc_index = 0
# for year, time in zip(time_seq_d.keys(), time_seq):
#     last_doc_index = first_doc_index + time
#     topics[f"{year}_seq"] = topics[year][first_doc_index:last_doc_index]
#     first_doc_index = first_doc_index + year_slice

In [30]:
doc_topic_matrix = pd.DataFrame() # create a placeholder df
for k, v in topics.items(): #iterate over keys and values in our dictionary with matrices
    x = pd.DataFrame.from_records(v) #create a df from the current slice matrix
    x['year_pub'] = k #create a column for year and assign the key value to it
    doc_topic_matrix = pd.concat([doc_topic_matrix, x]) #now append to our placeholder df
#let's see how it looks
doc_topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year_pub
0,0.000472,0.000472,0.000472,0.000472,0.000472,0.629609,0.000472,0.000472,0.000472,0.000472,...,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,2010
1,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,...,0.749546,0.000192,0.000192,0.000192,0.000192,0.247006,0.000192,0.000192,0.000192,2010
2,0.221115,0.000255,0.000255,0.182778,0.000255,0.000255,0.490753,0.000255,0.000255,0.000255,...,0.000255,0.101272,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,2010
3,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.209476,...,0.000549,0.000549,0.000549,0.315379,0.000549,0.000549,0.000549,0.465805,0.000549,2010
4,0.000893,0.000893,0.000893,0.000893,0.983036,0.000893,0.000893,0.000893,0.000893,0.000893,...,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,...,0.000221,0.767550,0.000221,0.000221,0.093079,0.000221,0.135610,0.000221,0.000221,2020
96,0.000301,0.646880,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,...,0.000301,0.000301,0.000301,0.000301,0.000301,0.184578,0.000301,0.000301,0.163422,2020
97,0.000226,0.000226,0.000226,0.124214,0.000226,0.000226,0.000226,0.000226,0.150528,0.574930,...,0.000226,0.146707,0.000226,0.000226,0.000226,0.000226,0.000226,0.000226,0.000226,2020
98,0.000269,0.000269,0.000269,0.314754,0.000269,0.000269,0.519576,0.000269,0.161100,0.000269,...,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,2020


In [31]:
doc_topic_matrix.columns = [f"topic_{c}" if isinstance(c, int) else c for c in doc_topic_matrix.columns]

In [32]:
doc_topic_matrix

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,year_pub
0,0.000472,0.000472,0.000472,0.000472,0.000472,0.629609,0.000472,0.000472,0.000472,0.000472,...,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,2010
1,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,0.000192,...,0.749546,0.000192,0.000192,0.000192,0.000192,0.247006,0.000192,0.000192,0.000192,2010
2,0.221115,0.000255,0.000255,0.182778,0.000255,0.000255,0.490753,0.000255,0.000255,0.000255,...,0.000255,0.101272,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,2010
3,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.000549,0.209476,...,0.000549,0.000549,0.000549,0.315379,0.000549,0.000549,0.000549,0.465805,0.000549,2010
4,0.000893,0.000893,0.000893,0.000893,0.983036,0.000893,0.000893,0.000893,0.000893,0.000893,...,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,0.000221,...,0.000221,0.767550,0.000221,0.000221,0.093079,0.000221,0.135610,0.000221,0.000221,2020
96,0.000301,0.646880,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,...,0.000301,0.000301,0.000301,0.000301,0.000301,0.184578,0.000301,0.000301,0.163422,2020
97,0.000226,0.000226,0.000226,0.124214,0.000226,0.000226,0.000226,0.000226,0.150528,0.574930,...,0.000226,0.146707,0.000226,0.000226,0.000226,0.000226,0.000226,0.000226,0.000226,2020
98,0.000269,0.000269,0.000269,0.314754,0.000269,0.000269,0.519576,0.000269,0.161100,0.000269,...,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,2020


In [33]:
topics_over_time = pd.concat([data, doc_topic_matrix.reset_index(drop = True)], axis = 1)
topics_over_time.drop(columns = ['filename', 'year_pub'], inplace = True)

# topics_over_time['year'] = pd.to_datetime(topics_over_time['year'], format = '%Y')
topics_over_time.head()

Unnamed: 0,uid,name,author,pubtitle,words,score,title,pages,url,fulltext,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,3881.0,Полоса 02 .,,,,,,,,"[объединить_судостроительный_компания, оск, из...",...,0.361901,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472
1,186.0,"Россия, не верь, не бойся, не проси",,,,,,,,"[союзник, современный, концепция, гибридный, с...",...,0.000192,0.749546,0.000192,0.000192,0.000192,0.000192,0.247006,0.000192,0.000192,0.000192
2,3091.0,,IVAN DEM'IaNOV,Nash sovremennik,,,KAK BRAT'EV DELAIuT VRAGAMI,,https://dlib.eastview.com/browse/doc/45951056,"[разорвать, экономический, связь, европа, силь...",...,0.000255,0.000255,0.101272,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255
3,1499.0,,,Krasnaia zvezda,,,POSTINDUSTRIAL'NYI PROTIVNIK,,https://dlib.eastview.com/browse/doc/26505876,"[фактор, устойчивость, руководство, любой, сит...",...,0.000549,0.000549,0.000549,0.000549,0.315379,0.000549,0.000549,0.000549,0.465805,0.000549
4,3436.0,,Vladimir Dvorkin,Nezavisimoe voennoe obozrenie,,,"""TIaZhELAIa"" RAKETA DLIa STRATEGIChESKIKh IaDE...",,https://dlib.eastview.com/browse/doc/24447533,"[стационарный, группировка, мбр, середина_х, п...",...,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893


In [34]:
topics_over_time = pd.melt(topics_over_time, id_vars = ['author', 'database', 'doi', 
                                                    'fulltext', 'place','pubtitle', 'title', 'url', 
                                                    'year', 'bows'], 
            value_vars = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 
                          'topic_7', 'topic_8', 'topic_9', 'topic_10', 'topic_11', 'topic_12', 
                          'topic_13', 'topic_14', 'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19', ],
            var_name = 'topic_num',
            value_name = 'topic_weight')
topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight
0,,Федеральная пресса,,"[объединить_судостроительный_компания, оск, из...",,,,,2012.0,"[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,0.000472
1,,Федеральные интернет-издания,,"[союзник, современный, концепция, гибридный, с...",,,,,2015.0,"[(1, 2), (6, 2), (12, 1), (16, 1), (19, 1), (2...",topic_0,0.000192
2,IVAN DEM'IaNOV,UDB_EDU,,"[разорвать, экономический, связь, европа, силь...","Moscow,\n ...",Nash sovremennik,KAK BRAT'EV DELAIuT VRAGAMI,https://dlib.eastview.com/browse/doc/45951056,2015.0,"[(13, 2), (30, 1), (34, 1), (53, 1), (55, 1), ...",topic_0,0.221115


In [35]:
years = [*topics.keys()]

In [36]:
term_topics_by_time = {}
for t, year in enumerate(years):
    # since we have 20 topics - range(20)
    topics = []
    for n in range(20): 
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        topics.append(
            # round probability of each word to three values 1.111
            [(np.around(prob, 3), word) for prob, word in current_topic]
        )
    term_topics_by_time[year] = topics

In [37]:
topic_term_matrix = pd.DataFrame(term_topics_by_time) # create a df from the dictionary 
topic_term_matrix.head(5)

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"[(0.119, европа), (0.036, американский), (0.03...","[(0.115, европа), (0.036, американский), (0.03...","[(0.11, европа), (0.035, американский), (0.033...","[(0.106, европа), (0.036, американский), (0.03...","[(0.101, европа), (0.036, американский), (0.03...","[(0.097, европа), (0.035, американский), (0.03...","[(0.093, европа), (0.034, американский), (0.02...","[(0.089, европа), (0.034, американский), (0.02...","[(0.082, европа), (0.034, американский), (0.02...","[(0.076, европа), (0.034, американский), (0.02...","[(0.076, европа), (0.034, американский), (0.02..."
1,"[(0.072, ядерный_оружие), (0.064, оружие), (0....","[(0.07, ядерный_оружие), (0.061, оружие), (0.0...","[(0.067, ядерный_оружие), (0.059, оружие), (0....","[(0.068, ядерный_оружие), (0.055, ядерный), (0...","[(0.067, ядерный_оружие), (0.056, ядерный), (0...","[(0.062, ядерный_оружие), (0.056, ядерный), (0...","[(0.061, ядерный_оружие), (0.055, ядерный), (0...","[(0.059, ядерный_оружие), (0.053, ядерный), (0...","[(0.056, ядерный_оружие), (0.052, ядерный), (0...","[(0.054, ядерный_оружие), (0.052, оружие), (0....","[(0.054, оружие), (0.051, ядерный_оружие), (0...."
2,"[(0.081, сдерживание), (0.064, российский), (0...","[(0.081, сдерживание), (0.065, российский), (0...","[(0.079, сдерживание), (0.062, ядерный), (0.06...","[(0.073, сдерживание), (0.064, ядерный), (0.06...","[(0.069, ядерный), (0.067, сдерживание), (0.06...","[(0.075, ядерный), (0.066, сдерживание), (0.05...","[(0.077, ядерный), (0.064, сдерживание), (0.05...","[(0.081, ядерный), (0.06, сдерживание), (0.057...","[(0.082, ядерный), (0.058, нато), (0.057, сдер...","[(0.087, ядерный), (0.055, сдерживание), (0.05...","[(0.089, ядерный), (0.056, сдерживание), (0.05..."
3,"[(0.037, российский_федерация), (0.032, деятел...","[(0.039, российский_федерация), (0.031, интере...","[(0.04, российский_федерация), (0.029, интерес...","[(0.042, российский_федерация), (0.029, госуда...","[(0.043, российский_федерация), (0.029, госуда...","[(0.047, российский_федерация), (0.03, государ...","[(0.044, российский_федерация), (0.029, госуда...","[(0.041, российский_федерация), (0.029, госуда...","[(0.039, российский_федерация), (0.029, госуда...","[(0.037, российский_федерация), (0.031, регион...","[(0.036, российский_федерация), (0.035, регион..."
4,"[(0.08, сдерживание), (0.065, возможность), (0...","[(0.078, сдерживание), (0.062, возможность), (...","[(0.073, сдерживание), (0.066, возможность), (...","[(0.07, возможность), (0.068, сдерживание), (0...","[(0.074, возможность), (0.07, государство), (0...","[(0.08, возможность), (0.072, государство), (0...","[(0.083, возможность), (0.076, государство), (...","[(0.086, возможность), (0.08, государство), (0...","[(0.09, возможность), (0.084, государство), (0...","[(0.095, возможность), (0.085, государство), (...","[(0.095, возможность), (0.085, государство), (..."


In [38]:
topic_term_table = topic_term_matrix.T # we need to turn it
topic_term_table['year'] = topic_term_table.index.astype(int) # create an index (now indexed by years)
topic_term_table.reset_index(inplace = True, drop = True)
topic_term_table.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,year
0,"[(0.119, европа), (0.036, американский), (0.03...","[(0.072, ядерный_оружие), (0.064, оружие), (0....","[(0.081, сдерживание), (0.064, российский), (0...","[(0.037, российский_федерация), (0.032, деятел...","[(0.08, сдерживание), (0.065, возможность), (0...","[(0.099, военный), (0.059, армия), (0.036, куб...","[(0.107, украина), (0.041, европейский), (0.03...","[(0.071, президент), (0.046, франция), (0.044,...","[(0.1, военный), (0.081, сдерживание), (0.057,...","[(0.035, отношение), (0.035, система), (0.029,...",...,"[(0.092, военный), (0.054, потенциал), (0.042,...","[(0.062, стратегический), (0.057, сша), (0.051...","[(0.142, американский), (0.115, противоракетны...","[(0.077, ракетный), (0.056, американец), (0.04...","[(0.065, ядерный), (0.062, вашингтон), (0.049,...","[(0.043, военный), (0.036, средство), (0.029, ...","[(0.112, система), (0.057, сдерживание), (0.04...","[(0.023, российский), (0.022, китай), (0.02, с...","[(0.124, рф), (0.084, безопасность), (0.083, с...",2010
1,"[(0.115, европа), (0.036, американский), (0.03...","[(0.07, ядерный_оружие), (0.061, оружие), (0.0...","[(0.081, сдерживание), (0.065, российский), (0...","[(0.039, российский_федерация), (0.031, интере...","[(0.078, сдерживание), (0.062, возможность), (...","[(0.095, военный), (0.06, армия), (0.038, куба...","[(0.097, украина), (0.041, европейский), (0.03...","[(0.074, президент), (0.046, франция), (0.043,...","[(0.107, военный), (0.078, сдерживание), (0.05...","[(0.037, система), (0.037, отношение), (0.028,...",...,"[(0.1, военный), (0.059, потенциал), (0.043, с...","[(0.06, стратегический), (0.057, сша), (0.052,...","[(0.143, американский), (0.115, противоракетны...","[(0.083, ракетный), (0.051, американец), (0.04...","[(0.066, ядерный), (0.06, вашингтон), (0.048, ...","[(0.043, военный), (0.034, средство), (0.028, ...","[(0.11, система), (0.057, сдерживание), (0.04,...","[(0.024, российский), (0.022, китай), (0.021, ...","[(0.124, рф), (0.084, безопасность), (0.083, с...",2011
2,"[(0.11, европа), (0.035, американский), (0.033...","[(0.067, ядерный_оружие), (0.059, оружие), (0....","[(0.079, сдерживание), (0.062, ядерный), (0.06...","[(0.04, российский_федерация), (0.029, интерес...","[(0.073, сдерживание), (0.066, возможность), (...","[(0.091, военный), (0.061, армия), (0.041, куб...","[(0.088, украина), (0.041, европейский), (0.03...","[(0.078, президент), (0.047, франция), (0.039,...","[(0.107, военный), (0.073, сдерживание), (0.06...","[(0.038, система), (0.038, отношение), (0.03, ...",...,"[(0.096, военный), (0.062, потенциал), (0.045,...","[(0.058, сша), (0.055, стратегический), (0.054...","[(0.145, американский), (0.117, противоракетны...","[(0.089, ракетный), (0.048, праздник), (0.047,...","[(0.065, ядерный), (0.057, вашингтон), (0.046,...","[(0.043, военный), (0.033, средство), (0.028, ...","[(0.11, система), (0.056, сдерживание), (0.041...","[(0.023, российский), (0.022, китай), (0.021, ...","[(0.118, рф), (0.086, ядерный), (0.086, безопа...",2012


In [39]:
topic_term_table.columns = [f"topic_{c}" if isinstance(c, int) else c for c in topic_term_table.columns]

In [40]:
topic_term_table = pd.melt(topic_term_table, id_vars = 'year', 
                           value_vars = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 
                                         'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 
                                         'topic_10', 'topic_11', 'topic_12', 'topic_13', 'topic_14', 
                                         'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19'], 
                           var_name = 'topic_num', value_name = 'terms')

In [41]:
topics_over_time = topics_over_time.merge(topic_term_table, how = 'left', on = ['year', 'topic_num'])

topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,year,bows,topic_num,topic_weight,terms
0,,Федеральная пресса,,"[объединить_судостроительный_компания, оск, из...",,,,,2012.0,"[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,0.000472,"[(0.11, европа), (0.035, американский), (0.033..."
1,,Федеральные интернет-издания,,"[союзник, современный, концепция, гибридный, с...",,,,,2015.0,"[(1, 2), (6, 2), (12, 1), (16, 1), (19, 1), (2...",topic_0,0.000192,"[(0.097, европа), (0.035, американский), (0.03..."
2,IVAN DEM'IaNOV,UDB_EDU,,"[разорвать, экономический, связь, европа, силь...","Moscow,\n ...",Nash sovremennik,KAK BRAT'EV DELAIuT VRAGAMI,https://dlib.eastview.com/browse/doc/45951056,2015.0,"[(13, 2), (30, 1), (34, 1), (53, 1), (55, 1), ...",topic_0,0.221115,"[(0.097, европа), (0.035, американский), (0.03..."


In [42]:
topics_over_time.drop(columns = ['bows', 'doi'], inplace = True)

In [43]:
topics_over_time = topics_over_time.loc[topics_over_time["fulltext"].notnull()].reset_index(drop=True).copy()

In [44]:
def simplify(l):
    return [f"{str(prob)} {word}" for prob, word in l]

In [45]:
topics_over_time["terms"] = topics_over_time["terms"].apply(simplify)

In [46]:
topics_over_time.to_json(
    "../data/processed/topics_over_time.json", 
    lines=True,
    orient="records", 
    force_ascii=False, 
    date_format="iso"
)

# term weights over time

In [47]:
topics = []
for t, year in enumerate(years):
    for n in range(20):
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        topics.extend(
            [list(term) + [year, n] for term in current_topic]
        )

In [48]:
topics[0]

[0.11896005110378534, 'европа', 2010, 0]

In [49]:
terms_by_time = pd.DataFrame(topics, columns = ['weight', 'term', 'year', 'topic n'])
terms_by_time.head()

Unnamed: 0,weight,term,year,topic n
0,0.11896,европа,2010,0
1,0.036168,американский,2010,0
2,0.031486,вс,2010,0
3,0.02512,военный,2010,0
4,0.012856,ядерный_оружие,2010,0


In [50]:
terms_by_time.to_csv("../data/processed/terms_by_time.csv", index=False)