In [1]:
import os
import time
import logging
import numpy as np
import pandas as pd
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.models.wrappers import DtmModel

In [2]:
# Configure the root logger to emit records at DEBUG level and above, so the
# progress messages of the libraries used below appear under the cells.
logging.basicConfig(level=logging.DEBUG)

# Data preprocessing

In [3]:
# Load the excerpt table; every cell of the `np` column is passed through the
# converter while the file is parsed.
# NOTE(review): `eval` executes whatever expression a CSV cell contains. If the
# cells are plain list literals, `ast.literal_eval` would be the safe drop-in;
# confirm the column format before switching.
data = pd.read_csv(
    "./../data/raw/noun-phrases-without-adv.csv",
    converters={"np": eval},
)

In [4]:
# (rows, columns) of the table as loaded
data.shape

(7799, 22)

In [11]:
# Parse the `date` column into datetimes, then derive the calendar year of
# each row into a new `year` column.
data = data.assign(date=pd.to_datetime(data["date"]))
data = data.assign(year=data["date"].dt.year)

In [13]:
# Keep only the rows that have a year, then
#   * order them chronologically: `time_slices` passed to the DTM model is a
#     list of per-year document counts in ascending year order, so the corpus
#     rows have to be in that same order (mergesort is stable, so rows keep
#     their relative order within a year and an already-sorted table is
#     unchanged);
#   * renumber the index 0..n-1, so that later positional joins with the
#     document-topic matrix (pd.concat(axis=1) aligns on index labels) pair
#     each document with its own row.
# NOTE(review): if this reorders rows, a model trained on the previous row
# order no longer lines up with `data` and has to be retrained.
data = (
    data.loc[data["year"].notnull()]
    .sort_values("year", kind="mergesort")
    .reset_index(drop=True)
)

In [14]:
# With the undated rows gone, the year can be stored as an integer.
data = data.astype({"year": int})

In [15]:
# (rows, columns) after dropping undated rows and adding the `year` column
data.shape

(7793, 23)

In [16]:
# Count the excerpts published in each year and list the years in ascending
# order; the counts, in that order, are the time slices handed to the model.
dates_cnt = (
    data["year"]
    .value_counts()
    .rename_axis("year")
    .reset_index(name="excerpt_count")
    .sort_values("year", ascending=True)
)
time_seq = dates_cnt["excerpt_count"].to_list()

In [17]:
# # Count the number of full texts in each year
# dates_count = data.groupby("year").agg(n_paragraphs = ("fulltext", "size"))
# dates_count = dates_count.loc[dates_count['n_paragraphs'].ne(0)].reset_index()
# dates_count["n_paragraphs"].sum()

In [18]:
# time_seq = dates_count["n_paragraphs"].to_list()
# Sanity check: the per-year counts should add up to the number of rows in `data`
sum(time_seq)

7793

# Dictionary and BOWs

In [19]:
# Build the token <-> id vocabulary from the noun-phrase list of every row
dictionary = corpora.Dictionary(documents=data['np'])
print(f'{len(dictionary)} tokens overall')

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(524452 unique tokens: ['Algerian opposition leaders', 'Balkan leader', 'Beijing rule', 'Belgrade last week', 'Belgrade military hospital']...) from 7793 documents (total 1069049 corpus positions)


524452 tokens overall


In [21]:
# Prune the vocabulary: keep tokens found in at least 2 documents and in at
# most 99% of them (see the "keeping ... tokens" log line below), with
# keep_n=200000 as the upper bound passed for the vocabulary size.
dictionary.filter_extremes(no_below = 2, no_above = 0.99, keep_n=200000)
# NOTE(review): the log below reports "rebuilding dictionary, shrinking gaps"
# twice, which suggests filter_extremes already compacts the ids and this
# explicit call repeats it — confirm before removing.
dictionary.compactify()  # make token IDs sequential

INFO:gensim.corpora.dictionary:discarding 87055 tokens: [('Balkan leader', 1), ('Beijing rule', 1), ('Belgrade last week', 1), ('Belgrade military hospital', 1), ('Bosnian Serb headquarters', 1), ('Chinese prison', 1), ('Constable Glenn Schweyer', 1), ('EUROPE Official', 1), ('Garry Styles', 1), ('Julius Rose', 1)]...
INFO:gensim.corpora.dictionary:keeping 112945 tokens which were in no less than 2 and no more than 7715 (=99.0%) documents
DEBUG:gensim.corpora.dictionary:rebuilding dictionary, shrinking gaps
INFO:gensim.corpora.dictionary:resulting dictionary: Dictionary(112945 unique tokens: ['Algerian opposition leaders', 'Chinese authorities', 'Education Minister Domingo Palermo', 'European defense policy', 'French newspaper Le Monde']...)
DEBUG:gensim.corpora.dictionary:rebuilding dictionary, shrinking gaps


In [22]:
# Encode each row's noun-phrase list as a bag of words, i.e. a list of
# (token_id, count) pairs, stored in a new `bows` column.
data['bows'] = data['np'].map(dictionary.doc2bow)
# Show the encoding of the row labelled 0 as an example
print(data['bows'][0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)]


# DTM model training

In [None]:
# Train the dynamic topic model and record wall-clock timestamps around it
# (the recorded run further down reports roughly 12,164 seconds).
start = time.time()
model = DtmModel(
    # presumably the location of the external DTM executable — verify against
    # the gensim wrapper documentation
    dtm_path="../models/dtm/main",
    corpus=data["bows"].values,
    # Per-year document counts in ascending year order.
    # NOTE(review): this presumably requires the corpus rows to be in the same
    # chronological order — confirm that `data` is sorted by year.
    time_slices=time_seq,
    num_topics=20,
    id2word=dictionary,
    initialize_lda=True,
    top_chain_var=0.05
)
finish = time.time()

INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to /tmp/cbb3a9_train-mult.dat
INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus
INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into /tmp/cbb3a9_train-mult.dat
DEBUG:smart_open.smart_open_lib:{'uri': '/tmp/cbb3a9_train-mult.dat', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
INFO:gensim.corpora.bleicorpus:saving vocabulary of 112945 words to /tmp/cbb3a9_train-mult.dat.vocab
DEBUG:smart_open.smart_open_lib:{'uri': '/tmp/cbb3a9_train-mult.dat.vocab', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
DEBUG:smart_open.smart_open_lib:{'uri': '/tmp/cbb3a9_train-seq.dat', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': No

In [None]:
# Elapsed wall-clock time of the training cell, in seconds
finish - start

In [15]:
# Same expression as the previous cell; this copy holds the recorded duration
# (in seconds) of an earlier run.
finish - start

12163.771802186966

In [None]:
# Persist the trained model so the sections below can start from the file
model.save("../models/dtm-noun-phrases.model")

---

# Topic weights over time

In [None]:
# Reload the saved model; this rebinds `model`, replacing any instance that is
# still in memory from the training cell.
model = DtmModel.load("../models/dtm-noun-phrases.model")

In [None]:
# Year -> excerpt count. `dates_cnt` is ordered by year, so iterating the
# dict visits the years in ascending order and the position of a year is its
# time-slice number.
time_seq_d = dates_cnt.set_index("year")["excerpt_count"].to_dict()

# For every time slice keep the first element of the tuple returned by
# dtm_vis, i.e. the document-topic matrix.
# NOTE(review): the length checks further down suggest each stored matrix
# covers the whole corpus rather than one year; if the matrix does not depend
# on `time`, a single call would be enough — confirm against the gensim docs.
topics = {}
for idx, year in enumerate(time_seq_d):
    topics[year] = model.dtm_vis(time=idx, corpus=data['bows'])[0]

In [None]:
# Years for which a document-topic matrix was stored
topics.keys()

In [None]:
# Number of rows in the document-topic matrix stored under the year 2010
len(topics[2010])

In [None]:
# Length of the first row of that matrix (one entry per topic, presumably)
len(topics[2010][0])

In [31]:
# first_doc_index = 0
# for year, time in zip(time_seq_d.keys(), time_seq):
#     last_doc_index = first_doc_index + time
#     topics[f"{year}_seq"] = topics[year][first_doc_index:last_doc_index]
#     first_doc_index = first_doc_index + year_slice

In [None]:
# Stack the per-year document-topic matrices into one long frame, tagging
# every row with the year key it was stored under.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop copies all rows accumulated so far on every pass.
year_frames = []
for year_key, doc_topic_rows in topics.items():
    year_frame = pd.DataFrame.from_records(doc_topic_rows)
    year_frame['year_pub'] = year_key
    year_frames.append(year_frame)
# pd.concat rejects an empty list; fall back to an empty frame in that case,
# which is what the original empty accumulator produced.
doc_topic_matrix = pd.concat(year_frames) if year_frames else pd.DataFrame()
# let's see how it looks
doc_topic_matrix

In [None]:
# Integer column labels become "topic_<n>"; any other label (such as
# 'year_pub') is left as it is.
doc_topic_matrix.columns = [
    label if not isinstance(label, int) else f"topic_{label}"
    for label in doc_topic_matrix.columns
]

In [None]:
# Display the frame again to check the renamed columns
doc_topic_matrix

In [None]:
# Attach every document's topic weights to its metadata row.
# pd.concat(axis=1) pairs rows by index label, and `data` can carry
# non-contiguous labels because rows were filtered out after loading, so both
# sides are renumbered 0..n-1 to pair them by position instead.
# Assumes row i of the document-topic matrix describes the i-th row of `data`
# (the corpus was built from `data['bows']` in row order) — TODO confirm.
n_docs = len(data)
topics_over_time = pd.concat(
    [
        data.reset_index(drop=True),
        # Rows past n_docs have no document to attach to: they would only
        # produce all-NaN metadata rows, which the later `np` not-null filter
        # throws away anyway.
        doc_topic_matrix.iloc[:n_docs].reset_index(drop=True),
    ],
    axis=1,
)
topics_over_time = topics_over_time.drop(columns=['filename', 'year_pub'])

# topics_over_time['year'] = pd.to_datetime(topics_over_time['year'], format = '%Y')
topics_over_time.head()

In [None]:
# Reshape from wide (one column per topic) to long: one row per
# (document, topic) pair, with the topic label in `topic_num` and its weight
# in `topic_weight`. Only the metadata columns listed here are carried over.
metadata_columns = [
    'author', 'database', 'doi', 'fulltext', 'np', 'place',
    'pubtitle', 'title', 'url', 'year', 'bows',
]
topic_columns = [f"topic_{i}" for i in range(20)]  # topic_0 ... topic_19
topics_over_time = topics_over_time.melt(
    id_vars=metadata_columns,
    value_vars=topic_columns,
    var_name='topic_num',
    value_name='topic_weight',
)
topics_over_time.head(3)

In [None]:
# The years that key the `topics` dict, in insertion order
years = list(topics)

In [None]:
# For every time slice, collect the 15 top terms of every topic as
# (rounded probability, term) pairs: year -> list with one entry per topic.
# The per-year list is named `year_topics` so that the `topics` dict built
# earlier (year -> document-topic matrix) is not overwritten; rebinding
# `topics` to a list here made `years = [*topics.keys()]` fail on a re-run.
term_topics_by_time = {}
for t, year in enumerate(years):
    year_topics = []
    # iterate over however many topics the model was trained with
    for n in range(model.num_topics):
        current_topic = model.show_topic(
            topicid=n,
            time=t,
            topn=15 # top 15 most salient terms
        )
        year_topics.append(
            # round each probability to three decimals, e.g. 0.123
            [(np.around(prob, 3), word) for prob, word in current_topic]
        )
    term_topics_by_time[year] = year_topics

In [None]:
# One column per year and one row per topic position; each cell holds that
# topic's list of (rounded probability, term) pairs for the year.
topic_term_matrix = pd.DataFrame(term_topics_by_time)
topic_term_matrix.head(5)

In [None]:
# Transpose so that each row is a year and each column a topic, copy the year
# labels from the index into an integer `year` column, then replace the index
# with a plain 0..n-1 range.
topic_term_table = (
    topic_term_matrix.T
    .assign(year=lambda frame: frame.index.astype(int))
    .reset_index(drop=True)
)
topic_term_table.head(3)

In [None]:
# Integer column labels become "topic_<n>"; the 'year' column keeps its name.
topic_term_table.columns = [
    label if not isinstance(label, int) else f"topic_{label}"
    for label in topic_term_table.columns
]

In [None]:
# Long format: one row per (year, topic) with that topic's term list in `terms`
topic_term_table = topic_term_table.melt(
    id_vars='year',
    value_vars=[f"topic_{i}" for i in range(20)],  # topic_0 ... topic_19
    var_name='topic_num',
    value_name='terms',
)

In [None]:
# Display the long-format (year, topic_num, terms) table
topic_term_table

In [None]:
# Add each topic's term list for the document's year; the left join keeps
# every (document, topic) row even when no term list matches.
topics_over_time = pd.merge(
    topics_over_time,
    topic_term_table,
    on=['year', 'topic_num'],
    how='left',
)

topics_over_time.head(3)

In [None]:
# The `bows` and `doi` columns are not part of the exported table
topics_over_time = topics_over_time.drop(columns=['bows', 'doi'])

In [None]:
# Discard rows that have no noun-phrase list and renumber what remains
topics_over_time = topics_over_time[topics_over_time["np"].notna()].reset_index(drop=True)

In [None]:
def simplify(l):
    """Render each (prob, word) pair of `l` as the string "<prob> <word>".

    Returns a new list with one string per pair, in the original order.
    """
    rendered = []
    for prob, word in l:
        rendered.append(f"{str(prob)} {word}")
    return rendered

In [None]:
# Turn every term list into a list of plain "<prob> <word>" strings
topics_over_time["terms"] = topics_over_time["terms"].apply(simplify)

In [None]:
# Export as JSON Lines (one record per row), keeping non-ASCII characters
# unescaped and writing dates in ISO format.
topics_over_time.to_json(
    path_or_buf="../data/processed/noun-phrases-topics_over_time.json",
    orient="records",
    lines=True,
    date_format="iso",
    force_ascii=False,
)

# Term weights over time

In [None]:
# Flat list with one row per (time slice, topic, top term): the fields
# returned by show_topic followed by the year and the topic number.
topics = []
for t, year in enumerate(years):
    for n in range(20):
        # top 15 most salient terms of topic n in time slice t
        top_terms = model.show_topic(topicid=n, time=t, topn=15)
        topics.extend([*entry, year, n] for entry in top_terms)

In [None]:
# One row per (year, topic, term) with the term's weight in that topic/year
terms_by_time = pd.DataFrame(topics, columns = ['weight', 'term', 'year', 'topic n'])
terms_by_time.head()

In [None]:
# Export the term table as CSV without the row index
terms_by_time.to_csv("../data/processed/noun-phrases-terms_over_time.csv", index=False)

In [None]:
# Sanity check: total weight of the stored top terms per (year, topic).
# Only `weight` is aggregated; a bare .sum() would also try to add up the
# `term` strings, which is meaningless for this check.
terms_by_time.groupby(["year", "topic n"])[["weight"]].sum()