In [29]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import json
import string
from top2vec import Top2Vec
import multiprocessing
from scipy.special import softmax
from wordcloud import WordCloud
import os

In [3]:
# Load the RZ table from parquet. Later cells look rows up with rz.loc[<doc id>]
# and read its year, heading2, date, heading and full_text columns.
rz = pd.read_parquet('../data/raw/RZ_processed.parquet')

In [42]:
# Load the processed table with pandas' nullable dtypes, parse both date
# columns, and derive the calendar year of each document date.
df = pd.read_csv(
    '../data/processed_data.tsv',
    sep='\t',
    encoding='utf8',
).convert_dtypes()
df['doc_date'] = pd.to_datetime(df['doc_date'])
df['origin_date'] = pd.to_datetime(df['origin_date'])
df['doc_year'] = df['doc_date'].dt.year

In [5]:
# Table of places; its placename column drives the per-place loops in later cells.
places = pd.read_csv('../data/places/places.tsv', sep='\t', encoding='utf8')

In [22]:
# Load the saved Top2Vec model that the topic functions in later cells query.
# NOTE(review): Top2Vec.load presumably deserialises a pickle-style file - only
# load model files from a trusted source (confirm against the Top2Vec docs).
t2v = Top2Vec.load('../data/models/t2v_211122_100_deep.pkl')

In [2]:
def define_plot_fonts():
    """Configure matplotlib's global font settings for the plots in this notebook.

    Registers the font file at ``../references/cmunorm.ttf`` with matplotlib's
    font manager, sets the x and y tick label size to 14, and selects
    'CMU Concrete' as the default font family. Changes global matplotlib state.
    """
    fm.fontManager.addfont('../references/cmunorm.ttf')
    for tick_group in ('xtick', 'ytick'):
        matplotlib.rc(tick_group, labelsize=14)
    matplotlib.rcParams['font.family'] = 'CMU Concrete'

In [8]:
def get_averages(place):
    """Return the mean ``delta`` per document year for one place.

    Reads the module-level ``df``. The result is a Series rounded to two
    decimals and reindexed onto ``range(1802, 1889)``, so years without any
    rows for ``place`` appear as missing values.

    Parameters
    ----------
    place : value compared against ``df.placename``.
    """
    place_rows = df[df.placename == place]
    yearly_mean = place_rows.groupby('doc_year')['delta'].mean()
    return yearly_mean.round(2).reindex(range(1802, 1889))

In [12]:
def create_streamlit_df():
    """Assemble the yearly average ``delta`` of every place into one table.

    Calls ``get_averages`` once per entry of ``places.placename`` (with a tqdm
    progress bar) and returns a DataFrame with one column per place, indexed
    by ``range(1802, 1889)``.
    """
    years = range(1802, 1889)
    columns = {
        place: get_averages(place).values
        for place in tqdm(places.placename.values)
    }
    return pd.DataFrame(columns, index=years)

In [15]:
# Build the per-place yearly averages table and write it as a TSV file
# (index included) under ../streamlit/data/.
streamlit_df = create_streamlit_df()
streamlit_df.to_csv('../streamlit/data/streamlit_data.tsv', sep='\t')

100%|████████████████████████████████████████████████████████████████████████████████| 351/351 [00:10<00:00, 33.36it/s]


In [17]:
def create_placename_counts():
    """Write the number of ``df`` rows per place to a JSON file.

    For every entry of ``places.placename`` the count of rows in the
    module-level ``df`` whose ``placename`` equals it is stored (0 when the
    place never occurs). The mapping is written to
    ``../streamlit/data/placename_counts.json``.
    """
    # Count all places in a single pass instead of building one boolean mask
    # over the whole frame per place.
    occurrences = df.placename.value_counts()

    # int(): value_counts yields NumPy integers, which json cannot serialise.
    placename_counts = {
        place: int(occurrences.get(place, 0)) for place in places.placename
    }

    with open('../streamlit/data/placename_counts.json', 'w', encoding='utf8') as f:
        json.dump(placename_counts, f)

In [18]:
# Write the per-place row counts to ../streamlit/data/placename_counts.json.
create_placename_counts()

In [None]:
def get_season(origin_date):
    """Classify a date by the half-year its month falls in.

    Parameters
    ----------
    origin_date : any object exposing a ``month`` attribute.

    Returns
    -------
    's' for months 4-9, 'w' for months 10-12 and 1-3, and ``None`` when the
    month matches neither group (e.g. a missing date whose month is NaN).
    """
    month = origin_date.month
    summer_months = (4, 5, 6, 7, 8, 9)
    winter_months = (10, 11, 12, 1, 2, 3)

    if month in summer_months:
        return 's'
    if month in winter_months:
        return 'w'
    return None

def write_place_files():
    """Write one TSV file per place with its year, delta and season rows.

    For every entry of ``places.placename`` the matching rows of the
    module-level ``df`` are copied (``doc_year`` is renamed to ``year``), a
    ``season`` column is derived from ``origin_date`` via ``get_season``, and
    the ``year``, ``delta`` and ``season`` columns are written without an
    index to ``../streamlit/data/places/<name>.tsv``.
    """
    source_columns = ['origin_date', 'doc_year', 'delta']

    for name in tqdm(places.placename.values):
        # Going through .values drops the original index and column names.
        rows = df.loc[df.placename == name, source_columns].values
        place_df = pd.DataFrame(rows, columns=['origin_date', 'year', 'delta'])
        place_df['season'] = place_df['origin_date'].apply(get_season)

        output = place_df[['year', 'delta', 'season']]
        output.to_csv(f'../streamlit/data/places/{name}.tsv', sep='\t', encoding='utf8', index=False)

In [None]:
# Write one TSV per place under ../streamlit/data/places/.
write_place_files()

In [26]:
def custom_topic_wordcloud(top, savepath=None, show=False):
    """Draw a word cloud for one reduced topic of the module-level ``t2v`` model.

    The topic's words are weighted by the softmax of their scores and rendered
    as a 2400x800 word cloud on a white background.

    Parameters
    ----------
    top : index of the reduced topic.
    savepath : optional directory; when truthy the image is saved there as
        ``<top>.png``.
    show : when true the figure is shown with ``plt.show()``; otherwise it is
        closed once any requested file has been written.
    """
    cloudwidth = 2400
    cloudheight = 800

    # Query the model once; element 0 holds the words and element 1 the
    # scores, each indexed by topic.
    topics = t2v.get_topics(reduced=True)
    topic_words = topics[0][top]
    word_scores = topics[1][top]

    topic_words_dict = dict(zip(topic_words, softmax(word_scores)))

    wc = WordCloud(background_color='white', width=cloudwidth, height=cloudheight,
                   font_path='../references/cmunrm.ttf')
    wc.generate_from_frequencies(topic_words_dict)

    fig = plt.figure(figsize=(12, 4))
    plt.imshow(wc)
    plt.axis('off')
    plt.tight_layout()

    if savepath:
        # os.path.join instead of a hard-coded backslash keeps the path valid
        # on every platform.
        fig.savefig(os.path.join(savepath, f'{top}.png'), bbox_inches='tight')

    if show:
        plt.show()
    else:
        # Close rather than clear: a cleared figure stays registered with
        # pyplot, so calling this in a loop would pile up open figures.
        plt.close(fig)

In [27]:
def get_topic_stats(top, savepath=None, show=False):
    """Plot the year distribution and the ten most frequent headings of a topic.

    All documents of the reduced topic ``top`` are fetched from the
    module-level ``t2v`` model and looked up in ``rz``. The left panel is a
    histogram of ``year`` (87 bins, x-axis limited to 1802-1889); the right
    panel is a bar chart of the ten most frequent ``heading2`` values.

    Parameters
    ----------
    top : index of the reduced topic.
    savepath : optional directory; when truthy the figure is saved there as
        ``topic_<top>.png``.
    show : when true the figure is shown with ``plt.show()``; otherwise it is
        closed once any requested file has been written.
    """
    topic_size = t2v.get_topic_sizes(reduced=True)[0][top]
    topic_document_ids = t2v.search_documents_by_topic(top, topic_size, reduced=True)[2]

    # The second '_'-separated field of each document id is used as the rz index label.
    doc_ids = [int(ID.split('_')[1]) for ID in topic_document_ids]
    top_df = rz.loc[doc_ids]

    define_plot_fonts()
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    top_df.year.hist(bins=87, ax=ax1)
    ax1.set_xlim(1802, 1889)

    # Compute the heading counts once and reuse them for bars and labels.
    top_headings = top_df.heading2.value_counts()[:10]
    top_headings.plot.bar(ax=ax2)
    ax2.set_xticklabels(labels=top_headings.index, rotation=45, ha='right')

    if savepath:
        # os.path.join instead of a hard-coded backslash keeps the path valid
        # on every platform.
        fig.savefig(os.path.join(savepath, f'topic_{top}.png'), bbox_inches='tight')

    if show:
        plt.show()
    else:
        # Close rather than clear: a cleared figure stays registered with
        # pyplot, so calling this in a loop would pile up open figures.
        plt.close(fig)

In [38]:
def get_topic_examples(top, n, savepath=None):
    """Collect the text of the ``n`` documents returned for a reduced topic.

    Each document id returned by the module-level ``t2v`` model is split on
    '_' into a message id (first field) and a document id (second field).
    The message text is cut out of ``rz.loc[doc_id, 'full_text']`` using the
    ``start`` offsets stored in ``df``: from this message's ``start`` up to
    the ``start`` of row ``msg_id + 1`` of the same document, or to the end
    of the text when there is no such row.

    Parameters
    ----------
    top : index of the reduced topic.
    n : number of documents to request from the model.
    savepath : optional directory; when truthy the examples are written there
        as ``examples_<top>.json``. The topic number is part of the file name
        so that calls for different topics sharing one directory do not
        overwrite each other.

    Returns
    -------
    list of dict with the keys ``msg_id``, ``doc_id``, ``date``, ``heading``
    and ``text``. Documents whose id does not occur in ``df.doc_id`` are
    skipped, because no span can be determined for them.

    Raises
    ------
    KeyError
        If a document has several rows in ``df`` and ``msg_id`` is not one of
        their index labels, or if ``doc_id`` is not in the index of ``rz``.
    """
    examples = t2v.search_documents_by_topic(top, n, reduced=True)[2]

    example_texts = []

    for example_id in examples:
        id_fields = example_id.split('_')
        msg_id = int(id_fields[0])
        doc_id = int(id_fields[1])

        article = df.loc[df.doc_id == doc_id]

        if len(article) == 0:
            # Some model documents have no rows in df; without a start offset
            # there is nothing to slice, so skip them.
            continue

        if len(article) == 1:
            span_start = article.start.values[0]
            # None (not -1) so the slice keeps the final character of the text.
            span_end = None
        else:
            span_start = article.loc[msg_id, 'start']
            next_id = msg_id + 1
            # The last message of an article has no successor row; it then
            # runs to the end of the text.
            span_end = article.loc[next_id, 'start'] if next_id in article.index else None

        example_texts.append(
            {
                "msg_id": msg_id,
                "doc_id": doc_id,
                "date": rz.loc[doc_id, 'date'],
                "heading": rz.loc[doc_id, 'heading'],
                "text": rz.loc[doc_id, 'full_text'][span_start:span_end],
            }
        )

    if savepath:
        output_file = os.path.join(savepath, f'examples_{top}.json')
        with open(output_file, 'w', encoding='utf8') as f:
            # default=str covers values json cannot encode natively
            # (e.g. if the date column holds timestamps - not verified here).
            json.dump(example_texts, f, default=str)

    return example_texts

In [39]:
# Debug probe: request 4 examples for reduced topic 40.
# NOTE(review): the recorded output below looks stale - it is a tuple of two id
# lists, which is not what get_topic_examples returns; re-run or remove this cell.
get_topic_examples(40, 4)

([86008, 79929, 86138, 126369], [123484, 117754, 123611, 153437])

In [44]:
# Debug probe: check whether doc id 123484 occurs in df.doc_id (recorded result: False).
123484 in df.doc_id.values

False

In [41]:
# Debug probe: select the df rows for doc id 123484 (the recorded output shows
# only the column header, i.e. no matching rows).
df.loc[df.doc_id == 123484]

Unnamed: 0,doc_id,doc_date,placename,day,day2,month,month2,origin_year,start,end,origin_date,delta,doc_year


In [None]:
def create_topic_data_for_streamlit(reduction, base_dir='../streamlit/data/topics'):
    """Generate word clouds, statistics plots and examples for every reduced topic.

    Reduces the module-level ``t2v`` model to ``reduction`` topics and then,
    for each topic index in ``range(reduction)``, calls
    ``custom_topic_wordcloud``, ``get_topic_stats`` and ``get_topic_examples``
    (10 examples each), writing into the ``wordclouds``, ``statistics`` and
    ``examples`` sub-directories of ``<base_dir>/reduction_<reduction>``.

    Parameters
    ----------
    reduction : number of topics to reduce the model to.
    base_dir : root directory for the output. Defaults to a path relative to
        the working directory, like the other ``../streamlit/data`` outputs of
        this notebook, instead of an absolute path on one user's machine.
        NOTE(review): assumes the notebook runs from a direct sub-directory
        of the project root - confirm.
    """
    directory = os.path.join(base_dir, f'reduction_{reduction}')
    # makedirs also creates missing parents and tolerates existing directories.
    os.makedirs(directory, exist_ok=True)

    # The per-topic functions query the reduced topics, so reduce first.
    print(f'Performing reduction to {reduction} topics')
    t2v.hierarchical_topic_reduction(reduction)

    print('Generating wordclouds')
    wordcloud_dir = os.path.join(directory, 'wordclouds')
    os.makedirs(wordcloud_dir, exist_ok=True)
    for top in tqdm(range(reduction)):
        custom_topic_wordcloud(top, show=False, savepath=wordcloud_dir)

    print('Generating statistics')
    statistics_dir = os.path.join(directory, 'statistics')
    os.makedirs(statistics_dir, exist_ok=True)
    for top in tqdm(range(reduction)):
        get_topic_stats(top, show=False, savepath=statistics_dir)

    print('Fetching examples')
    examples_dir = os.path.join(directory, 'examples')
    os.makedirs(examples_dir, exist_ok=True)
    for top in tqdm(range(reduction)):
        get_topic_examples(top, 10, examples_dir)

    print('Finished')