In [7]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import json
import string
from top2vec import Top2Vec
import multiprocessing
from scipy.special import softmax
from wordcloud import WordCloud
import os

In [2]:
rz = pd.read_parquet('../data/raw/RZ_processed.parquet')

In [3]:
df = pd.read_csv('../data/processed_data.tsv', sep='\t', encoding='utf8').convert_dtypes()
df.doc_date = pd.to_datetime(df.doc_date)
df.origin_date = pd.to_datetime(df.origin_date)
df['doc_year'] = df.doc_date.dt.year

In [4]:
places = pd.read_csv('../data/places/places.tsv', sep='\t', encoding='utf8')

In [47]:
t2v = Top2Vec.load('../data/models/t2v_211122_100_deep.pkl')

In [2]:
def define_plot_fonts():
    fm.fontManager.addfont('../references/cmunorm.ttf')
    matplotlib.rc('xtick', labelsize=14) 
    matplotlib.rc('ytick', labelsize=14)
    matplotlib.rcParams['font.family'] = 'CMU Concrete'

In [8]:
def get_averages(place):
    
    return df[df.placename == place].groupby('doc_year')['delta'].mean().round(2).reindex(range(1802, 1889))

In [12]:
def create_streamlit_df():
    
    data = {}
    
    for place in tqdm(places.placename.values):
        data[place] = (get_averages(place).values)
        
    return pd.DataFrame(data, index=range(1802, 1889))

In [15]:
streamlit_df = create_streamlit_df()
streamlit_df.to_csv('../streamlit/data/streamlit_data.tsv', sep='\t')

100%|████████████████████████████████████████████████████████████████████████████████| 351/351 [00:10<00:00, 33.36it/s]


In [17]:
def create_placename_counts():
    placename_counts = {place: len(df[df.placename == place]) for place in places.placename}
    with open('../streamlit/data/placename_counts.json', 'w', encoding='utf8') as f:
        json.dump(placename_counts, f)

In [18]:
create_placename_counts()

In [None]:
def get_season(origin_date):
    if origin_date.month in range(4, 10):
        return 's'
    elif origin_date.month in [10, 11, 12, 1, 2, 3]:
        return 'w'

def write_place_files():
    
    for name in tqdm(places.placename.values):
        place_df = pd.DataFrame(df.loc[df.placename == name, ['origin_date', 'doc_year', 'delta']].values,
                        columns=['origin_date', 'year', 'delta'])
        place_df['season'] = place_df.origin_date.apply(get_season)
        place_df = place_df[['year', 'delta', 'season']]
        place_df.to_csv(f'../streamlit/data/places/{name}.tsv', sep='\t', encoding='utf8', index=False) 

In [None]:
write_place_files()

In [26]:
def custom_topic_wordcloud(top, savepath=None, show=False):
    
    cloudwidth = 2400
    cloudheight = 800
    
    topic_words = t2v.get_topics(reduced=True)[0][top]
    word_scores = t2v.get_topics(reduced=True)[1][top]
    
    topic_words_dict = dict(zip(topic_words, softmax(word_scores)))
    
    wc = WordCloud(background_color='white', width=cloudwidth, height=cloudheight,
                   font_path='../references/cmunrm.ttf')
    
    wc.generate_from_frequencies(topic_words_dict)
    
    plt.figure(figsize=(12, 4))
    plt.imshow(wc)
    plt.axis('off')
        
    plt.tight_layout()
    
    if savepath:
        plt.savefig(f'{savepath}\\{top}.png', bbox_inches='tight')
        
    if show:
        plt.show()
    else:
        plt.clf()

In [27]:
def get_topic_stats(top, savepath=None, show=False):
    
    topic_size = t2v.get_topic_sizes(reduced=True)[0][top]
    topic_document_ids = t2v.search_documents_by_topic(top, topic_size, reduced=True)[2]
    
    doc_ids = [int(ID.split('_')[1]) for ID in topic_document_ids]
    
    top_df = rz.loc[doc_ids]
    
    define_plot_fonts()
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    
    top_df.year.hist(bins=87, ax=ax1)
    ax1.set_xlim(1802, 1889)
    
    top_df.heading2.value_counts()[:10].plot.bar(ax=ax2)
    ax2.set_xticklabels(labels=top_df.heading2.value_counts()[:10].index, rotation=45, ha='right')
        
    if savepath:
        plt.savefig(f'{savepath}\\topic_{str(top)}.png', bbox_inches='tight')
    
    if show:
        plt.show()
    else:
        plt.clf()

In [31]:
def get_topic_examples(top, n, savepath=None):
    
    examples = t2v.search_documents_by_topic(top, n, reduced=True)[2]
    
    msg_ids = [int(ex.split('_')[0]) for ex in examples]
    doc_ids = [int(ex.split('_')[1]) for ex in examples]
        
    example_texts = []
    
    for msg_id, doc_id in zip(msg_ids, doc_ids):
        article = df.loc[df.doc_id == doc_id]
        
        if len(article) == 1:
            span_start = article.start.values[0]
            span_end = -1
    
        elif article.index[-1] == msg_id:
            print('here')
            span_start = article.loc[msg_id, 'start']
            span_end = -1
            
        elif len(article) > 1:
            span_start = article.loc[msg_id, 'start']
            span_end = article.loc[msg_id+1, 'start']
            print(span_start, span_end)
                    
        example_texts.append(
                                {"msg_id": msg_id,
                                 "doc_id": doc_id,
                                 "date": rz.loc[doc_id, 'date'],
                                 "heading": rz.loc[doc_id, 'heading'],
                                 "text": rz.loc[doc_id, 'full_text'][span_start:span_end]
                                }
                            )
        
    if savepath:
        with open(savepath+f'\\examples_{top}.json', 'w', encoding='utf8') as f:
            json.dump(example_texts, f)
    else:
        return example_texts

In [82]:
def create_topic_data_for_streamlit(reduction):
    
    directory = f'C:\\Users\\krister\\py_projects\\rz_acta\\streamlit\\data\\topics\\reduction_{reduction}'    
    if not os.path.exists(directory):
        os.mkdir(directory)
    
    
    print(f'Performing reduction to {reduction} topics')
    t2v.hierarchical_topic_reduction(reduction)
    
    
    print('Generating wordclouds')
    if not os.path.exists(directory+'\\wordclouds'):
        os.mkdir(directory+'\\wordclouds')
        
    for top in tqdm(range(reduction)):
        custom_topic_wordcloud(top, show=False, savepath=directory+'\\wordclouds')
        
    
    print('Generating statistics')
    if not os.path.exists(directory+'\\statistics'):
        os.mkdir(directory+'\\statistics')
        
    for top in tqdm(range(reduction)):
        get_topic_stats(top, show=False, savepath=directory+'\\statistics')
        
        
    print('Fetching examples')
    if not os.path.exists(directory+'\\examples'):
        os.mkdir(directory+'\\examples')
        
    for top in tqdm(range(reduction)):
        get_topic_examples(top, 10, directory+'\\examples')
        
        
    with open(directory+'\\sizes.json', 'w', encoding='utf8') as f:
        sizes = [int(i) for i in list(t2v.get_topic_sizes(reduced=True)[0])]
        json.dump(sizes, f)
        
        
    print('Finished')

In [83]:
create_topic_data_for_streamlit(15)

Performing reduction to 15 topics
Generating wordclouds


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:11<00:00,  4.80s/it]


Generating statistics


  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:14<00:00,  1.04it/s]


Fetching examples


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 27.87it/s]

Finished





<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

In [84]:
create_topic_data_for_streamlit(30)

Performing reduction to 30 topics
Generating wordclouds


  plt.figure(figsize=(12, 4))
100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [02:21<00:00,  4.70s/it]


Generating statistics


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:35<00:00,  1.17s/it]


Fetching examples


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 24.88it/s]

Finished





<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

In [79]:
create_topic_data_for_streamlit(60)

Performing reduction to 60 topics
Generating wordclouds


  plt.figure(figsize=(12, 4))
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [05:31<00:00,  5.52s/it]


Generating statistics


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [01:15<00:00,  1.25s/it]


Fetching examples


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 27.66it/s]


Finished


<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

<Figure size 1200x400 with 0 Axes>

In [34]:
lst = []
lst.append([3,4,5])
lst.append([3,4,7])
lst

[[3, 4, 5], [3, 4, 7]]

In [37]:
t2v.hierarchical_topic_reduction(72)

[[0],
 [141, 145, 165, 2],
 [83, 1],
 [173, 100, 119, 164, 235, 9],
 [129, 146, 177, 5],
 [253, 325, 204, 180, 339, 106, 307, 232, 322, 187, 3],
 [89, 94, 50],
 [71, 6],
 [67, 72, 118, 8],
 [47, 230, 318, 112, 261, 80, 182, 13],
 [239, 104, 4],
 [59, 48, 90, 53],
 [181, 10],
 [117, 183, 11],
 [255, 58, 55, 262, 249, 131, 174, 192, 39],
 [157, 267, 69, 144, 190, 105, 176, 300, 217, 225, 248, 56],
 [333,
  228,
  288,
  308,
  186,
  263,
  266,
  312,
  324,
  199,
  241,
  354,
  299,
  275,
  349,
  178,
  366,
  329,
  379,
  301,
  305,
  155],
 [137, 130, 96, 12],
 [378,
  269,
  196,
  367,
  278,
  224,
  359,
  216,
  294,
  370,
  203,
  360,
  320,
  343,
  364,
  311,
  338,
  348,
  380,
  317,
  197,
  374,
  298,
  234,
  327,
  287,
  309,
  304,
  356,
  363,
  350,
  351,
  340,
  358,
  169],
 [140, 218, 86, 92, 236, 161, 123, 207, 168, 111, 132, 70],
 [7],
 [285, 84, 97, 171, 19],
 [211, 254, 61, 337, 150, 49],
 [122, 65, 18],
 [20],
 [24],
 [293,
  330,
  371,
  357,

In [38]:
h71_1 = t2v.get_topic_hierarchy()

In [48]:
t2v.hierarchical_topic_reduction(72)

[[0],
 [141, 145, 165, 2],
 [83, 1],
 [173, 100, 119, 164, 235, 9],
 [129, 146, 177, 5],
 [253, 325, 204, 180, 339, 106, 307, 232, 322, 187, 3],
 [89, 94, 50],
 [71, 6],
 [67, 72, 118, 8],
 [47, 230, 318, 112, 261, 80, 182, 13],
 [239, 104, 4],
 [59, 48, 90, 53],
 [181, 10],
 [117, 183, 11],
 [255, 58, 55, 262, 249, 131, 174, 192, 39],
 [157, 267, 69, 144, 190, 105, 176, 300, 217, 225, 248, 56],
 [333,
  228,
  288,
  308,
  186,
  263,
  266,
  312,
  324,
  199,
  241,
  354,
  299,
  275,
  349,
  178,
  366,
  329,
  379,
  301,
  305,
  155],
 [137, 130, 96, 12],
 [378,
  269,
  196,
  367,
  278,
  224,
  359,
  216,
  294,
  370,
  203,
  360,
  320,
  343,
  364,
  311,
  338,
  348,
  380,
  317,
  197,
  374,
  298,
  234,
  327,
  287,
  309,
  304,
  356,
  363,
  350,
  351,
  340,
  358,
  169],
 [140, 218, 86, 92, 236, 161, 123, 207, 168, 111, 132, 70],
 [7],
 [285, 84, 97, 171, 19],
 [211, 254, 61, 337, 150, 49],
 [122, 65, 18],
 [20],
 [24],
 [293,
  330,
  371,
  357,

In [49]:
h71_2 = t2v.get_topic_hierarchy()

In [50]:
h71_1 == h71_2

True

In [77]:
t2v.get_topic_hierarchy()[-1]

[151, 147, 77]

In [82]:
size = 0

for subtop in [151, 147, 77]:
    subtop_size = t2v.get_topic_sizes()[0][subtop]
    print(subtop_size)
    size += subtop_size
    
size

499
521
980


2000

In [83]:
size in t2v.get_topic_sizes(reduced=True)[0]

False

In [72]:
sum([555, 524, 435, 3607])

5121

In [69]:
t2v.get_topic_sizes(reduced=False)

(array([4700, 4609, 3607, 3448, 3361, 3294, 3140, 3020, 2944, 2909, 2701,
        2688, 2548, 2335, 2303, 2291, 2290, 2244, 2154, 2082, 2059, 1985,
        1893, 1870, 1765, 1713, 1706, 1686, 1621, 1610, 1574, 1525, 1505,
        1476, 1454, 1426, 1400, 1385, 1381, 1375, 1361, 1354, 1348, 1334,
        1279, 1278, 1259, 1245, 1204, 1197, 1188, 1185, 1160, 1156, 1151,
        1149, 1144, 1136, 1130, 1116, 1110, 1107, 1106, 1098, 1079, 1078,
        1069, 1065, 1059, 1057, 1042, 1033, 1023, 1013, 1002,  987,  983,
         980,  953,  949,  934,  931,  928,  907,  896,  884,  880,  875,
         863,  852,  847,  838,  837,  833,  813,  810,  800,  796,  796,
         790,  789,  785,  784,  781,  766,  752,  740,  733,  726,  724,
         704,  704,  704,  696,  694,  683,  679,  678,  677,  660,  654,
         649,  644,  643,  632,  631,  627,  624,  608,  604,  600,  599,
         595,  591,  587,  581,  577,  571,  570,  561,  555,  555,  549,
         546,  535,  524,  521,  521, 

In [81]:
for top in range(60):
    get_topic_examples(top, 20, savepath='../streamlit/data/topics/reduction_60/examples/')

In [None]:
#Kopenhagen, 17. (5.) März. ellend,  London, 17 (5 ) März.

In [100]:
test_df = df[(df.doc_date.dt.year == 1886) & (df.doc_date.dt.month == 2) & (df.doc_date.dt.day == 13)]

In [9]:
def simple_segmentation(df, rz):
    
    for ix in tqdm(df.doc_id.unique()):
        
        article = df.loc[df.doc_id == ix]
        full_text = rz.loc[ix, 'full_text']
        
        msg_ids, placenames, starts, ends = list(article.index), list(article.placename), list(article.start), list(article.end)
        
        for msg_id, placename, end, start in zip(msg_ids, placenames, ends, starts[1:]+[len(full_text)]):
            
            yield {"msg_id": msg_id,
                   "doc_id": int(ix),
                   "placename": placename,
                   "text": full_text[end:start]}

In [11]:
df.loc[(df.doc_date.dt.year == 1856) & (df.doc_date.dt.month == 6) & (df.doc_date.dt.day == 11)]

Unnamed: 0,doc_id,doc_date,placename,day,day2,month,month2,origin_year,start,end,origin_date,delta,doc_year
95260,130633,1856-06-11,St. Petersburg,6,,6,,,26,52,1856-06-06,5,1856
95261,130634,1856-06-11,Kopenhagen,18,,6,,,11,34,1856-06-06,5,1856
95262,130635,1856-06-11,Berlin,19,,6,,,14,33,1856-06-07,4,1856
95263,130635,1856-06-11,Berlin,20,,6,,,798,816,1856-06-08,3,1856
95264,130635,1856-06-11,Wiesbaden,17,,6,,,1079,1100,1856-06-05,6,1856
95265,130635,1856-06-11,Darmstadt,17,,6,,,1374,1395,1856-06-05,6,1856
95266,130635,1856-06-11,Stuttgart,17,,6,,,1885,1906,1856-06-05,6,1856
95267,130635,1856-06-11,Stuttgart,18,,6,,,2046,2067,1856-06-06,5,1856
95268,130636,1856-06-11,Wien,17,,6,,,0,16,1856-06-05,6,1856
95269,130636,1856-06-11,Wien,19,,6,,,840,856,1856-06-07,4,1856


In [29]:
msg_id, doc_id = 95267, 130635

article = df.loc[df.doc_id == doc_id]

if len(article) == 1:
    span_start = article.start.values[0]
    span_end = -1
    
elif article.index[-1] == msg_id:
    print('here')
    span_start = article.loc[msg_id, 'start']
    span_end = -1
    
elif len(article) > 1:
    span_start = article.loc[msg_id, 'start']
    span_end = article.loc[msg_id+1, 'start']
    print(span_start, span_end)
    
print(rz.loc[doc_id, 'full_text'][span_start:span_end])

here
Stuttgart, 18. Juni. St.-A) Gestern Nachmittag 3 Uhr langten Ihre Majestät die verwittwete Kaiserin von Rußland und Se. Kaiserl. Hoheit der Großfürst Michael mit hohem Gefolge auf der Station Feuerbach, von Frankfurt kommend, mit einem Extrazuge an und begaben fich von dort aus zu Wagen in die Kronprinzltche Villa bei Berg. Se. Majestät der König waren Ihrer Majestät bis Ludwigsburg, Ihre Königl. Hoheiten der Kronprinz und die Kronprinzessin bis Bruchsal entgegengefahren. Abends 7 Uhr langten Se. Majestät der König von Preußen, empfangen von Sr. Majestät dem Könige, Ihren Königl. Hoheiten dem Kronprinzen und der Kronprinzesstu, Seiner Hoheit dem Prinzen vonSachsen-Weimar u. s. w.. im hiesigen Bahnhofe an und nahmen Ihr Absteigequartier im Königlichen Refidenzschlosse. Gestern fand im Königlichen Schlosse Familien-Souper statt. Heute wird große Tafel daselbst und nach Beendigung derselben Festvorstellung im Königl. Hoftheater stattfinden. Morgen ist große Tafel in der Wilhelms, nac

In [27]:
article

Unnamed: 0,doc_id,doc_date,placename,day,day2,month,month2,origin_year,start,end,origin_date,delta,doc_year
95262,130635,1856-06-11,Berlin,19,,6,,,14,33,1856-06-07,4,1856
95263,130635,1856-06-11,Berlin,20,,6,,,798,816,1856-06-08,3,1856
95264,130635,1856-06-11,Wiesbaden,17,,6,,,1079,1100,1856-06-05,6,1856
95265,130635,1856-06-11,Darmstadt,17,,6,,,1374,1395,1856-06-05,6,1856
95266,130635,1856-06-11,Stuttgart,17,,6,,,1885,1906,1856-06-05,6,1856
95267,130635,1856-06-11,Stuttgart,18,,6,,,2046,2067,1856-06-06,5,1856


In [12]:
def get_topic_examples2(top, n, savepath=None):
    
    examples = t2v.search_documents_by_topic(top, n, reduced=True)[2]
    
    msg_ids = [int(ex.split('_')[0]) for ex in examples]
    doc_ids = [int(ex.split('_')[1]) for ex in examples]
        
    example_texts = []
    
    for msg_id, doc_id in zip(msg_ids, doc_ids):
        article = df.loc[df.doc_id == doc_id]
        
        if len(article) == 1 or article.index[-1] == msg_id:
            span_start = article.start.values[0]
            span_end = -1

        elif len(article) > 1:
            span_start = article.loc[msg_id, 'start']
            span_end = article.loc[msg_id+1, 'start']
            print(span_start, span_end)
            
        example_texts.append(
                                {"msg_id": msg_id,
                                 "doc_id": doc_id,
                                 "date": rz.loc[doc_id, 'date'],
                                 "heading": rz.loc[doc_id, 'heading'],
                                 "text": rz.loc[doc_id, 'full_text'][span_start:span_end]
                                }
                            )
        
    if savepath:
        with open(savepath+f'\\examples_{top}.json', 'w', encoding='utf8') as f:
            json.dump(example_texts, f)
    else:
        return example_texts

In [117]:
for entry in get_topic_examples(0, 14):
    print(entry['text'])
    print('-------------------')

	Pest, 21. (9.) Januar. Der „Pester Lloyd" äußert sich heute in bemerkenswerther Weise über die orientalische Frage. Im Gegensatze zu Wien, wo man über jeden den Stab bricht, der nicht in der Türkei Alles gut und recht findet, wünscht man in Pest mit den Südslawischen Völkerschaften in gutem Einvernehmen zu bleiben. Das Wort Andrassy's, die Delegationsinstitution sei auf jeden anderen Staat anwendbar, der mit Ungarn in nähere Verbindung treten wolle, ließ eine speciellere Deutung auf den Südosten zu. Den Serbischen osficiösen Blättern erwidert der „Pester Lloyd": „In dem Augenblicke, als „Jedinstvo" es offen ausspricht, daß „auch Serbiens Interesse, ja sein Bedürfniß die Consolidirung aller Länder der Stephanskrone erheische", ist für das gegenseitige Einverständniß eine feste Basis gewonnen; es ist damit aber auch der wesentliche Unterschied zwischen der Serbischen und der Rumänischen Politik klar gekennzeichnet. Die Herren, die in Rumänien das große Wort führen, haben nicht nur ein s

In [107]:
list(simple_segmentation(test_df, rz))

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 285.20it/s]


[{'msg_id': 226712,
  'doc_id': 275983,
  'placename': 'St. Petersburg',
  'text': 'Durch das Urtheil aller in Rußland bestehende« artenbaugesellschaften, sowie einer Anzahl land-\n\n\twirthfchastlichen Gesellschaften und Specialisten aus dem Gebiete de» Gartenbaue» ist da» Factum erwiesen, daß die Gründung einer höheren Special-Lehranstalt für den Garten- und W e i n b o u in Rußland absolut nothwendig ist. DaS Domainenministerium nahm hieran» Beranlassung zu einer Umfrage welche Maßregel« etwa zu ergreifen wären, um den Gartenbau und die Weingärtnerei in Rußland auf eiue höhere Stufe zu heben. Die einlaufenden Antworten lautete» übereinstimmend dahin, daß eine Lehranstalt zu begründe« sei. welche wissenschaftlich gebildete Lehrer für die Garteybauschulen und Leiter der Krön»gartenanlagen heranbilden könne. Gegenwärtig ist denn auch ein Project, die Errichtung einer höheren Lehranstalt sür Garten- «nd Weinbau betreffend, ausgearbeitet worden, welches dem Reichsrath vor« gestellt werde