In [93]:
import os
import copy
from itertools import chain
import json
import pandas as pd
import plotly.graph_objects as go

In [332]:
reductions = [12, 36, 72]

In [333]:
hierarchy_df = pd.read_csv('../data/topics/topic_hierarchy.tsv', sep='\t', encoding='utf8')

In [334]:
def get_sizes(reductions):
    
    sizes = []
    
    for r in reductions:
        with open(f'../data/topics/reduction_{r}/sizes.json', 'r', encoding='utf8') as f:
            reduction_sizes = json.load(f)
            sizes += reduction_sizes
            
    return dict(zip(range(sum(reductions)), sizes))

In [335]:
sizes = get_sizes(reductions)

In [337]:
def get_labels(reduction):
    
    labels = []
    
    for r in reductions:
        with open(f'../data/topics/reduction_{r}/default_labels.json', 'r', encoding='utf8') as f:
            reduction_labels = json.load(f)
            labels += reduction_labels
            
    return dict(zip(range(sum(reductions)), labels))

In [338]:
labels = get_labels(reductions)

In [316]:
def get_hierarchies(reductions):

    all_hierarchies = []

    for r in reductions:
        with open(f'../data/topics/reduction_{str(r)}/reduction_hierarchy.json', 'r', encoding='utf8') as f:
            hierarchies = json.load(f)
            all_hierarchies.append(hierarchies)

    return all_hierarchies

In [340]:
hierarchies = get_hierarchies(reductions)

In [341]:
def create_hierarchy_df(hierarchies):
    
    levels = copy.deepcopy(hierarchies)
    nodes_flat = dict(enumerate(list(chain(*levels))))
    nodes = []
    
    # makes a list of a dictionary for each level with unique id for each topic
    for level in levels:
        nodes.append(dict(list(nodes_flat.items())[len(list(chain(*nodes))):len(list(chain(*nodes)))+len(level)]))
        
    paths = [[key] for key in nodes[-1].keys()]
    #print(len(paths))

    # starts from the penultimate level and looks if the topic is a subset of anything in that level
    for path in paths:
        for level in nodes[-2::-1]:
            last_subtopic = set(nodes[len(nodes)-len(path)][path[-1]])
            for root_key, root_topic in level.items():
                if last_subtopic.issubset(set(root_topic)):
                    path.append(root_key)
    
    paths = [path[::-1] for path in paths]
    
    #return paths
    
    hierarchy_df = pd.DataFrame(paths)#.apply(lambda x: x.sort_values().values)
    hierarchy_df.columns = [f'reduction_{len(level)}' for level in nodes]
    
    return hierarchy_df

In [342]:
hierarchy_df = create_hierarchy_df(hierarchies)

In [375]:
def create_sankey_link(hierarchy_df, sizes, labels):
    
    
    
   # node = dict(
   #   pad = 15,
   #   thickness = 20,
   #   line = dict(color = "black", width = 0.5),
   #   label = ["A1", "A2", "B1", "B2", "C1", "C2"],
   #   color = "blue"
   # 
    
    
    source = []
    target = []
    value  = []
    
    for source_col, target_col in zip(hierarchy_df.columns[0:-1], hierarchy_df.columns[1:]):
        #print('\n', size_list.keys())
        unique_combinations = hierarchy_df[[source_col, target_col]].drop_duplicates()
        sources = list(unique_combinations[source_col])
        targets = list(unique_combinations[target_col])
        
        source += sources
        target += targets
        
    value = [sizes[top] for top in target]
    
    assert len(source) == len(target) == len(value)
        
    link = dict(source=source, target=target, value=value)
    
    return link

In [376]:
link = create_sankey_link(hierarchy_df, sizes, labels)
node = dict(label=[str(key)+'_'+value for key, value in labels.items()],
            pad=5,
            thickness=100)
sankey_data = go.Sankey(link=link, node=node)

fig = go.Figure(sankey_data)

In [377]:
hierarchy_df[['reduction_12', 'reduction_36']].drop_duplicates().sort_values(by='reduction_12')

Unnamed: 0,reduction_12,reduction_36
56,0,36
2,0,24
6,0,19
8,0,13
10,0,39
20,0,43
38,1,29
34,1,44
33,1,31
7,1,23


In [378]:
fig.update_layout(
    autosize=False,
    width=1000,
    height=1500)
fig.show()

In [395]:
sizes[119]

1917

In [392]:
labels[12]

'kronprinzessin-herrschaften-großherzogin-schlosse-heiterer'

In [393]:
sizes[12]

9497

In [389]:
print(labels[12])
print(labels[21])
print(sizes[12])
print(sizes[21])
print(sum([sizes[12], sizes[21]]))

kronprinzessin-herrschaften-großherzogin-schlosse-heiterer
ischl-schonbrunn-metternich-hofburg-erzherzoge
9497
7894
17391


In [361]:
for root_top in hierarchy_df['reduction_12']:
    subtopics = hierarchy_df.loc[(hierarchy_df.reduction_12 == root_top), 'reduction_36'].values
    print(root_top, ': ', subtopics)
    print(sum(sizes[top] for top in subtopics) == sizes[root_top])

3 :  [12 12 21 21 12 12 12]
False
2 :  [15 38 38]
False
0 :  [24 19 13 39 43 13 13 19 36 36]
False
4 :  [25 27 37 27]
False
7 :  [18 14 14 18 14]
False
7 :  [18 14 14 18 14]
False
0 :  [24 19 13 39 43 13 13 19 36 36]
False
1 :  [23 26 35 31 44 31 29 26 44 29 29]
False
0 :  [24 19 13 39 43 13 13 19 36 36]
False
9 :  [20 20 20]
False
0 :  [24 19 13 39 43 13 13 19 36 36]
False
4 :  [25 27 37 27]
False
11 :  [28 40 46]
False
1 :  [23 26 35 31 44 31 29 26 44 29 29]
False
11 :  [28 40 46]
False
4 :  [25 27 37 27]
False
8 :  [17 17 22 22 22]
False
6 :  [42 30 33 45 33 30 45 30]
False
8 :  [17 17 22 22 22]
False
5 :  [16 41 16 34 16 34 16 16 16]
False
0 :  [24 19 13 39 43 13 13 19 36 36]
False
1 :  [23 26 35 31 44 31 29 26 44 29 29]
False
3 :  [12 12 21 21 12 12 12]
False
6 :  [42 30 33 45 33 30 45 30]
False
0 :  [24 19 13 39 43 13 13 19 36 36]
False
0 :  [24 19 13 39 43 13 13 19 36 36]
False
8 :  [17 17 22 22 22]
False
11 :  [28 40 46]
False
5 :  [16 41 16 34 16 34 16 16 16]
False
5 :  [16 41

In [351]:
hierarchy_df.loc[hierarchy_df.reduction_36==12]

Unnamed: 0,reduction_12,reduction_36,reduction_72
0,3,12,48
22,3,12,70
41,3,12,89
51,3,12,99
55,3,12,103


In [328]:
sum([sizes[str(i)] for i in hierarchy_df.loc[hierarchy_df.reduction_12==1, 'reduction_36'].values])

66685

In [329]:
sizes[str(1)]

26212

In [68]:
with open('../data/topics/reduction_12/reduction_hierarchy.json', 'r', encoding='utf8') as f:
    hierarchy_12 = json.load(f)
    
with open('../data/topics/reduction_36/reduction_hierarchy.json', 'r', encoding='utf8') as f:
    hierarchy_36 = json.load(f)

In [60]:
len(link['value'])

48

In [8]:
t2v = Top2Vec.load('../data/models/t2v_211122_100_deep.pkl')

In [None]:
t2v.hierarchical_topic_reduction

In [4]:
def get_labels(t2v, n):
    
    labels = []
    
    for top_words in t2v.get_topics(reduced=True)[0]:
        labels.append('-'.join(top_words[:n]))
        
    return labels

In [5]:
def get_hierarchies_and_labels(t2v, reductions):
    
    hierarchies = []
    labels = []
    
    for r in reductions:
        print(f'Reducing to {r}')
        t2v.hierarchical_topic_reduction(r)
        hierarchies.append(t2v.get_topic_hierarchy())
        labels.append(get_labels(t2v, 3))
        
    return hierarchies, labels

In [7]:
#hierarchies, labels = get_hierarchies_and_labels(t2v, [10, 30, 50])

In [3]:
with open('../temp/topic_hierarchy_5.json' ,'r') as f:
    hierarchy_5 = json.load(f)
    
with open('../temp/topic_hierarchy_20.json' ,'r') as f:
    hierarchy_20 = json.load(f)
    
with open('../temp/topic_hierarchy_50.json' ,'r') as f:
    hierarchy_50 = json.load(f)

In [4]:
hierarchy = [hierarchy_5, hierarchy_20, hierarchy_50]

In [24]:
# viimasele kihile id
# kui eelmine kiht sisaldab kõiki, siis on juur
# dataframe

In [46]:
def create_hierarchy_df(t2v, hierarchies, labels=None, colors=None):
    
    levels = copy.deepcopy(hierarchies)
    topic_dict = {}
    sizes = {}
    
    nodes_flat = dict(enumerate(list(chain(*levels))))
    nodes = []
    
    # makes a list of a dictionary for each level with unique id for each topic
    for level in levels:
        nodes.append(dict(list(nodes_flat.items())[len(list(chain(*nodes))):len(list(chain(*nodes)))+len(level)]))
        
    paths = [[key] for key in nodes[-1].keys()]

    # starts from the penultimate level and looks if the topic is a subset of anything in that level
    for path in paths:
        for level in nodes[-2::-1]:
            last_subtopic = set(nodes[len(nodes)-len(path)][path[-1]])
            for root_key, root_topic in level.items():
                if last_subtopic.issubset(set(root_topic)):
                    path.append(root_key)
            
    paths = [path[::-1] for path in paths]
    
    link_df = pd.DataFrame(paths).apply(lambda x: x.sort_values().values)
    link_df.columns = [f'reduction_{len(level)}' for level in nodes]
    
    return link_df
        
            
link_df = create_link_df(t2v, hierarchy)

In [51]:
link_df.to_csv('../temp/link_df.tsv', sep='\t', encoding='utf8', index=False)

In [52]:
loaded_link = pd.read_csv('../temp/link_df.tsv', sep='\t', encoding='utf8')

In [55]:
reductions = [12, 36, 72]

In [68]:
{top+previous_reduction: '-'.join(top_words[:3]) for top, top_words in enumerate(list(t2v.get_topics()[0][:10]))}

{12: 'herrschaften-herrschasten-hoheiten',
 13: 'madrider-concha-serrano',
 14: 'pforte-machte-psorte',
 15: 'tagesbesehls-tagesbefehls-schwerdtern',
 16: 'unterhause-unterhaus-bill',
 17: 'ortlichen-nowosti-birsh',
 18: 'fortschrittler-culturkampf-freifinnigen',
 19: 'cardinalen-pius-papst',
 20: 'bonapartisten-gambetta-fourtou',
 21: 'lesung-gesetzentwurfs-amendements'}

In [66]:
{top+previous_reduction: int(size) for top, size in enumerate(list(t2v.get_topic_sizes()[0][:10]))}

{12: 4700,
 13: 4609,
 14: 3607,
 15: 3448,
 16: 3361,
 17: 3294,
 18: 3140,
 19: 3020,
 20: 2944,
 21: 2909}

In [62]:
labels = {}
previous_reduction = 12
    
for top, top_words in enumerate(t2v.get_topics()[0]):
    labels[top+previous_reduction] = '-'.join(top_words[:3])
    
json.dumps(labels)

'{"12": "herrschaften-herrschasten-hoheiten", "13": "madrider-concha-serrano", "14": "pforte-machte-psorte", "15": "tagesbesehls-tagesbefehls-schwerdtern", "16": "unterhause-unterhaus-bill", "17": "ortlichen-nowosti-birsh", "18": "fortschrittler-culturkampf-freifinnigen", "19": "cardinalen-pius-papst", "20": "bonapartisten-gambetta-fourtou", "21": "lesung-gesetzentwurfs-amendements", "22": "corvetten-kanonenboten-nargen", "23": "zollvereins-zolleinigung-darmstadter", "24": "mehmed-syrien-egypt", "25": "buhne-kunstler-kunst", "26": "verurtheilte-freigesprochen-verurtheilt", "27": "eidgenossischen-waadt-kantonen", "28": "kabul-kandahar-afghanistan", "29": "tumultuanten-ruhestorer-volkshaufen", "30": "eupatoria-kamiesch-galacz", "31": "vaterlande-vaterlandes-vorsehung", "32": "france-tuilerien-pays", "33": "tennessee-sherman-grant", "34": "pistol-morder-pistole", "35": "croatischen-ungarn-leitha", "36": "boulevards-tuilerieen-notre", "37": "palermo-calabrien-catania", "38": "piraus-patras

In [33]:
with open('../streamlit/data/topics/reduction_15/sizes.json', 'r', encoding='utf8') as f:
    sizes_15 = json.load(f)
    
with open('../streamlit/data/topics/reduction_30/sizes.json', 'r', encoding='utf8') as f:
    sizes_30 = json.load(f)
    
with open('../streamlit/data/topics/reduction_60/sizes.json', 'r', encoding='utf8') as f:
    sizes_60 = json.load(f)

In [23]:
link[['reduction_5', 'reduction_20']].drop_duplicates()

Unnamed: 0,reduction_5,reduction_20
0,0,5
3,0,6
7,0,7
10,0,8
12,1,9
15,1,10
20,1,11
22,1,12
23,2,12
24,2,13


In [19]:
link[['reduction_5', 'reduction_20']].value_counts().reset_index(name='count').apply(lambda x: x.sort_values().values)

Unnamed: 0,reduction_5,reduction_20,count
0,0,5,1
1,0,6,1
2,0,7,1
3,0,8,1
4,1,9,2
5,1,10,2
6,1,11,2
7,1,12,2
8,2,12,2
9,2,13,2


In [15]:
list(zip(link.columns[0:-1], link.columns[1:], [1,2,3]))

[('reduction_5', 'reduction_20', 1), ('reduction_20', 'reduction_50', 2)]

In [None]:
fig = go.sankey(data=)

In [13]:
list(link.reduction_5)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4]