# Testing HotVis on the wiki paths

In [1]:
import csv
from HotVisFunctions import *
import pandas as pd
import numpy as np
import torch_geometric

In [2]:

def split_path_data(data: pp.PathData, training_percentile: float):
    """Randomly partition the walks of ``data`` into a training and a test set.

    One uniform random number is drawn per walk with ``torch.rand``. A walk is
    appended to the training set when its draw is ``<= training_percentile``
    and to the test set otherwise. Both result objects are constructed from
    ``data.mapping``.

    Args:
        data: Path data whose walks (indices ``0 .. data.num_paths - 1``) are
            distributed over the two result sets.
        training_percentile: Threshold compared against each random draw.
            Despite the name it acts as a fraction (e.g. ``0.7``), because the
            draws lie in ``[0, 1)``.

    Returns:
        Tuple ``(training_data, test_data)`` of ``pp.PathData`` objects.

    Note:
        No random seed is set here, so the split differs between runs unless
        the caller seeds torch beforehand.
    """
    train_part = pp.PathData(data.mapping)
    test_part = pp.PathData(data.mapping)
    draws = torch.rand(data.num_paths)
    for walk_idx in range(data.num_paths):
        goes_to_training = bool(draws[walk_idx] <= training_percentile)
        destination = train_part if goes_to_training else test_part
        # Only the result of get_walk() is forwarded.
        # NOTE(review): if walks can carry non-default weights, confirm whether
        # they need to be passed to append_walk as well.
        destination.append_walk(data.get_walk(walk_idx))
    return train_part, test_part


In [6]:
def from_ngram(file: str, sep: str = ",") -> pp.PathData:
    """Read an n-gram file into a ``pp.PathData`` object.

    Each line of the file is one path: node identifiers joined by ``sep``.
    Lines are stripped of surrounding whitespace before splitting, and lines
    that yield fewer than two fields are skipped. Every kept path is added as
    a walk with weight ``1.0``.

    Args:
        file: Path of the UTF-8 encoded n-gram file.
        sep: Separator between node identifiers within a line.

    Returns:
        A ``pp.PathData`` built on a ``pp.IndexMap`` that was filled with the
        node identifiers found in the kept paths.

    Raises:
        ValueError: If the file contains no line with at least two fields.
    """
    paths = []
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            # Split once per line; the length filter and the stored path
            # share the same result.
            nodes = line.strip().split(sep)
            if len(nodes) > 1:
                paths.append(nodes)

    if not paths:
        # np.concatenate rejects an empty list with a message that does not
        # mention the input file, so fail early with a descriptive error.
        raise ValueError(f"no paths with at least two nodes found in {file!r}")

    weights = [1.0] * len(paths)

    mapping = pp.IndexMap()
    mapping.add_ids(np.concatenate([np.array(path) for path in paths]))

    pathdata = pp.PathData(mapping)
    pathdata.append_walks(node_seqs=paths, weights=weights)

    return pathdata

In [7]:
# load wiki
wiki = from_ngram("graphs/Wikipedia/paths_finished.ngram", sep=";")
print(wiki.get_walk(1))


('14th_century', 'Europe', 'Africa', 'Atlantic_slave_trade', 'African_slave_trade')


In [8]:
# Build a multi-order model from the full path data. The second argument (2) is
# presumably the maximum order — confirm against the library's API.
mo_model = pp.MultiOrderModel.from_PathData(wiki, 2)

In [None]:
#wiki, wiki = split_path_data(wiki, 0.7)

In [None]:
print(wiki.num_paths)
print(wiki.num_paths)
print(wiki.num_paths/wiki.num_paths)
print(wiki.num_paths/wiki.num_paths)

35996
15311
0.7015806809986941
0.29841931900130586


In [11]:
# Print the node-id -> index mapping attached to the path data.
print(wiki.mapping)

%C3%85land -> 0
%C3%89douard_Manet -> 1
%C3%89ire -> 2
%E2%82%AC2_commemorative_coins -> 3
10th_century -> 4
11th_century -> 5
12th_century -> 6
13th_century -> 7
14th_century -> 8
15th_Marine_Expeditionary_Unit -> 9
15th_century -> 10
16_Cygni -> 11
16_Cygni_Bb -> 12
16th_century -> 13
1755_Lisbon_earthquake -> 14
17th_century -> 15
1896_Summer_Olympics -> 16
18th_century -> 17
1928_Okeechobee_Hurricane -> 18
1973_oil_crisis -> 19
1980_eruption_of_Mount_St._Helens -> 20
19th_century -> 21
1_Ceres -> 22
1st_century -> 23
1st_century_BC -> 24
2-6-0 -> 25
2-8-0 -> 26
2003_Atlantic_hurricane_season -> 27
2004_Atlantic_hurricane_season -> 28
2004_Indian_Ocean_earthquake -> 29
2005_Atlantic_hurricane_season -> 30
2005_Hertfordshire_Oil_Storage_Terminal_fire -> 31
2005_Kashmir_earthquake -> 32
2005_Sumatra_earthquake -> 33
20th_century -> 34
21st_century -> 35
2nd_century -> 36
3_Juno -> 37
3rd_century -> 38
4-2-0 -> 39
4-4-0 -> 40
4-6-0 -> 41
47_Ursae_Majoris -> 42
47_Ursae_Majoris_b -> 43


In [None]:
# NOTE(review): this repeats the earlier `mo_model` cell verbatim on `wiki`. The
# ValueError recorded below suggests it was originally run on a different
# PathData object (presumably the training split) — confirm the intended input.
mo_model = pp.MultiOrderModel.from_PathData(wiki, 2)

ValueError: 'EdgeIndex' contains larger indices than its number of rows (got 4169, but expected values smaller than 4142)

In [None]:
# Compute three HotVis layouts from the training split. The second argument
# (2, 3, 5) is presumably the order — confirm in HotVisFunctions; each call
# passes an `alpha` list with exactly that many entries. The arguments 50000, 3
# and force=10 are identical across the three calls.
# NOTE(review): requires `wiki_train` to exist in the kernel; make sure the
# train/test split cell has been run first.
layout_2 = HotVis(wiki_train, 2, 50000, 3, alpha=[1, 0.5], force=10)
layout_3 =  HotVis(wiki_train, 3, 50000, 3, alpha=[1, 0.5, 0.3], force=10)
layout_5 =  HotVis(wiki_train, 5, 50000, 3, alpha=[1, 0.5, 0.3, 0.25, 0.2], force=10)

ValueError: 'EdgeIndex' contains larger indices than its number of rows (got 4169, but expected values smaller than 4144)

In [None]:
# Edge-crossing value of each layout, printed in the order layout_2, layout_3, layout_5.
for layout in (layout_2, layout_3, layout_5):
    print(edge_crossing(wiki, layout))

93
55
95


In [None]:
# Causal path dispersion of each layout (third argument fixed to 3),
# printed in the order layout_2, layout_3, layout_5.
for layout in (layout_2, layout_3, layout_5):
    print(causal_path_dispersion(wiki, layout, 3))

tensor(6.0667, dtype=torch.float64)
tensor(5.7582, dtype=torch.float64)
tensor(5.3959, dtype=torch.float64)


In [None]:
# Closeness eccentricity of each layout (trailing arguments fixed to 3 and 0.1),
# printed in the order layout_2, layout_3, layout_5.
for layout in (layout_2, layout_3, layout_5):
    print(closeness_eccentricity(wiki, layout, 3, 0.1))