# This notebook aims at preparing the data for the Hive Plot visualisation

## Importing the libraries

In [None]:
import json
import math
from urllib.parse import unquote

## Retrieving all Wikipedia's articles 

The Wikipedia articles are listed in the file `articles.tsv`; every row contains one URL-encoded article name.

In [2]:
# Read the raw rows of articles.tsv; each row is decoded below as one URL-encoded article name.
with open('Wikipedia/wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/articles.tsv', 'r') as f:
    entries = f.readlines()

# Decode each row, keep only the text before the first newline, and drop duplicates.
# dict.fromkeys keeps first-seen order (same result as the previous list-membership
# check) but de-duplicates in linear instead of quadratic time.
all_entries = list(dict.fromkeys(unquote(row).split('\n')[0] for row in entries))

print(all_entries[:10])

['Áedán_mac_Gabráin', 'Åland', 'Édouard_Manet', 'Éire', 'Óengus_I_of_the_Picts', '€2_commemorative_coins', '10th_century', '11th_century', '12th_century', '13th_century']


## Computing the radius of every node and links between them

The file `paths_finished.tsv` contains the starting and ending articles for every completed Wikispeedia game. The articles visited on the way to the final article are separated by a semicolon ";", so consecutive articles are linked together. Sometimes the player went back to the previous article, which is denoted as "<"; those links are ignored.

In [3]:
with open('Wikipedia/wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/paths_finished.tsv', 'r') as f:
    path = f.readlines()

# One node per article, keyed by article name for direct lookup while counting.
# Every node starts with radius 5 and its own (unshared) "group" list.
data_json = {
    "nodes": {
        entry: {"id": entry, "group": [], "sub_group": '', "radius": 5}
        for entry in all_entries
    },
    "links": [],
}

for line in path:
    chemin = line.split('\t')
    if len(chemin) < 4:
        # No path column on this line (e.g. a blank or comment line): nothing to link.
        continue
    # The 4th column holds the ";"-separated, URL-encoded sequence of visited articles.
    entries_in_path = [unquote(e) for e in chemin[3].split(";")]
    # Walk consecutive pairs; a distinct pair of names avoids reusing the outer loop variable.
    for current, following in zip(entries_in_path, entries_in_path[1:]):
        # "<" marks a back-click; any pair involving it is ignored.
        if current != "<" and following != "<":
            data_json["nodes"][current]["radius"] += 1
            # NOTE(review): "sub_taget" looks like a typo for "sub_target"; kept unchanged
            # because the key is part of the produced data.
            data_json["links"].append({"source": current, "target": following, "sub_source": '', "sub_taget" : ''})

## Retrieving the sub-groups of every group

In [4]:
def getSubGroups(entries=None, categories_path='Wikipedia/wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/categories.tsv'):
    """Map every article to its sub-groups and every group to its sub-groups.

    Each usable row of the categories file is ``<url-encoded article>\\t<category>``
    where the category is a "."-separated hierarchy. The second component is used
    as the group and the third as the sub-group; rows without a tab or with fewer
    than three components are skipped.

    Args:
        entries: Iterable of decoded article names to pre-populate the association
            with. Defaults to the notebook-level ``all_entries``.
        categories_path: Path of the tab-separated categories file.

    Returns:
        A tuple ``(association, all_subjects)``:
        ``association`` maps article name -> list of sub-groups (one item per
        matching row, so repeats are possible), and ``all_subjects`` maps
        group -> list of distinct sub-groups in first-seen order.

    Raises:
        KeyError: If a row names an article that is not in ``entries``.
    """
    if entries is None:
        entries = all_entries

    all_subjects = {}
    association = {entry: [] for entry in entries}

    with open(categories_path, 'r') as f:
        for row in f:
            # Strip the line terminator up front so no component carries a trailing "\n".
            elem = row.rstrip('\n').split('\t')
            if len(elem) < 2:
                continue

            entry = unquote(elem[0])
            subject = elem[1].split('.')
            if len(subject) < 3:
                continue

            group, cat = subject[1], subject[2]
            if group not in all_subjects:
                all_subjects[group] = []
            if not cat:
                # An empty third component used to crash on cat[-1]; it names no sub-group.
                continue
            if cat not in all_subjects[group]:
                all_subjects[group].append(cat)
            association[entry].append(cat)
    return association, all_subjects

# association: article -> list of sub-groups; all_subjects: group -> list of sub-groups.
association, all_subjects = getSubGroups()
# Show the group index first, then the per-article association, one per line.
print(all_subjects, association, sep='\n')

{'History': ['British_History', 'General_history', 'Military_History_and_War', 'Recent_History', 'Ancient_History_Classical_History_and_Mythology', 'Pre_1900_Military', 'World_War_II', 'Archaeology', 'Historians_chroniclers_and_history_books', 'North_American_History'], 'People': ['Historical_figures', 'Artists', 'Sports_and_games_people', 'Geographers_and_explorers', 'Writers_and_critics', 'USA_Presidents', 'Religious_figures_and_leaders', 'Political_People', 'Producers_directors_and_media_figures', 'Mathematicians', 'Astronomers_and_physicists', 'Engineers_and_inventors', 'Military_People', 'Chemists', 'Monarchs_of_Great_Britain', 'Actors_models_and_celebrities', 'Performers_and_composers', 'Philosophers', 'Human_Scientists', 'Computing_People'], 'Geography': ['European_Geography', 'Storms', 'Geology_and_geophysics', 'Natural_Disasters', 'Geography_of_the_Middle_East', 'Geography_of_Great_Britain', 'African_Geography', 'Geography_of_Asia', 'General_Geography', 'Geography_of_Oceania_A

In [5]:
def limitRadius(rad):
    """Clamp a radius to the inclusive range [5, 15].

    Args:
        rad: Radius value to clamp.

    Returns:
        15 when ``rad`` is 15 or more, 5 when ``rad`` is 5 or less,
        otherwise ``rad`` unchanged.
    """
    if rad >= 15:
        return 15
    if rad <= 5:
        return 5
    # In-range values pass through; previously the function fell off the end here
    # and returned None for every radius strictly between 5 and 15.
    return rad

## Computing the radius of nodes and links between nodes

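Nodes are sub-groups; they will be placed on the axis corresponding to the group they belong to. The radius of a node grows with the number of articles in its sub-group: it is the integer part of the square root of 5 plus that count. A link between two sub-groups is created for every article-to-article link whose endpoints fall in sub-groups of different groups.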

In [6]:
# Sub-group level graph: one node per sub-group, keyed by sub-group name.
# If the same name occurs under several groups, the last group wins (as before).
data_json_sub = {
    "nodes": {
        sub: {"id": sub, "group": group, "radius": 5}
        for group, subs in all_subjects.items()
        for sub in subs
    },
    "links": [],
}

# Each article adds 1 to the radius of every sub-group it is associated with.
for article in data_json["nodes"]:
    for sub in association[article]:
        data_json_sub["nodes"][sub]["radius"] += 1

# Compress the counts with a square root, truncated to an integer.
for node in data_json_sub["nodes"].values():
    node["radius"] = int(math.sqrt(node["radius"]))

# Index sub-group -> set of groups listing it, so the "same group?" check below is a
# set operation instead of a scan over every group for every candidate pair.
groups_of = {}
for group, subs in all_subjects.items():
    for sub in subs:
        groups_of.setdefault(sub, set()).add(group)

no_groups = frozenset()
for link in data_json["links"]:
    source_sub = association[link["source"]]
    target_sub = association[link["target"]]
    for s in source_sub:
        for t in target_sub:
            # Keep only links between different sub-groups that share no group.
            if s != t and groups_of.get(s, no_groups).isdisjoint(groups_of.get(t, no_groups)):
                data_json_sub["links"].append({"source": s, "target": t})


print(data_json_sub["nodes"])
print(data_json_sub["links"][:10])

{'British_History': {'id': 'British_History', 'group': 'History', 'radius': 12}, 'General_history': {'id': 'General_history', 'group': 'History', 'radius': 9}, 'Military_History_and_War': {'id': 'Military_History_and_War', 'group': 'History', 'radius': 8}, 'Recent_History': {'id': 'Recent_History', 'group': 'History', 'radius': 7}, 'Ancient_History_Classical_History_and_Mythology': {'id': 'Ancient_History_Classical_History_and_Mythology', 'group': 'History', 'radius': 9}, 'Pre_1900_Military': {'id': 'Pre_1900_Military', 'group': 'History', 'radius': 5}, 'World_War_II': {'id': 'World_War_II', 'group': 'History', 'radius': 6}, 'Archaeology': {'id': 'Archaeology', 'group': 'History', 'radius': 4}, 'Historians_chroniclers_and_history_books': {'id': 'Historians_chroniclers_and_history_books', 'group': 'History', 'radius': 3}, 'North_American_History': {'id': 'North_American_History', 'group': 'History', 'radius': 5}, 'Historical_figures': {'id': 'Historical_figures', 'group': 'People', 'rad

## Computing the value of links

The value of a link corresponds to the number of "sub-links" from one node to another. A sub-group contains many articles, each of which may be linked to articles contained in other nodes.

In [7]:
# Collapse the individual sub-group links into one weighted link per (source, target)
# pair. "value" counts how many individual links were merged into the pair.
dict_links = {}
for e in data_json_sub["links"]:
    source = e["source"]
    target = e["target"]
    k = source + "-" + target
    if k not in dict_links:
        # Source and target are stored directly instead of being parsed back out of
        # the key, so a "-" inside a sub-group name no longer breaks the unpacking.
        dict_links[k] = {"id": k, "source": source, "target": target, "value": 0}
    dict_links[k]["value"] += 1

# Distinct link ids in first-seen order.
keys = list(dict_links)

# The nodes are shared with data_json_sub (same dict object, not a copy).
data_json_sub2 = {"nodes": data_json_sub["nodes"], "links": dict_links}

print(data_json_sub2["links"])

{'General_history-General_Geography': {'id': 'General_history-General_Geography', 'source': 'General_history', 'target': 'General_Geography', 'value': 46}, 'African_Geography-British_History': {'id': 'African_Geography-British_History', 'source': 'African_Geography', 'target': 'British_History', 'value': 123}, 'General_history-European_Geography': {'id': 'General_history-European_Geography', 'source': 'General_history', 'target': 'European_Geography', 'value': 469}, 'General_history-African_Geography': {'id': 'General_history-African_Geography', 'source': 'General_history', 'target': 'African_Geography', 'value': 84}, 'British_History-Animal_and_Human_Rights': {'id': 'British_History-Animal_and_Human_Rights', 'source': 'British_History', 'target': 'Animal_and_Human_Rights', 'value': 32}, 'Animal_and_Human_Rights-African_Geography': {'id': 'Animal_and_Human_Rights-African_Geography', 'source': 'Animal_and_Human_Rights', 'target': 'African_Geography', 'value': 51}, 'Ancient_History_Class

## Processing the links so that they are not too large

In [10]:
def limitLinks(value):
    """Cap a link value at 10.

    Args:
        value: Link value to cap.

    Returns:
        10 when ``value`` is greater than 10, otherwise ``value`` unchanged.
    """
    return 10 if value > 10 else value

# Cap the value of every aggregated link in place.
for link in data_json_sub2["links"].values():
    link["value"] = limitLinks(link["value"])

In [11]:
# Flatten the id-keyed dictionaries into plain lists, keeping insertion order.
# The list items are the same dict objects held by data_json_sub2.
data_json_sub3 = {
    "nodes": list(data_json_sub2["nodes"].values()),
    "links": list(data_json_sub2["links"].values()),
}

print(data_json_sub3["links"][:10])
print(data_json_sub3["nodes"][:10])

[{'id': 'General_history-General_Geography', 'source': 'General_history', 'target': 'General_Geography', 'value': 10}, {'id': 'African_Geography-British_History', 'source': 'African_Geography', 'target': 'British_History', 'value': 10}, {'id': 'General_history-European_Geography', 'source': 'General_history', 'target': 'European_Geography', 'value': 10}, {'id': 'General_history-African_Geography', 'source': 'General_history', 'target': 'African_Geography', 'value': 10}, {'id': 'British_History-Animal_and_Human_Rights', 'source': 'British_History', 'target': 'Animal_and_Human_Rights', 'value': 10}, {'id': 'Animal_and_Human_Rights-African_Geography', 'source': 'Animal_and_Human_Rights', 'target': 'African_Geography', 'value': 10}, {'id': 'Ancient_History_Classical_History_and_Mythology-European_Geography', 'source': 'Ancient_History_Classical_History_and_Mythology', 'target': 'European_Geography', 'value': 10}, {'id': 'European_Geography-Religious_movements_traditions_and_organizations',

In [None]:
# Write the list-based graph (nodes and links) as JSON.
# The path is a plain string literal: the previous f-string had no placeholders.
with open("D3Viz/netviz/src/assets/Hive.json", "w") as outfile:
    json.dump(data_json_sub3, outfile)