# This notebook aims at preparing the data for tableau

The intended visualization is a dual-axis bar and line chart. The bars represent the difference between consecutive five-year average Singapore temperatures, highlighting the fact that this difference tends to be positive—in other words, that Earth's temperature is consistently rising. The plot is inpsired by a famous plot of the GIEC (see project report)
 
The line chart displays the average temperature over half-decades. The 0°C reference is omitted, as the focus is on temperature evolution rather than absolute values.

## Importing the libraries

In [2]:
import json
from urllib.parse import unquote

## Importing data from `categories.tsv`

In [None]:
groups = ["B_H_1500H", "B_H", "H"]

def createBranch(groups):
    if len(groups) == 1:
        return {"name": groups.pop(), "size": 1}
    while len(groups) > 0:
        sub_dict = {"name": groups.pop(), "children": [createBranch(groups)]}
    
    return sub_dict

sub_dict = createBranch(groups)
print(sub_dict)

{'name': 'H', 'children': [{'name': 'B_H', 'children': [{'name': 'B_H_1500H', 'size': 1}]}]}


In [13]:
def getSubGroups():
    all_subjects = []
    with open('Dataset/Wikipedia/wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/categories.tsv', 'r') as f:
        line = f.readlines()

    for l in line:
        elem = l.split('\t')
        entry = elem[0]
        entry = unquote(entry)

        if len(elem) > 1:
            subjects = elem[1].split('.')
            subjects_list = []
            for s in subjects[1:]:
                if s[-1] == '\n':
                    s = s[:-1]
                subjects_list.append(s)
            all_subjects.append(subjects_list)

    return all_subjects

all_subjects = getSubGroups()

In [9]:
all_subjects 

[['History',
  'British_History',
  'British_History_1500_and_before_including_Roman_Britain'],
 ['People', 'Historical_figures'],
 ['Countries'],
 ['Geography', 'European_Geography', 'European_Countries'],
 ['People', 'Artists'],
 ['Geography', 'European_Geography', 'European_Countries'],
 ['History',
  'British_History',
  'British_History_1500_and_before_including_Roman_Britain'],
 ['People', 'Historical_figures'],
 ['Business_Studies', 'Currency'],
 ['History', 'General_history'],
 ['History', 'General_history'],
 ['History', 'General_history'],
 ['History', 'General_history'],
 ['History', 'General_history'],
 ['History', 'Military_History_and_War'],
 ['History', 'General_history'],
 ['Science', 'Physics', 'Space_Astronomy'],
 ['Science', 'Physics', 'Space_Astronomy'],
 ['History', 'General_history'],
 ['History', 'General_history'],
 ['History', 'General_history'],
 ['Everyday_life', 'Sports_events'],
 ['History', 'General_history'],
 ['Geography', 'Storms'],
 ['History', 'Recent

In [15]:
def insert_branch(root, branch):
    """ Insère une branche dans l'arbre """
    node = root
    for i, name in enumerate(branch):
        # Vérifier si le nœud existe déjà
        if "children" in node:
            found = next((child for child in node["children"] if child["name"] == name), None)
        
        if found:
            node = found
        else:
            # Si on est à la feuille, on ajoute "size"
            new_node = {"name": name}
            if i == len(branch) - 1:
                new_node["size"] = 1  # Peut être ajusté selon les besoins
            else:
                new_node["children"] = []
            node["children"].append(new_node)
            node = new_node

def build_tree(branches):
    """ Construit un arbre à partir d'une liste de branches """
    root = {"name": "root", "children": []}  # Racine virtuelle
    for branch in branches:
        insert_branch(root, branch)
    return root

# Exemple de données
branches = [
    ["History", "British_History", "British_History_1500_and_before_including_Roman_Britain"],
    ["History", "European_History", "French_Revolution"],
    ["History", "British_History", "Modern_British_History"]
]

# Construction de l'arbre
tree = build_tree(all_subjects)

# Suppression de la racine virtuelle si besoin
if len(tree["children"]) == 1:
    tree = tree["children"][0]

# Sauvegarde en JSON
with open("./D3Viz/netviz/src/assets/Tree.json", "w", encoding="utf-8") as f:
    json.dump(tree, f, indent=4)

print(json.dumps(tree, indent=4))

{
    "name": "root",
    "children": [
        {
            "name": "History",
            "children": [
                {
                    "name": "British_History",
                    "children": [
                        {
                            "name": "British_History_1500_and_before_including_Roman_Britain",
                            "size": 1
                        },
                        {
                            "name": "British_History_15001750",
                            "size": 1
                        },
                        {
                            "name": "British_History_17501900",
                            "size": 1
                        },
                        {
                            "name": "British_History_Post_1900",
                            "size": 1
                        }
                    ]
                },
                {
                    "name": "General_history",
                    "size": 1
       

In [None]:
import copy
all_subjects_copy = copy.deepcopy(all_subjects)

def compareBranchLength(b1, b2):
    return b1["size"] - b2["size"] > 0



def fuseDicts(d):
    nb_cat = 1
    id = d[0]["name"]
    cat = {id:{"name": id, "index": 0, "branch": copy.deepcopy(d[0])}}

    for i,elem in enumerate(d[1:]):
        if elem["name"] == id:
            for child in elem["children"]:
                d[0]["children"].append(child)
        
        elif elem["name"] not in cat.keys():
            nb_cat += 1
            cat[elem["name"]] = {"name": elem["name"], "index": i+1, "branch": copy.deepcopy(elem)}
        
        elif compareBranchLength(elem, cat[id]["branch"]):
            cat[id]["branch"] = elem
            cat[id]["index"] = i+1

    index_list = []
    for name in cat.keys():
        ind = cat[name]["index"]   
        index_list.append(ind)
        if "children" in d[ind]:
            for i,elem in enumerate(d[ind:]):
                if elem["name"] == name:
                    for child in elem["children"]:
                        d[ind]["children"].append(child)



    all_1 = [copy.deepcopy(d[i]) for i in index_list]
    return all_1

all_1 = fuseDicts(all_subjects_copy["children"])

MemoryError: 

In [94]:
all_1

[{'name': 'History',
  'children': [{'name': 'British_History',
    'children': [{'name': 'British_History_1500_and_before_including_Roman_Britain',
      'size': 1}]},
   [{'name': 'British_History',
     'children': [{'name': 'British_History_1500_and_before_including_Roman_Britain',
       'size': 1}]}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'Military_History_and_War', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'Recent_History', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'name': 'General_history', 'size': 1}],
   [{'na

In [100]:
for i in range(10):
    print(all_1[0]["children"][i])

{'name': 'British_History', 'children': [{'name': 'British_History_1500_and_before_including_Roman_Britain', 'size': 1}]}
[{'name': 'British_History', 'children': [{'name': 'British_History_1500_and_before_including_Roman_Britain', 'size': 1}]}]
[{'name': 'General_history', 'size': 1}]
[{'name': 'General_history', 'size': 1}]
[{'name': 'General_history', 'size': 1}]
[{'name': 'General_history', 'size': 1}]
[{'name': 'General_history', 'size': 1}]
[{'name': 'Military_History_and_War', 'size': 1}]
[{'name': 'General_history', 'size': 1}]
[{'name': 'General_history', 'size': 1}]
