In [1]:
!pip install  bs4 requests pandas lxml



In [2]:
import re
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [3]:

def load_taxanomy(url: str = 'https://arxiv.org/category_taxonomy',
                  timeout: float = 30.0) -> dict:
    """ load ArXiv taxonomy from https://arxiv.org/category_taxonomy

    The headings inside the ``category_taxonomy_list`` element are walked in
    document order: every ``h2`` opens a group, every ``h3`` opens an archive
    written as ``Name (code)``, every ``h4`` opens a category written as
    ``code (Name)`` and every ``p`` that follows an ``h4`` is stored as the
    description of that category. A group without any ``h3`` gets a single
    archive whose name and id both equal the group name.

    :param url: address of the taxonomy page
    :type url: str
    :param timeout: seconds to wait for the server before giving up
    :type timeout: float
    :return: groups, archives and categories
    :rtype: dict
    :raises requests.HTTPError: if the server answers with an error status
    :raises ValueError: if the page has no ``category_taxonomy_list`` element
    """

    # Fail fast on network problems instead of hanging or parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    root = soup.find('div', {'id': 'category_taxonomy_list'})
    if root is None:
        raise ValueError(
            "element with id 'category_taxonomy_list' not found in " + url)
    tags = root.find_all(["h2", "h3", "h4", "p"], recursive=True)

    label_pattern = re.compile(r"(.*)\((.*)\)")

    def split_label(raw):
        """Split ``left (right)`` into its two parts, both stripped.

        When the text has no parenthesised part, the whole text is returned
        for both parts.
        """
        match = label_pattern.search(raw)
        if match is None:
            return raw, raw
        return match.group(1).strip(), match.group(2).strip()

    group_name = ""
    archive_code = ""
    archive_name = ""
    # None means "no category heading seen yet under the current archive".
    category_code = None
    category_name = None

    archive_rows = []   # {inGroup, name, id}, one per recorded description
    category_rows = []  # {inArchive, name, id, description}

    for tag in tags:
        if tag.name == "h2":
            group_name = archive_code = archive_name = tag.text.strip()
            category_code = category_name = None
        elif tag.name == "h3":
            archive_name, archive_code = split_label(tag.text.strip())
            category_code = category_name = None
        elif tag.name == "h4":
            category_code, category_name = split_label(tag.text.strip())
        elif tag.name == "p":
            if category_code is None:
                # A paragraph that is not preceded by a category heading is
                # not a category description, so it is skipped.
                continue
            archive_rows.append({
                'inGroup': group_name,
                'name': archive_name,
                'id': archive_code,
            })
            category_rows.append({
                'inArchive': archive_name,
                'name': category_name,
                'id': category_code,
                'description': tag.text,
            })

    # dict.fromkeys de-duplicates while keeping the order used on the page.
    groups = [
        {"name": name}
        for name in dict.fromkeys(row['inGroup'] for row in archive_rows)
    ]

    # Explicit columns keep the frames well-formed even when no row was found.
    archives = (
        pd.DataFrame(archive_rows, columns=['inGroup', 'name', 'id'])
        .drop_duplicates(ignore_index=True)
        .to_dict(orient="records")
    )
    categories = (
        pd.DataFrame(category_rows,
                     columns=['inArchive', 'name', 'id', 'description'])
        .drop_duplicates(ignore_index=True)
        .to_dict(orient="records")
    )

    return {"groups": groups, "archives": archives, "categories": categories}

# Scrape the taxonomy once; the tree-building cell further down reads
# data["archives"] and data["categories"].
data = load_taxanomy()

  soup = BeautifulSoup(response, 'lxml')


# Data example: tree node format (`title`, `key`, `children`)

```
  {
    title: 'parent 1',
    key: '0-0',
    children: [
      {
        title: 'parent 1-0',
        key: '0-0-0',
        children: [
          {
            title: 'leaf',
            key: '0-0-0-0',
          },
        ],
      },
    ],
  }
```

In [19]:
import collections
import json

# Index the flat records: archives by the name of their group, categories by
# the name of their archive.
tree_group_archive = collections.defaultdict(list)
for item in data["archives"]:
    tree_group_archive[item['inGroup']].append(item)

tree_archive_category = collections.defaultdict(list)
for item in data["categories"]:
    tree_archive_category[item['inArchive']].append(item)

# Running counter: every node takes the next value as its unique 'key'.
tree_index = 0

# group name -> group node (level 0) holding archive nodes (level 1), which in
# turn hold category leaves (level 2).
tree_data_dict = {}
for group_name, archive_item_list in tree_group_archive.items():
    group_node = {
        'title': group_name,
        'children': [],
        'key': str(tree_index),
        'level': 0,
    }
    tree_index += 1
    tree_data_dict[group_name] = group_node

    for archive_item in archive_item_list:
        archive_children = []
        # .get() rather than indexing: reading a missing key from a
        # defaultdict would silently insert an empty list into the index.
        for category_item in tree_archive_category.get(archive_item['name'], []):
            archive_children.append({
                'title': category_item['name'],
                'id': category_item['id'],
                'description': category_item['description'],
                'key': str(tree_index),
                'isLeaf': True,
                'level': 2,
            })
            tree_index += 1

        # The archive takes its key after its categories have taken theirs.
        group_node['children'].append({
            'title': archive_item['name'],
            'id': archive_item['id'],
            'children': archive_children,
            'key': str(tree_index),
            'level': 1,
        })
        tree_index += 1

# Write the list of group nodes as JSON.
with open("../app/tree_data.json", "w", encoding="utf-8") as f:
    json.dump(list(tree_data_dict.values()), f)