In [None]:
import os
import numpy as np
import requests
import re
import json
from bs4 import BeautifulSoup

def get_last_part(url):
    """Return the final '/'-separated segment of ``url``, ignoring trailing slashes."""
    trimmed = url.rstrip('/')
    # Only the rightmost separator matters, so split once from the right.
    return trimmed.rsplit('/', 1)[-1]

def get_tname(url):
    """Build a tournament name by concatenating the last two path segments of ``url``.

    Trailing slashes are ignored and the two segments are joined with no
    separator.  A URL with a single segment yields that segment unchanged.
    """
    trimmed = url.rstrip('/')
    # Two right-splits are enough to isolate the final two segments.
    tail = trimmed.rsplit('/', 2)[-2:]
    return "".join(tail)

def filter_all_subsections(section_text):
    """Split ``section_text`` into subsections of whitespace-stripped lines.

    Blank lines are dropped.  A subsection starts with any non-blank line and
    absorbs every following line that contains a '-'.  The first following
    line without a '-' closes the subsection and opens the next one.

    Returns:
        list[str]: one string per subsection, its lines joined with a newline.
    """
    groups = []
    pending = []  # lines of the subsection currently being collected

    for raw in section_text.split('\n'):
        text = raw.strip()
        if not text:
            continue

        # A dash-less line while a subsection is open closes that subsection.
        if pending and '-' not in raw:
            groups.append("\n".join(pending))
            pending = []

        pending.append(text)

    # Flush whatever is still open at the end of the input.
    if pending:
        groups.append("\n".join(pending))

    return groups

def get_section_by_heading(url, keyword):
    """Fetch ``url`` and return the text of the section under a matching heading.

    The first ``h1``-``h6`` tag whose text contains ``keyword`` as a whole
    word (optionally followed by "s", case-insensitive) marks the start of
    the section.  The text of its following siblings is collected until a
    heading of the same or a higher level (e.g. an ``h2`` after an ``h3``)
    is reached.

    Args:
        url: Address of the page to download.
        keyword: Word to look for in heading text, e.g. "mappool".

    Returns:
        The sibling texts joined with newlines, or ``None`` when no heading
        matches.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    heading_names = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    # Built from the argument; "mappool" yields the pattern \bmappools?\b.
    keyword_re = re.compile(rf"\b{re.escape(keyword)}s?\b", re.IGNORECASE)

    heading = soup.find(
        lambda tag: tag.name in heading_names
        and keyword_re.search(tag.get_text())
    )
    if not heading:
        return None  # No heading mentions the keyword

    # h2 -> 2, h3 -> 3, ...; every heading at this level or above ends the section.
    heading_level = int(heading.name[1])
    stop_names = set(heading_names[:heading_level])

    content = []
    for sibling in heading.find_next_siblings():
        if sibling.name in stop_names:
            break
        text = sibling.get_text()
        if text:
            content.append(text)

    return "\n".join(content)

def get_slugs(url):
    """Return the beatmap slugs linked from the page at ``url``.

    Downloads the page and visits every anchor tag that carries an ``href``.
    Links whose target matches the module-level ``pattern`` are kept, and the
    last path segment of each is extracted with ``get_last_part``.

    Returns:
        list[str]: one slug per matching link, in the order the links were
        found; repeated links produce repeated slugs.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    return [
        get_last_part(a_tag["href"])
        for a_tag in soup.find_all("a", href=True)
        if pattern.match(a_tag["href"])
    ]

def generate_subsection_dict_n(subsection):
    """Map each line after the first of ``subsection`` to a numbered key.

    The stripped first line is the key prefix.  The k-th following line
    (counting from 1) is stored, stripped, as ``{"meta": line}`` under the key
    made of the prefix followed by k.  A subsection with only one line gives
    an empty dict.
    """
    header, *entries = subsection.split('\n')
    prefix = header.strip()
    return {
        f"{prefix}{number}": {"meta": entry.strip()}
        for number, entry in enumerate(entries, start=1)
    }

def generate_subsection_dict_slugs(subsection, slugs):
    """Pair each line after the first of ``subsection`` with a slug by position.

    The stripped first line is the key prefix.  The k-th following line
    (counting from 1) gets the key made of the prefix followed by k, with the
    value ``{"id": slugs[k - 1]}``.  A subsection with only one line gives an
    empty dict; surplus slugs are ignored.

    Raises:
        IndexError: if ``slugs`` has fewer items than there are lines after
            the first.
    """
    header, *entries = subsection.split('\n')
    prefix = header.strip()
    # Index explicitly rather than zip() so that a short slug list still raises.
    return {
        f"{prefix}{number}": {"id": slugs[number - 1]}
        for number in range(1, len(entries) + 1)
    }

def make_tdic(filtered_sections, slugs, tname):
    """Group the parsed mappool subsections by stage, keyed under ``tname``.

    Single-line entries of ``filtered_sections`` are taken as stage names,
    except those containing "Download", "VOD", "played" or "This", which are
    ignored.  Multi-line entries are map groups: each one is converted with
    ``generate_subsection_dict_n`` and ``generate_subsection_dict_slugs`` and
    consumes one item of ``slugs`` per line after its first.

    NOTE(review): only the first group that follows each stage name is kept,
    and groups appearing before the first stage name still occupy the leading
    offsets.  Both look unintended -- confirm before relying on this function.

    Args:
        filtered_sections: Subsection strings; see the description above.
        slugs: Flat sequence of slugs, consumed in order.
        tname: Top-level key used in both returned dicts.

    Returns:
        A pair ``({tname: {stage: meta_dict}}, {tname: {stage: id_dict}})``.

    Raises:
        UnboundLocalError: if ``filtered_sections`` is empty.
        IndexError: if a stage name is not followed by any group, or if
            ``slugs`` runs out.
    """
    markers_to_skip = ("Download", "VOD", "played", "This")

    stage_names = []
    groups_per_stage = []
    meta_dicts = []
    slug_dicts = []
    groups_in_stage = 0
    slug_cursor = 0

    for section in filtered_sections:
        if "\n" in section:
            # Map group: takes the next len(lines) - 1 slugs.
            slug_start = slug_cursor
            slug_cursor += section.count("\n")
            meta_dicts.append(generate_subsection_dict_n(section))
            slug_dicts.append(
                generate_subsection_dict_slugs(section, slugs[slug_start:slug_cursor])
            )
            groups_in_stage += 1
            continue

        if any(marker in section for marker in markers_to_skip):
            continue

        # Stage name: close the running count and start a new one.
        groups_per_stage.append(groups_in_stage)
        groups_in_stage = 0
        stage_names.append(section)

    groups_per_stage.append(groups_in_stage)
    # Append-then-remove leaves the list unchanged; it is kept only because
    # reading the loop variable here is what raises on empty input.
    stage_names.append(section)
    stage_names.pop()
    # The first count belongs to whatever preceded the first stage name.
    groups_per_stage.pop(0)

    bounds = [0] + np.cumsum(groups_per_stage).tolist()

    names_by_stage = {}
    ids_by_stage = {}
    for stage, low, high in zip(stage_names, bounds, bounds[1:]):
        names_by_stage[stage] = meta_dicts[low:high][0]
        ids_by_stage[stage] = slug_dicts[low:high][0]

    return {tname: names_by_stage}, {tname: ids_by_stage}

def add_slugs_to_dict(tname, slugs):
    """Wrap ``slugs`` in a fresh nested dict keyed by ``tname``.

    Args:
        tname (str): The top-level key (e.g., tournament name).
        slugs (list): Slugs to store; order and duplicates are preserved.

    Returns:
        dict: ``{tname: {"id": [slug, ...]}}`` holding a new list.
    """
    return {tname: {"id": list(slugs)}}

# Matches absolute https://osu.ppy.sh links whose path starts with
# /beatmapsets/ or /b/; get_slugs() uses it to select links from a page.
pattern = re.compile(r"^https://osu\.ppy\.sh/(beatmapsets|b)/")

def main(url):
    """Scrape one tournament page and save its beatmap slugs as JSON.

    The slugs found by ``get_slugs`` are stored under the name returned by
    ``get_tname``, printed, and written to ``./dbs/{tname}.json``.  The
    output directory is created if it does not exist.

    Raises:
        AttributeError: when the page has no mappool heading
            (``get_section_by_heading`` returns ``None``, which
            ``filter_all_subsections`` cannot split).
    """
    # TODO: insert null slugs where a map has no link, so that slugs stay
    # aligned with the mappool entries.
    section_text = get_section_by_heading(url, "mappool")
    # Not used in the output yet, but this call is what rejects pages
    # without a mappool section (see Raises above).
    filtered_sections = filter_all_subsections(section_text)
    VAFslugs = get_slugs(url)
    tname = get_tname(url)
    # make_tdic(filtered_sections, VAFslugs, tname) would give the per-stage
    # layout, but it fails when a mappool entry has no link, so only the
    # flat slug list is stored for now.
    tdic_s = add_slugs_to_dict(tname, VAFslugs)
    print(tdic_s)

    # open(..., "w") does not create missing parent directories.
    os.makedirs("./dbs", exist_ok=True)
    fn = "./dbs/" + tname + ".json"
    with open(fn, "w") as json_file:
        json.dump(tdic_s, json_file)

# Load the page URLs to scrape, one per line.  Blank lines are dropped so
# that main() is never called with an empty URL.
with open("urls.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    try:
        main(url)
    except AttributeError:
        # main() raises this when a page has no mappool heading; report the
        # URL instead of skipping it silently, then carry on with the rest.
        print("Error url: {}".format(url))

'''
for url in urls:
    try:
        main(url)
    except (IndexError, AttributeError) as e:
        print("Error url: {}".format(url))
'''
        





{'TWC2011': {'id': ['99165', '87775', '58061', '72404', '88821', '88245', '61917', '39076', '51845', '75184', '94475', '58361', '45895', '80425', '78390', '69725', '72946', '89363', '42796', '79083', '81344', '50430', '70216', '93176', '86969', '75926', '70334', '90616', '79035', '80827', '76993', '76530', '60646', '67202', '77710', '46300', '49077', '47462', '77232', '73769', '78034', '69595', '86199', '91077', '90170', '84587', '50461', '89450', '78760', '48999', '84803', '80556', '87507', '28306', '53245', '67051', '62760', '60790', '58041', '49881', '77928', '76450', '62590', '79281', '49322', '71514', '60189', '76510', '82529', '67469', '59002', '88920', '33963', '41044', '48258', '74498', '72069', '59792', '73057', '41909', '74478', '66330', '62290', '69385', '37370', '48075', '53310', '70659', '56732', '86669', '38237', '70966', '59424', '74057', '78220', '44998', '70624', '75251']}}
{'TWC2012': {'id': ['93176', '77232', '103052', '99749', '88821', '88245', '89363', '102527', '6

'\n\n# to merge all the json files\n\njson_files = glob.glob("path/to/jsons/*.json")  # Adjust this to match your files\n\nmerged_data = {}\n\nfor file_path in json_files:\n    with open(file_path, "r") as f:\n        data = json.load(f)\n        merged_data.update(data)  # Adds the single top-level key\n\n# Save the merged result\nwith open("merged.json", "w") as f:\n    json.dump(merged_data, f, indent=2)\n\n'

In [4]:
import glob
import json
import os

# Merge all per-tournament JSON files into one database file.
# glob.glob() returns paths in arbitrary order; sorting makes the merge
# order -- and which file wins on a repeated top-level key -- reproducible.
json_files = sorted(glob.glob("./dbs/*.json"))  # Adjust this to match your files

merged_data = {}

for file_path in json_files:
    with open(file_path, "r") as f:
        data = json.load(f)
        merged_data.update(data)  # Adds the single top-level key

# Save the merged result, creating the output directory when it is missing.
os.makedirs("./data", exist_ok=True)
with open("./data/database.json", "w") as f:
    json.dump(merged_data, f)
