# Osu Dataset Maker

**How To Use**


*   Make 'osu_dataset' folder.
*   Put .osz files downloaded from osu beatmap website into the folder.
*   Run the code.




**File Name Format**


*   {setId}-{star_rating}-{key_count}.osu

*   {setId}.mp3
  *  {setId} : beatmap set id (can search in osu beatmap website by this id)
  *  {key_count} : number of using keys
  *  {star_rating} : osu ★ difficulty
 
*  ex) 443632-163-4.osu 
 *  ID 443632
 *  ★ 1.63
 *  4 key 

※ Each songs can have more than 1 beatmap.
(different difficulty or key)


In [1]:
# import libraries

import os
import zipfile
import re
import requests
import time
import http.client
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from utils import move_file

In [3]:
# Test: rename all ZIP to OSU

zips = Path('osu_dataset').glob('*.zip')
for zip in zips:
    zip.rename(zip.with_suffix('.osz'))

In [6]:
'''
Functions for
1. unzip .osz files
2. rename .osu .mp3 files
'''

def unzip_osz(osz_fn_list, output_path:Path, star_rating_upper_bound=400):
  excluded_path = output_path / 'excluded/'
  excluded_path.mkdir(exist_ok=True)

  zip_path = output_path / 'zip/'
  zip_path.mkdir(exist_ok=True)

  skipped_path = output_path / 'skipped'
  skipped_path.mkdir(exist_ok=True)

  for fn in tqdm(osz_fn_list):
    # Change .osz to .zip
    zip_fn = fn.with_suffix('.zip')
    fn.rename(zip_fn)

    # Unzip .zip
    unzipped_audio_fns = []
    unzipped_osu_fns = []
    with zipfile.ZipFile(zip_fn, 'r') as zip_ref:
      for file in zip_ref.namelist():
        if file.endswith('.mp3') or file.endswith('.ogg') or file.endswith('.wav') or file.endswith('.MP3') or file.endswith('.WAV') or file.endswith('.OGG'):
          zip_ref.extract(file, output_path)
          unzipped_audio_fns.append(output_path / file)
        elif file.endswith('.osu'):
          zip_ref.extract(file, output_path)
          unzipped_osu_fns.append(output_path / file)
    move_file(zip_fn, zip_path / zip_fn.name)
    
    audio_fn, _, setId, _ = get_info(unzipped_osu_fns[0])
    audio_fn = output_path / audio_fn
    for f in unzipped_audio_fns:
      if f != audio_fn:
        f.unlink()

    new_audio_fn = output_path / (setId + audio_fn.suffix)
    move_file(audio_fn, new_audio_fn)

    for osu_fn in unzipped_osu_fns:
      audio_fn, mode, setId, id = get_info(osu_fn)
      if mode != '3':
        osu_fn.unlink()
      else:
        star_rating, key_count = crawl(setId, id)
        if star_rating == -1:
          print(f'Crawl error: {osu_fn.name}')
          move_file(osu_fn, skipped_path / osu_fn.name)
          continue
        if len(star_rating) == 1:
          star_rating = star_rating + '00'
        elif len(star_rating) == 2:
          star_rating = star_rating + '0'
        
        new_osu_fn = output_path / f'{setId}-{star_rating}-{key_count}.osu'
        move_file(osu_fn, new_osu_fn)

        if int(star_rating) > int(star_rating_upper_bound):
          move_file(new_osu_fn, excluded_path / new_osu_fn.name)

def get_info(osu_fn):
  with open(osu_fn, mode='r', encoding='utf-8') as f:
      raw_content = f.read().splitlines()

  start_index = raw_content.index('[General]')
  mode = re.sub(r"\D", "", raw_content[start_index + 7])
  audio_fn = raw_content[start_index + 1][15:]

  start_index = raw_content.index('[Metadata]')
  setId = re.sub(r"\D", "", raw_content[start_index + 10])
  id = re.sub(r"\D", "", raw_content[start_index + 9])

  return audio_fn, mode, setId, id
  
# get infos from web
def crawl(setId, id):
  try:
    # response = requests.get(
    #     url='https://app.scrapingbee.com/api/v1/',
    #     params={
    #         'api_key': 'P4QN19DR7W73O8U0ZYE8W2ECLUZJ5X6F322K7N2CDS6D8ON16VARHYZI8ER3J79WUE80KWK91WY3MQU1',
    #         'url': f'https://osu.ppy.sh/beatmapsets/{setId}#mania/{id}', 
    #         'wait': '200' 
    #     },
    # )
    # soup = BeautifulSoup(response.content)
    conn = http.client.HTTPSConnection("api.scrapingant.com")
    conn.request("GET", f"/v2/general?url=https%3A%2F%2Fosu.ppy.sh%2Fbeatmapsets%2F{setId}%23mania%2F{id}&x-api-key=ebb087837ed7450e8557976d9fcc19f6&browser=false")
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    soup = BeautifulSoup(data)
    # find datas
    text =soup.find('script', attrs={"id" : "json-beatmapset"}).get_text()

    idx =text.find(str(id))
    sr_idx = text[:idx].rfind("diff")
    kc_idx = text[idx:].find("cs")+idx

    star_rating = text[sr_idx-1:sr_idx+23]
    key_count = text[kc_idx-1:kc_idx+5]
    return re.sub(r"\D", "", star_rating), re.sub(r"\D", "", key_count)
  except BaseException as e:
    print(e)
    return -1, -1

In [9]:
def convert_osu(osu_fn_list, output_path:Path, star_rating_upper_bound=400):
    excluded_path = output_path / 'excluded/'
    excluded_path.mkdir(exist_ok=True)

    skipped_path = output_path / 'skipped/'
    skipped_path.mkdir(exist_ok=True)

    for osu_fn in tqdm(osu_fn_list):
        audio_fn, mode, setId, id = get_info(osu_fn)
        audio_fn = output_path / audio_fn

        if mode != '3':
            osu_fn.unlink()
            continue

        star_rating, key_count = crawl(setId, id)
        if star_rating == -1:
            print(f'ERROR Crawl failed: {osu_fn.name}')
            move_file(osu_fn, skipped_path / osu_fn.name)
            continue

        if len(star_rating) == 1:
            star_rating = star_rating + '00'
        elif len(star_rating) == 2:
            star_rating = star_rating + '0'

        new_audio_fn = output_path / (setId + audio_fn.suffix)
        if not new_audio_fn.exists():
            print(f'WARN audio not found: f{new_audio_fn.name}')

        new_osu_fn = output_path / f'{setId}-{star_rating}-{key_count}.osu'
        move_file(osu_fn, new_osu_fn)

        if int(star_rating) > int(star_rating_upper_bound):
            move_file(new_osu_fn, excluded_path / new_osu_fn.name)

In [2]:
response = requests.get(
        url='https://app.scrapingbee.com/api/v1/',
        params={
            'api_key': 'P4QN19DR7W73O8U0ZYE8W2ECLUZJ5X6F322K7N2CDS6D8ON16VARHYZI8ER3J79WUE80KWK91WY3MQU1',
            'url': f'https://osu.ppy.sh/beatmapsets/1865663#mania/3850209', 
            'wait': '200'
        },
    )
print(response)

<Response [200]>


In [3]:
unzip_osz(sorted(list(Path('osu_dataset/').glob('*.osz'))), Path('osu_dataset/'))

  0%|          | 0/138 [00:00<?, ?it/s]

'NoneType' object has no attribute 'get_text'
Crawl error: nekodex - circles! (FAMoss) [normal!].osu
'NoneType' object has no attribute 'get_text'
Crawl error: Camellia feat. Nanahira - Versus! (Cut Ver.) (Rhezie) [Fight!].osu
'NoneType' object has no attribute 'get_text'
Crawl error: capitaro - Tenshinranman Haikara Hime (Syadow-) [MX].osu


In [10]:
convert_osu(sorted(list(Path('osu_dataset/skipped/').glob('*.osu'))), Path('osu_dataset/'))

  0%|          | 0/116 [00:00<?, ?it/s]