# Osu Dataset Maker

**How To Use**


*   Make 'osu_dataset' folder.
*   Put .osz files downloaded from osu beatmap website into the folder.
*   Run the code.




**File Name Format**


*   {setId}-{star_rating}-{key_count}.osu

*   {setId}.mp3
  *  {setId} : beatmap set id (can search in osu beatmap website by this id)
  *  {key_count} : number of using keys
  *  {star_rating} : osu ★ difficulty
 
*  ex) 443632-163-4.osu 
 *  ID 443632
 *  ★ 1.63
 *  4 key 

※ Each songs can have more than 1 beatmap.
(different difficulty or key)


In [1]:
# import libraries

import os
import zipfile
import re
import requests
import time
import http.client
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# input the folder path

folder_path = Path('osu_dataset/') # <== folder path name

In [6]:
osz_fns = sorted(list(folder_path.glob('*.osz')))

In [2]:
# Test: rename all ZIP to OSU

zips = Path('osu_dataset').glob('*.zip')
for zip in zips:
    zip.rename(zip.with_suffix('.osz'))

In [2]:
'''
Functions for
1. unzip .osz files
2. rename .osu .mp3 files
'''

def unzip_osz(osz_fn_list, output_path:Path, star_rating_upper_bound=400):
  excluded_path = output_path / 'excluded/'
  excluded_path.mkdir(exist_ok=True)

  zip_path = output_path / 'zip/'
  zip_path.mkdir(exist_ok=True)

  skipped_path = output_path / 'skipped'
  skipped_path.mkdir(exist_ok=True)

  for fn in tqdm(osz_fn_list):
    # Change .osz to .zip
    zip_fn = fn.with_suffix('.zip')
    fn.rename(zip_fn)

    # Unzip .zip
    unzipped_audio_fns = []
    unzipped_osu_fns = []
    with zipfile.ZipFile(zip_fn, 'r') as zip_ref:
      for file in zip_ref.namelist():
        if file.endswith('.mp3') or file.endswith('.ogg') or file.endswith('.wav'):
          zip_ref.extract(file, output_path)
          unzipped_audio_fns.append(output_path / file)
        elif file.endswith('.osu'):
          zip_ref.extract(file, output_path)
          unzipped_osu_fns.append(output_path / file)
    zip_fn.rename(zip_path / zip_fn.name)
    
    audio_fn, _, setId, _ = get_info(unzipped_osu_fns[0])
    audio_fn = output_path / audio_fn
    for f in unzipped_audio_fns:
      if f != audio_fn:
        f.unlink()
    audio_fn.rename(output_path / (setId + audio_fn.suffix))

    for osu_fn in unzipped_osu_fns:
      audio_fn, mode, setId, id = get_info(osu_fn)
      if mode != '3':
        osu_fn.unlink()
      else:
        star_rating, key_count = crawl(setId, id)
        if star_rating == -1:
          print(f'Crawl error: {osu_fn.name}')
          osu_fn.rename(skipped_path / osu_fn.name)
          continue
        if len(star_rating) == 1:
          star_rating = int(star_rating + '00')
        elif len(star_rating) == 2:
          star_rating = int(star_rating + '0')
        else:
          star_rating = int(star_rating)
        
        new_osu_fn = output_path / f'{setId}-{star_rating}-{key_count}.osu'
        try:
          osu_fn.rename(new_osu_fn)
        except:
          new_osu_fn = output_path / f'{setId}-{star_rating}-{key_count}-{id}.osu'
        if star_rating > star_rating_upper_bound:
          new_osu_fn.rename(excluded_path / new_osu_fn.name)

# get infos from .osu file and web( using crawler() function below )

def get_info(osu_fn):
  with open(osu_fn, mode='r', encoding='utf-8') as f:
      raw_content = f.read().splitlines()

  start_index = raw_content.index('[General]')
  mode = re.sub(r"\D", "", raw_content[start_index + 7])
  audio_fn = raw_content[start_index + 1][15:]

  start_index = raw_content.index('[Metadata]')
  setId = re.sub(r"\D", "", raw_content[start_index + 10])
  id = re.sub(r"\D", "", raw_content[start_index + 9])

  return audio_fn, mode, setId, id
  
# get infos from web
def crawl(setId, id):
  try:
    conn = http.client.HTTPSConnection("api.scrapingant.com")
    conn.request("GET", f"/v2/general?url=https%3A%2F%2Fosu.ppy.sh%2Fbeatmapsets%2F{setId}%23mania%2F{id}&x-api-key=ebb087837ed7450e8557976d9fcc19f6")
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    soup = BeautifulSoup(data, "html.parser")
    # find datas
    text =soup.find('script', attrs={"id" : "json-beatmapset"}).get_text()

    idx =text.find(str(id))
    sr_idx = text[:idx].rfind("diff")
    kc_idx = text[idx:].find("cs")+idx

    star_rating = text[sr_idx-1:sr_idx+23]
    key_count = text[kc_idx-1:kc_idx+5]
    return re.sub(r"\D", "", star_rating), re.sub(r"\D", "", key_count)
  except:
    return -1, -1

In [3]:
unzip_osz(sorted(list(Path('osu_dataset/').glob('*.osz'))), Path('osu_dataset/'))

 12%|█▏        | 44/372 [55:02<7:28:50, 82.11s/it] 

Crawl error: Masayoshi Minoshima - Bad Apple!! (REDALiCE Remix) (Evening) [Lv.8 4K].osu


 21%|██        | 78/372 [1:41:18<7:19:51, 89.77s/it]  

Crawl error: Meg & Dia - Monster (DotEXE Remix) (nold_1702) [Zillah's 4K Normal].osu


 31%|███       | 115/372 [2:44:39<6:07:59, 85.91s/it]  


FileNotFoundError: [WinError 2] The system cannot find the file specified: 'osu_dataset\\Canon Rock (JerryC) - The Original.mp3' -> 'osu_dataset\\203734.mp3'



---





---



Can set difficulty threshold (upper bound)

To set, change the threshold value and run the code below.

*Star rating 최댓값 설정 가능. 나머지 다 지워버림*

In [None]:
upper_bound  = '270'  # format : 3 number string 2.17 => 217

file_list = os.listdir(folder_path)
for file in file_list:
  if file.endswith('.osu'):
    star_rating = file.split('-')[1]

    if len(star_rating) == 1:
      star_rating = star_rating+'00'
    elif len(star_rating) == 2:
      star_rating = star_rating+'0'

    if star_rating > upper_bound:
      print('removed : ', file)
      os.remove(os.path.join(folder_path, file))


removed :  126752-283-4.osu
removed :  106212-285-4.osu




---



Remove .zip file and remain only .osu or .mp3 (cannot undo)

In [None]:
file_list = os.listdir(folder_path)
for file in file_list:
  if file.endswith('.zip'):
    print('removed : ', file)
    os.remove(os.path.join(folder_path, file))


removed :  63089 fripSide - only my railgun (TV Size).zip
removed :  126752 Yun_chi - Your song_.zip
removed :  106212 LeaF - MEPHISTO.zip




---



# 일반쓰레기

쓰다버린코드 아카이브

In [None]:
#Trashes
  '''
  url = "https://osu.ppy.sh/beatmapsets/" + str(setId) + "#mania/" + str(id) 
  print(url)
  response = requests.get(url)
  time.sleep(3)
  soup = BeautifulSoup(response.content, 'html.parser')
  print(soup)
  key_count = soup.find('th', attrs={"class": "beatmap-stats-table__label"}, string = "Key Count")
  print(key_count)
  key_count = key_count.find_next_sibling('td', class_='beatmap-stats-table__value').text

  star_rating = soup.find('th', attrs={"class": "beatmap-stats-table__label"}, string = "Star Rating")
  star_rating = star_rating.find_next_sibling('td', class_='beatmap-stats-table__value').text
  '''

  '''
  setId, star_rating, key_count = get_info(os.path.join(os.path.dirname(file_path), file))
  new_file_name = f"{setId}-{star_rating}-{key_count}.osu"  # 새로운 파일 이름 생성
  extracted_path = os.path.join(os.path.dirname(new_file_path), new_file_name)
  zip_ref.extract(file, extracted_path)
  '''
  
  '''    
  for file in zip_ref.namelist():
    if file.endswith('.mp3'):
      new_file_name = f"{setId}.osu"  # 새로운 파일 이름 생성
      extracted_path = os.path.join(os.path.dirname(new_file_path), new_file_name)
      zip_ref.extract(file, os.path.dirname(new_file_path))
  '''

In [None]:
#전체삭제코드
file_list = os.listdir(folder_path)
files = [os.path.join(folder_path, file) for file in file_list]
for file in files:
  os.remove(file)