# Osu Dataset Maker

**How To Use**


*   Create a folder named 'osu_dataset'.
*   Put the .osz files downloaded from the osu! beatmap website into that folder.
*   Run the code.




**File Name Format**


*   {setId}-{star_rating}-{key_count}.osu

*   {setId}.mp3
  *  {setId} : beatmap set ID (you can search the osu! beatmap website by this ID)
  *  {key_count} : number of keys used
  *  {star_rating} : osu! ★ difficulty, written without the decimal point
 
*  ex) 443632-163-4.osu 
 *  ID 443632
 *  ★ 1.63
 *  4 key 

※ Each song can have more than one beatmap
(different difficulties or key counts).


In [None]:
# import libraries

import os
import zipfile
import re
import requests
from bs4 import BeautifulSoup

In [2]:
# Folder that holds the downloaded .osz archives; the cells below list it and
# write the extracted .osu/.mp3 files next to the archives inside it.

folder_path = 'osu_dataset' # <== folder path name

In [None]:
# Collect the path of every .osz archive found in the dataset folder

file_list = os.listdir(folder_path)
osz_files = [
  os.path.join(folder_path, entry)
  for entry in file_list
  if entry.endswith('.osz')
]
osz_files  # last expression of the cell, so the notebook displays the list

['osu_dataset/63089 fripSide - only my railgun (TV Size).osz',
 'osu_dataset/106212 LeaF - MEPHISTO.osz',
 'osu_dataset/126752 Yun_chi - Your song_.osz']

In [None]:
'''
Functions for
1. unzip .osz files
2. rename .osu .mp3 files
'''

def unzip_osz(file_path):
  """Extract the beatmaps and audio from one .osz archive and rename them.

  The archive itself is renamed from ``.osz`` to ``.zip`` and left in place.
  Every ``.osu`` and ``.mp3`` member is extracted next to the archive. Each
  ``.osu`` is renamed to ``{setId}-{star_rating}-{key_count}.osu`` using the
  values returned by ``get_info``, and each ``.mp3`` is renamed to
  ``{setId}.mp3``.

  All ``.osu`` files are handled before any ``.mp3`` so the set id is always
  known when the audio is renamed, whatever order the archive lists its
  members in. If the archive has no ``.osu`` file there is no set id to name
  the audio after: the extracted ``.mp3`` files keep their original names and
  a notice is printed.

  Args:
    file_path: path to the ``.osz`` file.
  """
  folder = os.path.dirname(file_path)

  # Change .osz to .zip
  zip_path = file_path[:-4] + '.zip'
  os.rename(file_path, zip_path)

  # Unzip .zip (only the beatmap and audio members)
  osu_names = []
  mp3_names = []
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    for member in zip_ref.namelist():
      if member.endswith('.osu'):
        osu_names.append(member)
      elif member.endswith('.mp3'):
        mp3_names.append(member)
      else:
        continue
      zip_ref.extract(member, folder)

  # Rename the beatmaps first; they are the only source of the set id
  setId = None
  for member in osu_names:
    old_file_path = os.path.join(folder, member)
    setId, star_rating, key_count = get_info(old_file_path)
    new_file_name = f"{setId}-{star_rating}-{key_count}.osu"
    os.rename(old_file_path, os.path.join(folder, new_file_name))

  if setId is None:
    if mp3_names:
      print('skipped renaming .mp3 (no .osu in archive) : ', zip_path)
    return

  # Rename the audio after the set id taken from the beatmaps
  for member in mp3_names:
    old_file_path = os.path.join(folder, member)
    os.rename(old_file_path, os.path.join(folder, f"{setId}.mp3"))

# get infos from .osu file and web( using crawler() function below )
def get_info(file):
  """Return ``(setId, star_rating, key_count)`` for one extracted .osu file.

  ``BeatmapID`` and ``BeatmapSetID`` are read from the ``[Metadata]`` section
  by key name, so the result does not depend on how many other keys the
  section holds or in which order they appear. The star rating and key count
  are then fetched with ``crawler``.

  Args:
    file: path to the .osu file.

  Returns:
    Tuple ``(setId, star_rating, key_count)``; ``setId`` is a digit string,
    the other two are whatever ``crawler`` returns.

  Raises:
    ValueError: if the file has no ``[Metadata]`` section, or if either id
      is missing or is not a plain non-negative number.
  """
  with open(file, mode='r', encoding='utf-8') as f:
    raw_content = [line.strip() for line in f.read().splitlines()]

  start_index = raw_content.index('[Metadata]')

  # Gather the "key:value" pairs of the section, stopping at the next header
  metadata = {}
  for line in raw_content[start_index + 1:]:
    if line.startswith('['):
      break
    key, sep, value = line.partition(':')
    if sep:
      metadata[key.strip()] = value.strip()

  beatmap_id = metadata.get('BeatmapID', '')
  setId = metadata.get('BeatmapSetID', '')
  # Stripping non-digits would turn a value such as "-1" into "1" and make
  # the crawler look up an unrelated beatmap, so reject such values instead.
  if not (beatmap_id.isdigit() and setId.isdigit()):
    raise ValueError(f"no usable BeatmapID/BeatmapSetID in {file}")

  star_rating, key_count = crawler(setId, beatmap_id)

  return (setId, star_rating, key_count)
  
# get infos from web
def crawler(setId, id):
  """Fetch the star rating and key count of one beatmap from the osu! website.

  Downloads the beatmap set page, takes the text of the
  ``<script id="json-beatmapset">`` element and searches it as plain text:
  the first occurrence of ``id``, the nearest ``diff...`` key before it and
  the first ``cs`` key after it.
  NOTE(review): this assumes the embedded JSON lists ``difficulty_rating``
  before, and ``cs`` after, each beatmap's id -- confirm against a live page.

  Args:
    setId: beatmap set id (used to build the page URL).
    id: beatmap id to look for inside the page data.

  Returns:
    Tuple ``(star_rating, key_count)`` of digit strings. ``star_rating`` is
    the rating multiplied by 100 and rounded (1.63 -> "163", 2.7 -> "270"),
    so ratings always carry two decimals. ``key_count`` is the full integer
    following ``cs`` (two-digit key counts are no longer truncated).

  Raises:
    requests.HTTPError: if the page request returns an error status.
    ValueError: if the page data or the expected fields cannot be found.
  """
  url = "https://osu.ppy.sh/beatmapsets/" + str(setId) + "#mania/" + str(id)
  # Without a timeout a single stalled connection blocks the whole batch.
  web = requests.get(url, timeout=30)
  web.raise_for_status()
  soup = BeautifulSoup(web.content, "html.parser")
  # find datas
  script = soup.find('script', attrs={"id" : "json-beatmapset"})
  if script is None:
    raise ValueError(f"no beatmapset data found at {url}")
  text = script.get_text()

  idx = text.find(str(id))
  if idx == -1:
    raise ValueError(f"beatmap {id} not found in the data of set {setId}")
  sr_idx = text[:idx].rfind("diff")
  kc_offset = text[idx:].find("cs")
  if sr_idx == -1 or kc_offset == -1:
    raise ValueError(f"rating or key count not found for beatmap {id}")
  kc_idx = kc_offset + idx

  # Read the whole number after each key instead of a fixed-width slice.
  sr_match = re.match(r'diff\w*"?\s*:\s*"?(\d+(?:\.\d+)?)', text[sr_idx:])
  kc_match = re.match(r'cs"?\s*:\s*"?(\d+)', text[kc_idx:])
  if sr_match is None or kc_match is None:
    raise ValueError(f"unexpected data layout for beatmap {id}")

  star_rating = str(round(float(sr_match.group(1)) * 100))
  key_count = kc_match.group(1)
  return star_rating, key_count

In [None]:
# Extract and rename the contents of every collected .osz archive,
# printing each archive path as it is processed

for archive_path in osz_files:
  print(archive_path)
  unzip_osz(archive_path)

osu_dataset/63089 fripSide - only my railgun (TV Size).osz
osu_dataset/106212 LeaF - MEPHISTO.osz
osu_dataset/126752 Yun_chi - Your song_.osz




---



To keep only native osu!mania beatmaps,
run the code below.

*OsuMania 호환 Osu 비트맵 지우는 코드, 실행하면 찐 osumania 비트맵만 남기기 가능*

In [None]:
def _read_mode(osu_path):
  """Return the ``Mode`` value of a .osu file's [General] section as a string.

  The key is looked up by name rather than by line position, because the
  section does not always hold the same keys in the same order. When the key
  is absent '0' is returned, the documented default of the .osu format.

  Raises:
    ValueError: if the file has no ``[General]`` section.
  """
  with open(osu_path, mode='r', encoding='utf-8') as f:
    lines = [line.strip() for line in f]

  start_index = lines.index('[General]')
  for line in lines[start_index + 1:]:
    if line.startswith('['):  # next section header: Mode was not set
      break
    key, sep, value = line.partition(':')
    if sep and key.strip() == 'Mode':
      return value.strip()
  return '0'


# Delete every .osu file in the dataset folder whose game mode is not 3
# (osu!mania), so only native osu!mania beatmaps remain.
file_list = os.listdir(folder_path)
osu_files = [os.path.join(folder_path, file) for file in file_list if file.endswith('.osu')]
for osu_path in osu_files:
  mode = _read_mode(osu_path)
  if mode != '3':
    print('removed : ', osu_path)
    os.remove(osu_path)



---



You can set a difficulty threshold (upper bound).

To set it, change the `upper_bound` value and run the code below.

*Star rating 최댓값 설정 가능. 나머지 다 지워버림*

In [None]:
upper_bound  = '270'  # format : 3 number string 2.17 => 217

# Delete every {setId}-{star_rating}-{key_count}.osu file whose star rating
# is above upper_bound.
file_list = os.listdir(folder_path)
for file in file_list:
  if not file.endswith('.osu'):
    continue

  fields = file.split('-')
  # Files that do not follow the dataset naming scheme carry no rating;
  # leave them alone instead of comparing arbitrary text with the bound.
  if len(fields) < 3 or not fields[1].isdecimal():
    continue

  # A 1- or 2-digit rating is missing trailing zeros ("27" means 2.70)
  star_rating = fields[1].ljust(3, '0')

  # Compare as numbers: string comparison is lexicographic and misorders
  # values of different lengths.
  if int(star_rating) > int(upper_bound):
    print('removed : ', file)
    os.remove(os.path.join(folder_path, file))


removed :  126752-283-4.osu
removed :  106212-285-4.osu




---



Remove the .zip files so that only the .osu and .mp3 files remain (cannot be undone)

In [None]:
# Delete every .zip archive left in the dataset folder, reporting each one
file_list = os.listdir(folder_path)
zip_names = [entry for entry in file_list if entry.endswith('.zip')]
for zip_name in zip_names:
  print('removed : ', zip_name)
  os.remove(os.path.join(folder_path, zip_name))


removed :  63089 fripSide - only my railgun (TV Size).zip
removed :  126752 Yun_chi - Your song_.zip
removed :  106212 LeaF - MEPHISTO.zip




---



# 일반쓰레기

쓰다버린코드 아카이브

In [None]:
# Trashes: discarded code fragments, kept only as string literals for reference
  '''
  url = "https://osu.ppy.sh/beatmapsets/" + str(setId) + "#mania/" + str(id) 
  print(url)
  response = requests.get(url)
  time.sleep(3)
  soup = BeautifulSoup(response.content, 'html.parser')
  print(soup)
  key_count = soup.find('th', attrs={"class": "beatmap-stats-table__label"}, string = "Key Count")
  print(key_count)
  key_count = key_count.find_next_sibling('td', class_='beatmap-stats-table__value').text

  star_rating = soup.find('th', attrs={"class": "beatmap-stats-table__label"}, string = "Star Rating")
  star_rating = star_rating.find_next_sibling('td', class_='beatmap-stats-table__value').text
  '''

  '''
  setId, star_rating, key_count = get_info(os.path.join(os.path.dirname(file_path), file))
  new_file_name = f"{setId}-{star_rating}-{key_count}.osu"  # 새로운 파일 이름 생성
  extracted_path = os.path.join(os.path.dirname(new_file_path), new_file_name)
  zip_ref.extract(file, extracted_path)
  '''
  
  '''    
  for file in zip_ref.namelist():
    if file.endswith('.mp3'):
      new_file_name = f"{setId}.osu"  # 새로운 파일 이름 생성
      extracted_path = os.path.join(os.path.dirname(new_file_path), new_file_name)
      zip_ref.extract(file, os.path.dirname(new_file_path))
  '''

In [None]:
# Delete every file in the dataset folder (cannot be undone)
file_list = os.listdir(folder_path)
files = [os.path.join(folder_path, file) for file in file_list]
for file in files:
  # os.remove raises on a directory, which would abort the cleanup midway;
  # extraction can leave sub-folders behind, so skip real directories.
  if os.path.isdir(file) and not os.path.islink(file):
    continue
  os.remove(file)