In [76]:
import requests
from pathlib import Path
from urllib.request import urlretrieve
from urllib.error import HTTPError
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from tqdm import tqdm

## VGMusic

In [77]:
def get_systems_links():
    """Scrape the VGMusic front page for the links to every system listing.

    The anchor pointing at the NES listing is used as a landmark: its parent
    element is taken to be the menu that holds one anchor per system.

    Returns
    -------
    dict
        Maps the text of each menu anchor to the absolute URL it points to.

    Raises
    ------
    requests.HTTPError
        If the front page answers with an error status.
    RuntimeError
        If the NES landmark anchor is not present in the page.
    """
    # Local import so the cell does not depend on an extra top-level import.
    from urllib.parse import urljoin

    base_url = 'https://www.vgmusic.com/'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    landmark = soup.find('a', href='/music/console/nintendo/nes/')
    if landmark is None:
        raise RuntimeError('could not locate the system menu on ' + base_url)

    # urljoin resolves root-relative hrefs without producing a double slash,
    # which plain concatenation with the base URL did.
    return {
        anchor.text: urljoin(base_url, anchor['href'])
        for anchor in landmark.parent.find_all('a', href=True)
    }

In [78]:
def get_midi_links(system_link, system_name):
    """Collect one entry per MIDI file listed on a VGMusic system page.

    Parameters
    ----------
    system_link : str
        URL of the system listing page; relative hrefs are resolved against it.
    system_name : str
        Stored unchanged under the ``'system'`` key of every entry.

    Returns
    -------
    list of dict
        Each entry has the keys ``'system'``, ``'game'``, ``'song'`` and
        ``'link'``. ``'game'`` is ``None`` for songs that appear before the
        first game header row.

    Raises
    ------
    requests.HTTPError
        If the page answers with an error status.
    """
    # Local import so the cell does not depend on an extra top-level import.
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(system_link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    game_name = None
    for tr in soup.find_all('tr'):
        # A row carrying the 'gameheader' class starts a new game; the text of
        # its first cell is used as the game name for the rows that follow.
        if 'gameheader' in (tr.get('class') or []):
            header_cell = tr.find('td')
            if header_cell is not None:
                game_name = header_cell.text

        for td in tr.find_all('td'):
            # Only cells whose very first child is an anchor are considered.
            first_child = next(iter(td.children), None)
            if getattr(first_child, 'name', None) != 'a':
                continue
            # Anchors without an href (or pointing elsewhere) are skipped
            # instead of raising KeyError.
            href = first_child.get('href')
            if not href or '.mid' not in href:
                continue
            links.append({
                'system': system_name,
                'game': game_name,
                'song': first_child.text,
                'link': urljoin(system_link, href),
            })

    return links

In [79]:
def get_all_midi_links():
    """Gather the MIDI entries of every VGMusic system into one flat list.

    The system links come from ``get_systems_links``; each system page is then
    scraped with ``get_midi_links`` while a tqdm progress bar tracks the
    systems processed so far.

    Returns
    -------
    list
        All entries returned by ``get_midi_links``, in system order.
    """
    systems = get_systems_links()
    progress = tqdm(systems.items(), 'scrapping systems')
    return [
        entry
        for name, url in progress
        for entry in get_midi_links(url, name)
    ]

In [81]:
def download_midi(entry, root='../../data/vgmusic'):
    """Download one VGMusic MIDI file to ``root/<system>/<game>/<song>.mid``.

    Parameters
    ----------
    entry : dict
        Mapping with the keys ``'system'``, ``'game'``, ``'song'`` and
        ``'link'``, as produced by ``get_midi_links``.
    root : str or Path, optional
        Directory under which the files are stored. The default is the
        location this notebook has always written to.

    Notes
    -----
    * A file that already exists is left alone, so the download can be resumed.
    * An HTTP error for a single file is reported and skipped so that one bad
      link does not abort a whole batch; any other failure is re-raised.
    """
    def clean(name):
        # A path separator inside a title would otherwise create extra
        # directory levels.
        return name.replace('/', ' ').replace('\\', ' ')

    # get_midi_links leaves 'game' as None for songs listed before the first
    # game header; give those a placeholder directory instead of failing.
    game = entry['game'] if entry['game'] is not None else 'unknown'
    path = Path(root) / clean(entry['system']) / clean(game) / (clean(entry['song']) + '.mid')
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        return

    try:
        urlretrieve(entry['link'], str(path))
    except Exception as error:
        # Drop a truncated file so the exists() check above does not treat it
        # as a finished download on the next run.
        if path.exists():
            path.unlink()
        if not isinstance(error, HTTPError):
            raise
        print('error:', entry['link'], str(error))

In [None]:
# Crawl every VGMusic system page and keep the resulting list of MIDI entries.
# The recorded run below went through 56 systems in about 45 seconds.
links = get_all_midi_links()

scrapping systems:   0%|          | 0/56 [00:00<?, ?it/s]

scrapping systems: 100%|██████████| 56/56 [00:45<00:00,  1.24it/s]


In [None]:
# Fetch every collected VGMusic entry concurrently. The work is network-bound,
# so threads are used instead of joblib's default worker processes, and the
# worker count is kept moderate to avoid opening hundreds of simultaneous
# connections to the site. The trailing semicolon hides the list of `None`
# results that the call returns.
Parallel(n_jobs=32, prefer='threads')(delayed(download_midi)(link) for link in links);

## khinsider.com

In [60]:
def get_links(parent_link):
    """Return the links found in the first column of a khinsider listing table.

    The page is expected to hold a ``div`` with class ``content-page`` that
    contains a table; for every row whose first cell starts with an anchor,
    the anchor text is mapped to its target.

    Parameters
    ----------
    parent_link : str
        URL of the listing page. Relative hrefs are resolved against it.

    Returns
    -------
    dict
        Maps anchor text to absolute URL. Because the text is the key, two rows
        with the same text collapse into a single entry (the later one wins).
        An empty dict is returned when the page has no listing table.

    Raises
    ------
    requests.HTTPError
        If the page answers with an error status.
    """
    # Local import so the cell does not depend on an extra top-level import.
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(parent_link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    content_page = soup.find('div', class_='content-page')
    table = content_page.find('table') if content_page is not None else None
    if table is None:
        return {}

    links = {}
    for row in table.find_all('tr'):
        # Rows without any <td> (for example header rows) have nothing to offer.
        first_cell = row.find('td')
        if first_cell is None:
            continue
        # Only cells whose very first child is an anchor are considered.
        first_child = next(iter(first_cell.children), None)
        if getattr(first_child, 'name', None) != 'a':
            continue
        href = first_child.get('href')
        if href:
            links[first_child.text] = urljoin(parent_link, href)
    return links

def get_systems_links():
    """Return the links listed on the khinsider MIDI index page.

    NOTE(review): this reuses the name of the VGMusic function defined earlier
    in the notebook, so running this cell rebinds ``get_systems_links``.
    """
    index_url = 'https://www.khinsider.com/midi'
    return get_links(index_url)

def get_games_links(system_link):
    """Return the links listed on the page at ``system_link``.

    Thin wrapper around ``get_links``; the result is passed through unchanged.
    """
    games = get_links(system_link)
    return games

def get_songs_links(game_link):
    """Return the links listed on the page at ``game_link``.

    Thin wrapper around ``get_links``; the result is passed through unchanged.
    """
    songs = get_links(game_link)
    return songs

def get_all_songs_links():
    """Walk systems, then games, then songs, and flatten them into one list.

    A tqdm progress bar tracks the systems; every game page of every system is
    visited in turn.

    Returns
    -------
    list of dict
        One entry per song with the keys ``'system'``, ``'game'``, ``'song'``
        and ``'link'``.
    """
    systems = get_systems_links()
    return [
        {
            'system': system,
            'game': game,
            'song': song,
            'link': song_url,
        }
        for system, system_url in tqdm(systems.items())
        for game, game_url in get_games_links(system_url).items()
        for song, song_url in get_songs_links(game_url).items()
    ]

In [74]:
def download_midi(entry, root='../../data/khinsider'):
    """Download one khinsider MIDI file to ``root/<system>/<game>/<song>.mid``.

    Parameters
    ----------
    entry : dict
        Mapping with the keys ``'system'``, ``'game'``, ``'song'`` and
        ``'link'``, as produced by ``get_all_songs_links``.
    root : str or Path, optional
        Directory under which the files are stored. The default is the
        location this notebook has always written to.

    Notes
    -----
    * A file that already exists is left alone, so the download can be resumed.
    * Downloading is best-effort: any failure is printed together with the
      offending link and the function returns normally.

    NOTE(review): this reuses the name of the VGMusic downloader defined
    earlier in the notebook, so running this cell rebinds ``download_midi``.
    """
    def clean(name):
        # A path separator inside a title would otherwise create extra
        # directory levels.
        return name.replace('/', ' ').replace('\\', ' ')

    path = Path(root) / clean(entry['system']) / clean(entry['game']) / (clean(entry['song']) + '.mid')
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        return

    try:
        urlretrieve(entry['link'], str(path))
    except Exception as e:
        # Drop a truncated file so the exists() check above does not treat it
        # as a finished download on the next run.
        if path.exists():
            path.unlink()
        print('error:', entry['link'], str(e))

In [61]:
# Crawl khinsider (systems, then games, then songs) and keep the list of entries.
# The recorded run below needed roughly 32 minutes for 39 systems.
songs_links = get_all_songs_links()

100%|██████████| 39/39 [31:53<00:00, 49.07s/it] 


In [None]:
# Fetch every collected khinsider entry concurrently. The work is
# network-bound, so threads are used instead of joblib's default worker
# processes, and the worker count is kept moderate to avoid opening hundreds of
# simultaneous connections to the site. The trailing semicolon hides the list
# of `None` results that the call returns.
Parallel(n_jobs=32, prefer='threads')(delayed(download_midi)(link) for link in songs_links);

In [63]:
# Number of song entries collected by the khinsider crawl.
len(songs_links)

19998

In [51]:
def get_songs_links(system_link):
    """Return the links listed on the page at ``system_link``.

    This cell used to carry its own copy of the table-scraping code that
    ``get_links`` already implements; it now delegates to that helper so the
    parsing logic lives in one place.

    NOTE(review): ``get_songs_links`` is also defined in the khinsider helper
    cell earlier in the notebook; this later definition rebinds the name.
    """
    return get_links(system_link)

In [52]:
# Spot-check: list the song links found on a single game page (Pitfall 2).
get_songs_links('https://www.khinsider.com/midi/2600/pitfall-2')

{'damage theme v1 1': 'https://files.khinsider.com/midifiles/2600/pitfall-2/damage-theme-v1-1-.mid',
 'field': 'https://files.khinsider.com/midifiles/2600/pitfall-2/field.mid',
 'theme': 'https://files.khinsider.com/midifiles/2600/pitfall-2/theme.mid',
 'theme 2  v1 1': 'https://files.khinsider.com/midifiles/2600/pitfall-2/theme-2-v1-1-.mid',
 'theme 3': 'https://files.khinsider.com/midifiles/2600/pitfall-2/theme-3-.mid'}

In [49]:
# Spot-check: list the games of the first system on the khinsider index.
# The system links are fetched here because no visible cell assigns
# `systems_links`, so the bare call failed with NameError on a fresh kernel.
systems_links = get_systems_links()
get_games_links(list(systems_links.values())[0])

{'Asteroids': 'https://www.khinsider.com/midi/2600/asteroids',
 'Cookie Monster Munch': 'https://www.khinsider.com/midi/2600/cookie-monster-munch',
 'Defender 2': 'https://www.khinsider.com/midi/2600/defender-2',
 'Double Dunk': 'https://www.khinsider.com/midi/2600/double-dunk',
 'E.T. - The Extra Terrestrial': 'https://www.khinsider.com/midi/2600/e.t.-the-extra-terrestrial',
 'Frogger': 'https://www.khinsider.com/midi/2600/frogger',
 'Frogger II': 'https://www.khinsider.com/midi/2600/frogger-ii',
 'Kool-Aid Man': 'https://www.khinsider.com/midi/2600/kool-aid-man',
 'Kung Fu Master': 'https://www.khinsider.com/midi/2600/kung-fu-master',
 'M*A*S*H*': 'https://www.khinsider.com/midi/2600/m-a-s-h-',
 'PacMan': 'https://www.khinsider.com/midi/2600/pacman',
 'Pitfall': 'https://www.khinsider.com/midi/2600/pitfall',
 'Pitfall 2': 'https://www.khinsider.com/midi/2600/pitfall-2',
 'Pressure Cooker': 'https://www.khinsider.com/midi/2600/pressure-cooker',
 'Secret Quest': 'https://www.khinsider.

In [44]:
# NOTE(review): scratch cell. It rebinds `links`, the name the VGMusic cells
# above use for the list of MIDI entries passed to the parallel download, to
# the dict returned by get_systems_links(). Rename or remove it if kept.
links = get_systems_links()