In [1]:
import multiprocessing
from PIL import Image
import pandas as pd
import numpy as np
import requests
import hashlib
import string
import json
import sys
import os
import io

In [2]:
root = os.path.join('..','..','..')
data_path = os.path.join(root,'data')
temp_path = os.path.join(data_path,'temp')

# Utility

In [3]:
def to_lower(text):
    return text.lower()

## All the data that we are collecting will be in reference to <a href='http://millionsongdataset.com/'>Million Song Dataset</a>.

# Scrapping Album Art

Source: <a href='https://zenodo.org/record/1240485#.Xvx5DXUzais'>MSD-I: Million Song Dataset with Images for Multimodal Genre Classification</a>

In [13]:
# !wget https://zenodo.org/record/1240485/files/MSD-I_dataset.tsv
# !mv 'MSD-I_dataset.tsv' '../../../Temporary/MSD-I_dataset.tsv'

In [4]:
art_data = pd.read_csv(f'{temp_path}/MSD-I_dataset.tsv',sep='\t')
art_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30713 entries, 0 to 30712
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   msd_track_id   30713 non-null  object
 1   genre          30713 non-null  object
 2   album_index    30713 non-null  int64 
 3   set            30713 non-null  object
 4   msd_artist_id  30713 non-null  object
 5   image_url      30713 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.4+ MB


In [5]:
album_image = art_data.groupby(['album_index']).head(1)
album_image.to_csv(f'{temp_path}album_image.csv')

In [8]:
album_image = pd.read_csv(f'{temp_path}/album_image.csv')
album_image = album_image.loc[:,['genre','image_url']].to_numpy()
album_image = album_image.tolist()

In [9]:
def scrapArt(album):
    try:
      r = requests.get(album[1])
      if r.status_code != 200:
          pass
      img = io.BytesIO(r.content)
      img = Image.open(img).convert('RGB')
      if not os.path.isdir(f'images/{album[0]}'):
          os.mkdir(f'images/{album[0]}')
      with open(f'images/{album[0]}/{hashlib.sha1(r.content).hexdigest()[:10]}.jpg','wb') as f:
          img.save(f,'JPEG',quality=100)
    except:
      pass


In [10]:
pool = multiprocessing.Pool() 
pool = multiprocessing.Pool(processes=os.cpu_count())
outputs = pool.map(scrapArt, album_image)

In [11]:
!zip -r albumart.zip images

# LMD MIDI Dataset

## Getting the MIDI Dataset which has been matched with MSD Dataset

In [12]:
# !wget http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz
# !mv 'lmd_matched.tar.gz' path+'lmd_matched.tar.gz'
# !tar -xf path+'lmd_matched.tar.gz'

In [13]:
path_midi = f'{temp_path}/lmd_matched'
path_midi = os.path.normpath(path_midi)
midi = []
for root,dirs,files in os.walk(path_midi, topdown=True):
    depth = root[len(path_midi) + len(os.path.sep):].count(os.path.sep)
    if depth == 2:
        # We're currently two directories in, so all subdirs have depth 3
        midi += [os.path.join(root, f) for f in dirs]
        dirs[:] = [] # Don't recurse any deeper
midi = [ids.split('/')[-1] for ids in midi]

In [21]:
midi = pd.DataFrame(midi,columns=['msd_track_id'])
midi.to_csv(f'{temp_path}/midi.csv')

Unnamed: 0,msd_track_id
0,TRJJJCW128F42218C7
1,TRJJJAM128F425C444
2,TRJJNTK128F4297ADE
3,TRJJREY128F93123DD
4,TRJJRTB128F92D0791
...,...
31029,TRBBYDT128F14805B4
31030,TRBBYVD128F931AE0C
31031,TRBBSFX128F426D4A1
31032,TRBBSDR12903CAE89C


# Last.FM Data : Train & Test

In [16]:
# !wget http://millionsongdataset.com/sites/default/files/lastfm/lastfm_train.zip
# !mv 'lastfm_train.zip' '../../../Temporary/lastfm_train.zip'
# !unzip '../../../Temporary/lastfm_train.zip'

In [17]:
# !wget http://millionsongdataset.com/sites/default/files/lastfm/lastfm_test.zip
# !mv 'lastfm_test.zip' '../../../Temporary/lastfm_test.zip'
# !unzip '../../../Temporary/lastfm_test.zip'

In [18]:
# !wget http://millionsongdataset.com/sites/default/files/lastfm/tracks_with_tag.txt
# !wget http://millionsongdataset.com/sites/default/files/lastfm/lastfm_unique_tags.txt
# !mv 'tracks_with_tag.txt' '../../../Temporary/tracks_with_tag.txt'
# !mv 'lastfm_unique_tags.txt' '../../../Temporary/lastfm_unique_tags.txt'

In [19]:
lfm_path = temp_path+'/lastfm_train'
lfm_path = os.path.normpath(lfm_path)
midi_path = {}
for root,dirs,files in os.walk(lfm_path, topdown=True):
    depth = root[len(lfm_path) + len(os.path.sep):].count(os.path.sep)
    if depth == 2:
        # We're currently two directories in, so all subdirs have depth 3
        midi_path.update({f[:-5]:os.path.join(root, f) for f in files})
        dirs[:] = [] # Don't recurse any deeper

In [20]:
lfm_path = temp_path+'/lastfm_test'
lfm_path = os.path.normpath(lfm_path)
for root,dirs,files in os.walk(lfm_path, topdown=True):
    depth = root[len(lfm_path) + len(os.path.sep):].count(os.path.sep)
    if depth == 2:
        # We're currently two directories in, so all subdirs have depth 3
        midi_path.update({f[:-5]:os.path.join(root, f) for f in files})
        dirs[:] = [] # Don't recurse any deeper

In [22]:
midi_path_df = pd.DataFrame(midi_path.items(),columns=['msd_track_id','path'])
midi_path_df

Unnamed: 0,msd_track_id,path
0,TRJJJIK128F4230D6F,../../../data/temp/lastfm_train/J/J/J/TRJJJIK1...
1,TRJJJYF12903CBB80B,../../../data/temp/lastfm_train/J/J/J/TRJJJYF1...
2,TRJJJGI128F9320F8B,../../../data/temp/lastfm_train/J/J/J/TRJJJGI1...
3,TRJJJWA128F92FE4E4,../../../data/temp/lastfm_train/J/J/J/TRJJJWA1...
4,TRJJJVY128F9307F73,../../../data/temp/lastfm_train/J/J/J/TRJJJVY1...
...,...,...
943329,TRBBVYI128F4292587,../../../data/temp/lastfm_test/B/B/V/TRBBVYI12...
943330,TRBBBTE12903CDC67D,../../../data/temp/lastfm_test/B/B/B/TRBBBTE12...
943331,TRBBBPQ128F42A768C,../../../data/temp/lastfm_test/B/B/B/TRBBBPQ12...
943332,TRBBBNA12903CB336B,../../../data/temp/lastfm_test/B/B/B/TRBBBNA12...


# MIDI : Last.FM : Genre

In [23]:
midi['key'] = midi['msd_track_id']
midi

Unnamed: 0,msd_track_id,key
0,TRJJJCW128F42218C7,TRJJJCW128F42218C7
1,TRJJJAM128F425C444,TRJJJAM128F425C444
2,TRJJNTK128F4297ADE,TRJJNTK128F4297ADE
3,TRJJREY128F93123DD,TRJJREY128F93123DD
4,TRJJRTB128F92D0791,TRJJRTB128F92D0791
...,...,...
31029,TRBBYDT128F14805B4,TRBBYDT128F14805B4
31030,TRBBYVD128F931AE0C,TRBBYVD128F931AE0C
31031,TRBBSFX128F426D4A1,TRBBSFX128F426D4A1
31032,TRBBSDR12903CAE89C,TRBBSDR12903CAE89C


In [110]:
tracked = midi.set_index('key').join(midi_path_df.set_index('msd_track_id'),how='inner')
tracked

Unnamed: 0,msd_track_id,path
TRJJJCW128F42218C7,TRJJJCW128F42218C7,../../../Temporary/lastfm_train/J/J/J/TRJJJCW1...
TRJJJAM128F425C444,TRJJJAM128F425C444,../../../Temporary/lastfm_train/J/J/J/TRJJJAM1...
TRJJNTK128F4297ADE,TRJJNTK128F4297ADE,../../../Temporary/lastfm_train/J/J/N/TRJJNTK1...
TRJJRTB128F92D0791,TRJJRTB128F92D0791,../../../Temporary/lastfm_train/J/J/R/TRJJRTB1...
TRJJHZT128F426E985,TRJJHZT128F426E985,../../../Temporary/lastfm_train/J/J/H/TRJJHZT1...
...,...,...
TRBBIHP128F92E40CA,TRBBIHP128F92E40CA,../../../Temporary/lastfm_train/B/B/I/TRBBIHP1...
TRBBYDT128F14805B4,TRBBYDT128F14805B4,../../../Temporary/lastfm_train/B/B/Y/TRBBYDT1...
TRBBYVD128F931AE0C,TRBBYVD128F931AE0C,../../../Temporary/lastfm_train/B/B/Y/TRBBYVD1...
TRBBSFX128F426D4A1,TRBBSFX128F426D4A1,../../../Temporary/lastfm_train/B/B/S/TRBBSFX1...


In [60]:
genres = ['Blues', 'Country', 'Electronic', 'Folk', 'Jazz', 'Latin', 'Metal',
       'New Age', 'Pop', 'Punk', 'Rap', 'Reggae', 'RnB', 'Rock', 'World']
genres = [genre.lower() for genre in genres]

In [79]:
genre_tagged = pd.DataFrame(columns=['track_id','title','artist','genre'])
idx = 0
for id_ in tracked['msd_track_id'].tolist():
    try:
        with open(midi_path[id_]) as file:
          sampleJSON = json.load(file)
        for tag in sampleJSON['tags'][0]:
          if tag.lower() in genres:
            if idx%1000 == 0:
              print(f'{idx}:{tag}')
            genre_tagged.loc[idx] = [sampleJSON['track_id'],sampleJSON['title'],sampleJSON['artist'],tag.lower()]
            idx += 1
            break
    except:
        pass

0:rock
1000:country
2000:rock
3000:pop
4000:country


In [113]:
genre_tagged.groupby('genre').agg({'track_id':'count'})

Unnamed: 0_level_0,track_id
genre,Unnamed: 1_level_1
blues,52
country,732
electronic,214
folk,103
jazz,171
latin,102
metal,56
new age,60
pop,1384
punk,56


In [96]:
album_image = pd.read_csv(f'{path}album_image.csv')
album_image = album_image.drop(['Unnamed: 0'],axis=1)
album_image['genre'] = album_image['genre'].apply(to_lower)


In [98]:
album_image['key'] = album_image['genre']

In [115]:
genre_tagged.to_csv(f'{path}MIDI_with_Genre.csv')

In [116]:
overall_data = album_image.loc[:,['key','genre','image_url',]].set_index('key').join(genre_tagged.set_index('genre'))

In [118]:
overall_data.to_csv(f'{path}Overall_data.csv')

In [119]:
overall_data

Unnamed: 0,genre,image_url,track_id,title,artist
blues,blues,http://artwork-cdn.7static.com/static/img/slee...,TRJSRUH128F92E5A6F,Little Wing,Stevie Ray Vaughan
blues,blues,http://artwork-cdn.7static.com/static/img/slee...,TRNQXYY12903D02EB7,Cold Shot,The Johnny Otis Show
blues,blues,http://artwork-cdn.7static.com/static/img/slee...,TRNZCKD128EF34942B,What A Wonderful World,B.B. King
blues,blues,http://artwork-cdn.7static.com/static/img/slee...,TRNINPZ128F9320188,Going Back to Miami,The Blues Brothers
blues,blues,http://artwork-cdn.7static.com/static/img/slee...,TRRGBTU128E078B046,Moondance (Album Version),Van Morrison
...,...,...,...,...,...
world,world,http://artwork-cdn.7static.com/static/img/slee...,TRJGIAM128F425529B,Tonti,Vusi Mahlasela
world,world,http://artwork-cdn.7static.com/static/img/slee...,TRJIWUT128F4252268,Chelsea Burns,Keren Ann
world,world,http://artwork-cdn.7static.com/static/img/slee...,TRMEBGS128F42B7C5A,Dabah,Ismaël Lô
world,world,http://artwork-cdn.7static.com/static/img/slee...,TRQURFG128F4295001,Dolce Di Giorno,Lucio Battisti
