In [152]:
import nltk
import os
import pandas as pd

from zipfile import ZipFile
import re
import pickle

## Read data and do some pre-processing

In [153]:
# Read data: parse every CSV member of the Discogs archive into its own
# DataFrame, keyed by the member's filename.
# Both the archive and each member are opened in `with` blocks so their file
# handles are released as soon as parsing is done.
dfs = {}
with ZipFile('data/discogs/discogs_larger.zip') as zip_file:
    for text_file in zip_file.infolist():
        if text_file.filename.endswith('.csv'):
            with zip_file.open(text_file.filename) as csv_member:
                dfs[text_file.filename] = pd.read_csv(csv_member)

# pd.read_csv already returns DataFrames, so they are collected as they are.
discog_list = list(dfs.values())

# Stack the per-file frames into one table (row labels are kept as read).
df = pd.concat(discog_list)

# Fix column formatting: pass every value of these columns through str() so
# they can be handled as plain text.
for text_column in ('release_descriptions', 'styles', 'genre', 'labels'):
    df[text_column] = df[text_column].apply(str)

# Drop the 'Unnamed: 0' column that comes in with the CSV files.
del df['Unnamed: 0']


In [154]:
# Create boolean columns for mixed and electronic albums

# True for rows whose release description text contains 'Mixed'.
bool_mixed = ['Mixed' in description for description in df.release_descriptions]
df['bool_R_mixed'] = bool_mixed

# True for rows whose genre text contains 'Electronic'.
bool_electronic = ['Electronic' in genre_text for genre_text in df.genre]
df['bool_R_electronic'] = bool_electronic

# Extract styles for exploratory analysis

In [155]:
from collections import Counter

# Each styles value is text; square brackets and single quotes are stripped
# and the remainder is split on ', ' to obtain the individual style names.
# Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
list_punctuation = re.compile(r"\[|\]|'")

style_db = []
for styles_text in df.styles:
    style_db.extend(list_punctuation.sub('', styles_text).split(', '))

# Tally how often each style name occurs across all rows.
counts = Counter(style_db)
print(counts)

Counter({'Techno': 78313, 'House': 77893, 'Tech House': 51076, 'Deep House': 37197, 'Electro': 27371, 'Minimal': 23848, 'Experimental': 20198, 'Ambient': 19388, 'Downtempo': 19023, 'Progressive House': 18720, 'Trance': 17537, 'Disco': 13523, 'IDM': 11653, 'Progressive Trance': 9043, 'Abstract': 8826, 'Hardcore': 8698, 'Breaks': 7434, 'Breakbeat': 7204, 'Acid': 7081, 'Gabber': 6840, 'Synth-pop': 6655, 'Leftfield': 6444, 'Dubstep': 6383, 'Drum n Bass': 6262, 'Hard Trance': 5686, 'Hip Hop': 5081, 'Garage House': 4870, 'Trip Hop': 4405, 'Future Jazz': 4190, 'Industrial': 4094, 'Hard House': 3626, 'Electro House': 3561, 'Dub Techno': 3462, 'nan': 3333, 'Indie Rock': 3043, 'Happy Hardcore': 2986, 'Acid House': 2921, 'Dub': 2912, 'Euro House': 2747, 'New Wave': 2493, 'UK Garage': 2295, 'Funk': 2276, 'Vaporwave': 2256, 'Tribal House': 1994, 'Contemporary': 1968, 'Grime': 1934, 'Soul': 1922, 'Psy-Trance': 1912, 'Glitch': 1892, 'Instrumental': 1889, 'Broken Beat': 1880, 'Modern Classical': 1858,

In [156]:
# Remove styles that aren't useful for DJing
# Raw string so that '\&' reaches the regex engine unchanged instead of being
# read as an (invalid) Python string escape. The pattern is compiled once and
# searched case-sensitively anywhere inside each style name.
unwanted_style = re.compile(r'Rock|Metal|Parody|Folk|Krautrock|Score|J-pop|Pop|Instrumental|Lounge|Free Jazz|nan|Free Improvisation|Rhythm \& Blues|Modern Classical|Punk|Interview|Thrash|Shoegaze|Education|Spoken Word|Acoustic|Neofolk|Field Recording')

# Collect the matching names first; counts must not change size while it is
# being iterated.
delete_these = [style for style in counts if unwanted_style.search(style) is not None]

In [157]:
# Drop every collected style name from the tally.
for unwanted_name in delete_these:
    counts.pop(unwanted_name)

In [158]:
# Display the (up to) 100 most frequent styles left in the tally.
counts.most_common(100)

[('Techno', 78313),
 ('House', 77893),
 ('Tech House', 51076),
 ('Deep House', 37197),
 ('Electro', 27371),
 ('Minimal', 23848),
 ('Experimental', 20198),
 ('Ambient', 19388),
 ('Downtempo', 19023),
 ('Progressive House', 18720),
 ('Trance', 17537),
 ('Disco', 13523),
 ('IDM', 11653),
 ('Progressive Trance', 9043),
 ('Abstract', 8826),
 ('Hardcore', 8698),
 ('Breaks', 7434),
 ('Breakbeat', 7204),
 ('Acid', 7081),
 ('Gabber', 6840),
 ('Synth-pop', 6655),
 ('Leftfield', 6444),
 ('Dubstep', 6383),
 ('Drum n Bass', 6262),
 ('Hard Trance', 5686),
 ('Hip Hop', 5081),
 ('Garage House', 4870),
 ('Trip Hop', 4405),
 ('Future Jazz', 4190),
 ('Industrial', 4094),
 ('Hard House', 3626),
 ('Electro House', 3561),
 ('Dub Techno', 3462),
 ('Happy Hardcore', 2986),
 ('Acid House', 2921),
 ('Dub', 2912),
 ('Euro House', 2747),
 ('New Wave', 2493),
 ('UK Garage', 2295),
 ('Funk', 2276),
 ('Vaporwave', 2256),
 ('Tribal House', 1994),
 ('Contemporary', 1968),
 ('Grime', 1934),
 ('Soul', 1922),
 ('Psy-Tran

As we can see, there are far too many genres/styles at the moment. We will filter the data to include only albums whose genre contains "Electronic".

In [159]:
#filter for only electronic music
# Renumber the rows 0..n-1 before filtering.
df = df.reset_index(drop=True)
# Keep only the rows flagged as Electronic. Boolean indexing already returns
# a DataFrame, so no extra pd.DataFrame(...) wrapping is required.
df = df.loc[df['bool_R_electronic']]

In [160]:
from collections import Counter

# Rebuild the style tally from the current contents of df.
# Square brackets and single quotes are stripped from each styles value and
# the remainder is split on ', ' to obtain the individual style names.
# Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
list_punctuation = re.compile(r"\[|\]|'")

style_db = []
for styles_text in df.styles:
    style_db.extend(list_punctuation.sub('', styles_text).split(', '))
counts = Counter(style_db)

# Remove styles that aren't useful for DJing
# Raw string so that '\&' reaches the regex engine unchanged; the search is
# case-sensitive and matches anywhere inside a style name.
unwanted_style = re.compile(r'Rock|Metal|Parody|Folk|Krautrock|Score|J-pop|Pop|Instrumental|Lounge|Free Jazz|nan|Free Improvisation|Rhythm \& Blues|Modern Classical|Punk|Interview|Thrash|Shoegaze|Education|Spoken Word|Acoustic|Neofolk|Field Recording')

# Collect first, then delete: counts must not change size during iteration.
delete_these = [style for style in counts if unwanted_style.search(style) is not None]
for style in delete_these:
    counts.pop(style)

# Display the (up to) 100 most frequent remaining styles.
counts.most_common(100)

[('Techno', 78300),
 ('House', 77893),
 ('Tech House', 51076),
 ('Deep House', 37197),
 ('Electro', 27363),
 ('Minimal', 23814),
 ('Experimental', 19846),
 ('Ambient', 19380),
 ('Downtempo', 18958),
 ('Progressive House', 18720),
 ('Trance', 17537),
 ('Disco', 13022),
 ('IDM', 11653),
 ('Progressive Trance', 9043),
 ('Abstract', 8810),
 ('Hardcore', 8440),
 ('Breaks', 7384),
 ('Breakbeat', 7199),
 ('Acid', 7081),
 ('Gabber', 6840),
 ('Synth-pop', 6655),
 ('Leftfield', 6432),
 ('Dubstep', 6383),
 ('Drum n Bass', 6262),
 ('Hard Trance', 5686),
 ('Hip Hop', 5060),
 ('Garage House', 4870),
 ('Trip Hop', 4382),
 ('Future Jazz', 4186),
 ('Industrial', 4076),
 ('Hard House', 3626),
 ('Electro House', 3561),
 ('Dub Techno', 3449),
 ('Happy Hardcore', 2986),
 ('Acid House', 2921),
 ('Euro House', 2747),
 ('Dub', 2744),
 ('UK Garage', 2295),
 ('Vaporwave', 2256),
 ('New Wave', 2118),
 ('Tribal House', 1994),
 ('Grime', 1934),
 ('Funk', 1913),
 ('Psy-Trance', 1912),
 ('Glitch', 1892),
 ('Broken B

## Create booleans for genres, separating mixed albums from regular releases

In [161]:
def search_release_genres(genre_extract, genre_search_release):
    """Flag which search terms occur in one release's style text.

    Parameters
    ----------
    genre_extract : object
        Style information for a single row; it is converted with ``str()``
        before being searched.
    genre_search_release : iterable of str
        Terms to look for. Each term is used as a regular-expression pattern
        with ``re.search``, so it matches anywhere in the text (for example
        'House' also matches inside 'Tech House').

    Returns
    -------
    dict
        Maps ``'bool_R_<term>'`` (spaces in the term replaced by underscores)
        to True when the term was found and False otherwise.
    """
    style_text = str(genre_extract)
    # Iterate over the caller-supplied terms rather than a notebook-level
    # global, so the function also works on a freshly started kernel.
    return {
        'bool_R_%s' % term.replace(" ", "_"):
            re.search(pattern=term, string=style_text) is not None
        for term in genre_search_release
    }

In [162]:
# Terms searched for in each row's styles text; order is preserved as given.
genre_search_release = [
    'Techno',
    'House',
    'Tech House',
    'Trance',
    'Disco',
    'Psy-Trance',
    'Electro',
    'IDM',
    'Drum n Bass',
    'Garage',
    'Hard',
    'Progressive',
    'Acid',
    'Deep',
    'Minimal',
    'Dub',
    'Industrial',
    'Hip Hop',
    'Funk',
    'Leftfield',
    'Abstract',
    'Experimental',
]

In [163]:
# Renumber the rows 0..n-1 so that row labels and positions coincide.
df = df.reset_index(drop=True)

# One dict of bool_R_* flags per row, produced in row order.
compiled_genres = [
    search_release_genres(row_styles, genre_search_release)
    for row_styles in df['styles']
]

In [164]:
# Convert the collected flag dicts into a DataFrame.
compiled_genres = pd.DataFrame(data=compiled_genres)

In [165]:
# Attach the flag columns to the right of the existing columns. Both frames
# get a fresh 0..n-1 index first so the column-wise concat lines rows up.
df = pd.concat(
    objs=[df.reset_index(drop=True), compiled_genres.reset_index(drop=True)],
    axis=1,
)

In [166]:
# NOTE(review): everything in this cell is commented out and nothing runs.
# The disabled loop would add one bool_R_<style> column per entry of
# counts.most_common(100), using a plain substring test against df.styles.
# Consider deleting the cell if it is no longer needed.
#for i,j in counts.most_common(100):
#    h = re.sub(pattern=' ', repl='_', string=i)
#    x=('bool_R_%s' % h) 
#    y=[]
#    for j in df.styles:
#        y.append(i in j)
#    print('Applied style %s' % i )
#    df[x] = y

In [174]:
# Split mixed releases from regular ones
# Rows flagged as mixed go to df_mixed, all others to df_releases; each
# subset is renumbered from 0.
df_mixed = df.loc[df['bool_R_mixed']].reset_index(drop=True)
df_releases = df.loc[~df['bool_R_mixed']].reset_index(drop=True)

In [175]:
# Preview the first 25 rows of the non-mixed releases.
df_releases.head(n=25)

Unnamed: 0,artist,cat_no,country,genre,label_root,labels,length,position,release,release_artist,...,bool_R_House,bool_R_IDM,bool_R_Industrial,bool_R_Leftfield,bool_R_Minimal,bool_R_Progressive,bool_R_Psy-Trance,bool_R_Tech_House,bool_R_Techno,bool_R_Trance
0,Unknown Artist,303,UK,['Electronic'],Planet Rhythm Records,['Planet Rhythm Records'],,A1,303 EP,Unknown Artist,...,False,False,False,False,False,False,False,False,True,False
1,Unknown Artist,303,UK,['Electronic'],Planet Rhythm Records,['Planet Rhythm Records'],,A2,303 EP,Unknown Artist,...,False,False,False,False,False,False,False,False,True,False
2,Unknown Artist,303,UK,['Electronic'],Planet Rhythm Records,['Planet Rhythm Records'],,B1,303 EP,Unknown Artist,...,False,False,False,False,False,False,False,False,True,False
3,Unknown Artist,303,UK,['Electronic'],Planet Rhythm Records,['Planet Rhythm Records'],,B2,303 EP,Unknown Artist,...,False,False,False,False,False,False,False,False,True,False
4,Ausgang,BR001LP,UK,['Electronic'],Planet Rhythm Records,['Blue Room Released'],7:13,A1,Blue Room Released Vol: 1 - Outside The Reactor,Various,...,False,False,False,False,False,False,False,False,False,True
5,Vector,BR001LP,UK,['Electronic'],Planet Rhythm Records,['Blue Room Released'],5:35,A2,Blue Room Released Vol: 1 - Outside The Reactor,Various,...,False,False,False,False,False,False,False,False,False,True
6,The Fatima Mansions,BR001LP,UK,['Electronic'],Planet Rhythm Records,['Blue Room Released'],7:23,B1,Blue Room Released Vol: 1 - Outside The Reactor,Various,...,False,False,False,False,False,False,False,False,False,True
7,Spectral,BR001LP,UK,['Electronic'],Planet Rhythm Records,['Blue Room Released'],6:58,B2,Blue Room Released Vol: 1 - Outside The Reactor,Various,...,False,False,False,False,False,False,False,False,False,True
8,Virtuality (2),BR001LP,UK,['Electronic'],Planet Rhythm Records,['Blue Room Released'],6:11,C1,Blue Room Released Vol: 1 - Outside The Reactor,Various,...,False,False,False,False,False,False,False,False,False,True
9,Har-El,BR001LP,UK,['Electronic'],Planet Rhythm Records,['Blue Room Released'],6:36,C2,Blue Room Released Vol: 1 - Outside The Reactor,Various,...,False,False,False,False,False,False,False,False,False,True


In [176]:
# Preview the first 25 rows of the mixed releases.
df_mixed.head(n=25)

Unnamed: 0,artist,cat_no,country,genre,label_root,labels,length,position,release,release_artist,...,bool_R_House,bool_R_IDM,bool_R_Industrial,bool_R_Leftfield,bool_R_Minimal,bool_R_Progressive,bool_R_Psy-Trance,bool_R_Tech_House,bool_R_Techno,bool_R_Trance
0,Vector,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,1.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
1,Vector,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,2.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
2,Vector,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,3.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
3,Vector,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,4.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
4,Adam Beyer,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,5.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
5,Adam Beyer,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,6.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
6,Adam Beyer,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,7.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
7,R.N.D. Technologies,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,8.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
8,Thomas Krome,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,9.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False
9,Abstract Soul,EKCY-10001,Japan,['Electronic'],Planet Rhythm Records,"['EPS', 'Planet Rhythm Records']",,10.0,Planet Rhythm Label Collection Vol.001,Various,...,False,False,False,False,True,False,False,False,True,False


In [177]:
# Write both subsets as CSV files relative to the current working directory.
# to_csv runs with its defaults, so the row index is written to the file too.
for frame, csv_name in ((df_mixed, 'discogs_mixed.csv'),
                        (df_releases, 'discogs_releases.csv')):
    frame.to_csv(csv_name)