# Matching Compositions to Recordings

In [1]:
import pickle
import re

import numpy as np
import pandas as pd

### Bringing in Track + Composition Tables

In [2]:
track_list = pd.read_csv('../data/main_wfeats.csv', index_col=0)
comp_artists = pd.read_csv('../data/comp_artists.csv', index_col=0)
compositions = pd.read_csv('../data/compositions.csv', index_col=0)
artist_comp_lookup = pd.read_csv('../data/artist_comp_lookup.csv', index_col=0)

#### Combining Composition Tables

In [3]:
compositions.head()

Unnamed: 0,CID,AID,Title
0,0,360318916,FOR THA LOVE OF MONEY
1,1,530659306,WE THE PEOPLE
2,2,334030418,CELERY-TIME
3,3,442081954,NEUTRON BOMB
4,4,230055482,WILL THE CIRCLE BE UNBROKEN


In [4]:
artist_comp_lookup.head()

Unnamed: 0,CID,PID
0,0,0
1,1933,0
2,17624,0
3,17630,0
4,17633,0


In [5]:
comp_artists.head()

Unnamed: 0,Performer Name,PID
0,BONE,0
1,BONE THUGS N HARMONY,1
2,BONE THUGS N HARMONY FEAT. EAZY-E,2
3,BONE THUGS-N-HARMONY,3
4,BONE THUGS-N-HARMONY (EDITED),4


In [6]:
full_comps = pd.merge(artist_comp_lookup, compositions, on='CID')
full_comps = pd.merge(full_comps, comp_artists, on='PID')

In [7]:
full_comps.head()

Unnamed: 0,CID,PID,AID,Title,Performer Name
0,0,0,360318916,FOR THA LOVE OF MONEY,BONE
1,1933,0,350208616,ETERNAL,BONE
2,17624,0,340371584,DA INTRODUCTION,BONE
3,17630,0,350209419,EAST 1999,BONE
4,17633,0,350208465,EVERYDAY THANG,BONE


#### Examining Track Table

In [8]:
track_list.head()

Unnamed: 0,song_id,album_release_date,artist_id,artist_name,duration_ms,explicit,linked_album,song_title,danceability,energy,...,pv_dim_3,pv_dim_4,pv_dim_5,pv_dim_6,pv_dim_7,pv_dim_8,pv_dim_9,pv_dim_10,pv_dim_11,pv_dim_12
0,6SluaPiV04KOaRTOIScoff,1995-10-13,6UE7nl9mha6s8z0wFQFIZ2,Robyn,229226.0,False,Robyn Is Here,Show Me Love - Radio Version,0.546,0.643,...,0.231588,0.227392,0.365724,0.220462,0.367808,0.267055,0.344281,0.349016,0.323426,0.480299
1,5qEVq3ZEGr0Got441lueWS,2018-08-10,6S58b0fr8TkWrEHOH4tRVu,Switchfoot,247240.0,False,You Found Me (Unbroken: Path To Redemption),You Found Me (Unbroken: Path To Redemption),0.603,0.802,...,0.384941,0.397085,0.465443,0.237421,0.359981,0.209631,0.283483,0.188632,0.212271,0.49047
2,5kqIPrATaCc2LqxVWzQGbk,2016-04-01,25u4wHJWxCA9vO0CzxAbK7,Lukas Graham,237300.0,False,Lukas Graham,7 Years,0.765,0.473,...,0.341671,0.321183,0.195459,0.330539,0.175221,0.328568,0.153059,0.221073,0.444818,0.203276
3,3aVyHFxRkf8lSjhWdJ68AW,2013-01-01,0C0XlULifJtAgn6ZNCW2eu,The Killers,262000.0,False,Direct Hits,Just Another Girl,0.547,0.779,...,0.229995,0.264792,0.180531,0.281061,0.355194,0.189039,0.256742,0.193406,0.25314,0.308046
4,0zIyxS6QxZogHOpGkI6IZH,2018-09-07,0le01dl1WllSHhjEXRl4in,Tamia,236545.0,False,Passion Like Fire,Deeper,0.438,0.288,...,0.233717,0.128174,0.32137,0.20037,0.391387,0.132925,0.265942,0.537358,0.158429,0.26679


#### Standardizing Composition and Track Tables

In [9]:
full_comps['Title_n'] = full_comps['Title'].apply(lambda x: x.lower())
full_comps['Performer_n'] = full_comps['Performer Name'].apply(lambda x: str(x).lower())

track_list['artist_name_n'] = track_list['artist_name'].apply(lambda x: str(x).lower()).apply(lambda x: str(x).strip("''/*"))
track_list['song_title_n'] = track_list['song_title'].apply(lambda x: str(x).lower()).apply(lambda x: re.sub(r'(\(feat.*)','', x))

### Combining Tables

In [11]:
lol_test = pd.merge(track_list, full_comps, how='left', left_on=['artist_name_n', 'song_title_n'],
                    right_on=['Performer_n', 'Title_n'])

In [13]:
len(lol_test[lol_test['CID'].notnull()])

6509

Not bad for literally doing nothing but removing caps, and 'featuring' language. Let's see what other formatting issues I can minimize

In [14]:
full_comps[['Title_n', 'Performer_n']]

Unnamed: 0,Title_n,Performer_n
0,for tha love of money,bone
1,eternal,bone
2,da introduction,bone
3,east 1999,bone
4,everyday thang,bone
5,look into my eyes,bone
6,mo' murda,bone
7,tha crossroads,bone
8,thuggish ruggish bone,bone
9,1st of tha month,bone


In [18]:
track_list.sort_values('artist_name_n', axis=0, inplace=True)

In [19]:
track_list[['artist_name_n', 'song_title_n']]

Unnamed: 0,artist_name_n,song_title_n
12030,"""weird al"" yankovic","amish paradise (parody of ""gangsta's paradise""..."
5278,"""weird al"" yankovic",foil
4320,"""weird al"" yankovic","white & nerdy (parody of ""ridin'"" by chamillio..."
13864,"""weird al"" yankovic","the saga begins (lyrical adaption of ""american..."
4887,"""weird al"" yankovic","party in the cia (parody of ""party in the u.s...."
13772,"""weird al"" yankovic",tacky
10727,"""weird al"" yankovic",the hamilton polka
14337,"""weird al"" yankovic",handy
8393,"""weird al"" yankovic",word crimes
4994,"""weird al"" yankovic",captain underpants theme song
