# Matching Credits to Song Data

In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import re
from sqlalchemy import *
import time

In [2]:
with open('../data/genius_song_details_1.json', 'r') as f:
    genius_song_details_1 = json.load(f)

In [3]:
with open('../data/genius_song_details_2.json', 'r') as f:
    genius_song_details_2 = json.load(f)

### What Information do I Want to Model on?

For sure I need `writer_artists`, but beyond that, I don't know.

In [82]:
# genius_song_details_1[0]['response']['song']['writer_artists']                       #for songwriter info
# genius_song_details_1[0]['response']['song']['writer_artists'][0]['name']            #for writer name
# genius_song_details_1[0]['response']['song']['writer_artists'][0]['id']              #for writer id
# genius_song_details_1[0]['response']['song']['id']                                   #for song id
# genius_song_details_1[0]['response']['song']['title']                                #for song title
# genius_song_details_1[0]['response']['song']['full_title']                           #for full song title
# genius_song_details_1[0]['response']['song']['producer_artists']                     #for credited producer
# genius_song_details_1[0]['response']['song']['primary_artist']                       #for primary artist details
# genius_song_details_1[0]['response']['song']['primary_artist']['name']               #for primary artist name
# genius_song_details_1[0]['response']['song']['primary_artist']['id']                 #for primary artist id

1

### How Many Writers are on a Song?

In [47]:
def writer_counter(song_details_list):
    count_list = []
    for num, entry in enumerate(song_details_list):
        if isinstance(entry, dict):
            count_list.append((len(entry['response']['song']['writer_artists']),  num))
    return max(count_list)

In [162]:
writer_counter(genius_song_details_2)

(35, 35552)

A max of 35 writers per song is going to give me a lot of columns in this combined table

### Tossing Song Details into DataFrame

I'm creating two separate lists, and then merging them to create a full dataframe of song details. The first list consisting of all the song details outside of the writer names, and the second incorporating the song id and writer names.

#### Songwriter Listing per Song

In [4]:
def songwriters_tolist(song_details):
    '''
    Take Genius song details listing, and extract each writer associated with a song id into a new list
    '''
    writer_list = []
    for entry in song_details:
        if isinstance(entry, dict):
            g_song_id = entry['response']['song']['id']
            for writer in entry['response']['song']['writer_artists']:
                writer_list.append({'g_song_id':g_song_id, 'writer_name':writer['name'], 'writer_id':writer['id']})
    return writer_list

##### `genius_song_details_1`

In [5]:
genius_songwriters_1 = songwriters_tolist(genius_song_details_1)

In [6]:
songwriter_df_1 = pd.DataFrame(genius_songwriters_1)
songwriter_df_1.head()

Unnamed: 0,g_song_id,writer_id,writer_name
0,1,27663,The Heatmakerz
1,1,1,Cam’ron
2,3,9768,Irv Gotti
3,3,644832,H. Davis
4,3,214470,B. Bacharach


In [7]:
songwriter_df_1.shape

(12715, 3)

##### `genius_song_details_2`

In [8]:
genius_songwriters_2 = songwriters_tolist(genius_song_details_2)

In [9]:
songwriter_df_2 = pd.DataFrame(genius_songwriters_2)
songwriter_df_2.head()

Unnamed: 0,g_song_id,writer_id,writer_name
0,8776,27765,Salaam Remi
1,8776,56,Nas
2,8776,81,AZ
3,8785,998,B-Real
4,8874,170,Kurupt


In [10]:
songwriter_df_2.shape

(70163, 3)

#### Concatenating Songwriter Listings & Saving as csv

In [11]:
songwriter_df = pd.concat([songwriter_df_1, songwriter_df_2])

In [14]:
songwriter_df.reset_index().drop('index', 1, inplace=True)

In [15]:
songwriter_df.to_csv('../data/genius_songwriters_1.csv')

#### Songwriter Listing -w- One Line per Song

This isn't working yet

In [158]:
def songwriters_tofdf(song_details):
    '''
    Take Genius song details listing, and extract each writer associated with a song id into a new list
    '''
    writer_list = []
    for entry in song_details:
        songwriters = []
        if isinstance(entry, dict):
            g_song_id = entry['response']['song']['id']
            for index, writer in enumerate(entry['response']['song']['writer_artists']):
                songwriters.append({'writer_name_{}'.format(index):writer['name'], 
                                'writer_id_{}'.format(index):writer['id']})
            writer_list.append(songwriters)
    return writer_list

In [160]:
# songwriters_tofdf(genius_song_details_1)

#### Song Details per Song

In [170]:
def song_details_tolist(song_details):
    '''
    Take Genius song details listing, and extract all important metadata into a new list
    '''
    song_details_list = []
    for entry in song_details:
        if isinstance(entry, dict):
            song = entry['response']['song']
            song_details_list.append({'g_song_id':song['id'], 'g_song_title':song['title'], 'g_full_song_title':song['full_title'],
                                      'g_artist_name':song['primary_artist']['name'], 'g_artist_id':song['primary_artist']['id']})
    return song_details_list

##### `genius_song_details_1`

In [86]:
genius_song_details_1_list = song_details_tolist(genius_song_details_1)

In [121]:
song_details_df_1 = pd.DataFrame(genius_song_details_1_list)
song_details_df_1.set_index('g_song_id', inplace = True)

##### `genius_song_details_2`

In [171]:
genius_song_details_2_list = song_details_tolist(genius_song_details_2)

In [172]:
song_details_df_2 = pd.DataFrame(genius_song_details_2_list)
song_details_df_2.set_index('g_song_id', inplace = True)

#### Concatenating Song Detail Listings & Saving as csv

In [178]:
song_details_df_1.shape + song_details_df_2.shape

(5380, 4, 44620, 4)

In [180]:
song_details_df = pd.concat([song_details_df_1, song_details_df_2])

In [181]:
song_details_df.to_csv('../data/genius_song_details_1.csv')

#### Merging DataFrames

#### Attempt 1: Writing a Function

In [115]:
# def writer_check(song_df, writer_list):
#     '''
#     '''
#     for index in song_df.index:
#         count = 0
#         for writer in writer_list:
#             if writer['g_song_id'] == index:
#                 count += 1
#                 song_df['writer_{}_name'.format(count)].loc[index] = writer['writer_name']
#                 song_df['writer_{}_id'.format(count)].loc[index] = writer['writer_id']
#     return song_df

#### Attempt 2: df merge

In [125]:
# song_details_df_1.merge(songwriter_df_1, on='g_song_id')

## Attempting to Merge

### Merging Song Lists

I'm going to first see how merging the `genius_song_list.csv` and `spotify_song_list.csv` goes. My success there will dictate how merging the actual song records goes.

#### Retrieving Spotify & Genius song lists

In [2]:
engine = create_engine('postgresql://postgres:glide-mortuary-pod-cloy-belong@ec2-54-244-70-11.us-west-2.compute.amazonaws.com:5432/postgres')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x103a322b0>

In [34]:
spotify_songs = pd.read_sql('SELECT * FROM spotify_song_list', con=engine, index_col='s_song_id')
genius_songs = pd.read_sql('SELECT DISTINCT * FROM genius_song_list', con=engine, index_col='g_song_id')

In [15]:
genius_songs.shape

(410121, 3)

#### Creating non-formatted Genius Columns

In [19]:
strip = 

In [39]:
genius_songs['g_artist_n'] = genius_songs['g_artist'].apply(lambda x: str(x).lower()).apply(lambda x: str(x).strip("''/*")) 
genius_songs['g_song_name_n'] = genius_songs['g_song_name'].apply(lambda x: str(x).lower())

##### Removing "feat." language in song titles

In [41]:
genius_songs['g_song_name_n'] = genius_songs['g_song_name_n'].apply(lambda x: re.sub(r'(\(feat.*)','', x))

##### Sanity check

In [51]:
genius_songs[genius_songs['g_song_name_n'] == 'both eyes closed']

Unnamed: 0_level_0,g_song_name,g_artist,g_artist_id,g_artist_n,g_song_name_n
g_song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3099481,Both Eyes Closed,Gucci Mane,13,gucci mane,both eyes closed


#### Creating non-formatted Spotify Columns

In [53]:
spotify_songs['artist_name_n'] = spotify_songs['artist_name'].apply(lambda x: str(x).lower()).apply(lambda x: str(x).strip("''/*")) 
spotify_songs['song_title_n'] = spotify_songs['song_title'].apply(lambda x: str(x).lower())

In [55]:
spotify_songs['song_title_n'] = spotify_songs['song_title_n'].apply(lambda x: re.sub(r'(\(feat.*)','', x))

In [56]:
spotify_songs.head()

Unnamed: 0_level_0,album_release_date,artist_id,artist_name,duration_ms,explicit,linked_album,song_title,artist_name_n,song_title_n
s_song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
62bOmKYxYg7dhrC6gH9vFn,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,200400.0,False,No Strings Attached,Bye Bye Bye,nsync,bye bye bye
46n2EGFnPC3tzWCN1Aqe26,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,284760.0,False,No Strings Attached,This I Promise You,nsync,this i promise you
2AW37v0bDyuOzGP3XnmFuA,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,192426.0,False,No Strings Attached,It's Gonna Be Me,nsync,it's gonna be me
594M0rqYMOo8BhMGEdoi5C,1997-05-26,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,211000.0,False,'N Sync,Tearin' up My Heart - Radio Edit,nsync,tearin' up my heart - radio edit
0Jc8qF1mUPo1A96HE9QxZz,2001-07-24,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,238426.0,False,Celebrity,Pop,nsync,pop


#### Checking my DF's in Excel to see What Other Regex Would be Best

In [60]:
genius_songs.to_csv('/Users/jonjohnson/Desktop/genius_songs.csv')

In [61]:
spotify_songs.to_csv('/Users/jonjohnson/Desktop/spotify_songs.csv')

#### Song List Merge

In [57]:
genify = pd.merge(spotify_songs, genius_songs, how='left', left_on=['artist_name_n', 'song_title_n'], right_on=['g_artist_n', 'g_song_name_n'])

In [58]:
genify.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23888 entries, 0 to 23887
Data columns (total 14 columns):
album_release_date    23888 non-null object
artist_id             23888 non-null object
artist_name           23888 non-null object
duration_ms           23888 non-null float64
explicit              23888 non-null bool
linked_album          23888 non-null object
song_title            23888 non-null object
artist_name_n         23888 non-null object
song_title_n          23888 non-null object
g_song_name           14939 non-null object
g_artist              14939 non-null object
g_artist_id           14939 non-null float64
g_artist_n            14939 non-null object
g_song_name_n         14939 non-null object
dtypes: bool(1), float64(2), object(11)
memory usage: 2.6+ MB


In [59]:
genify.head(100)

Unnamed: 0,album_release_date,artist_id,artist_name,duration_ms,explicit,linked_album,song_title,artist_name_n,song_title_n,g_song_name,g_artist,g_artist_id,g_artist_n,g_song_name_n
0,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,200400.0,False,No Strings Attached,Bye Bye Bye,nsync,bye bye bye,,,,,
1,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,284760.0,False,No Strings Attached,This I Promise You,nsync,this i promise you,,,,,
2,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,192426.0,False,No Strings Attached,It's Gonna Be Me,nsync,it's gonna be me,,,,,
3,1997-05-26,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,211000.0,False,'N Sync,Tearin' up My Heart - Radio Edit,nsync,tearin' up my heart - radio edit,,,,,
4,2001-07-24,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,238426.0,False,Celebrity,Pop,nsync,pop,,,,,
5,1997-05-26,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,202106.0,False,'N Sync,I Want You Back - Radio Edit,nsync,i want you back - radio edit,,,,,
6,2005-10-25,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,243493.0,False,Greatest Hits,(God Must Have Spent) A Little More Time On Yo...,nsync,(god must have spent) a little more time on yo...,,,,,
7,2005-10-25,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,266293.0,False,Greatest Hits,This I Promise You - Radio Edit,nsync,this i promise you - radio edit,,,,,
8,2001-07-24,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,253600.0,False,Celebrity,Girlfriend,nsync,girlfriend,,,,,
9,2001-07-24,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,292000.0,False,Celebrity,Gone,nsync,gone,,,,,


In [25]:
genify.to_csv('/Users/jonjohnson/Desktop/genify.csv')

A ton of null values here. There should be far more matches.

Strategies for matching:
 - Pull out non-alphanumeric characters from artist names (+ potentially song names)

In [21]:
spotify_songs['artist_name'].apply(strip)

s_song_id
62bOmKYxYg7dhrC6gH9vFn                  NSYNC
46n2EGFnPC3tzWCN1Aqe26                  NSYNC
2AW37v0bDyuOzGP3XnmFuA                  NSYNC
594M0rqYMOo8BhMGEdoi5C                  NSYNC
0Jc8qF1mUPo1A96HE9QxZz                  NSYNC
5YTMRAT4yKgFrepF8Hi3mY                  NSYNC
72otaqywVqwyXaCjk75JKm                  NSYNC
1JbzBbwkf93dii20EC3EiZ                  NSYNC
6u5flhVFxKZrl9AApvf2SL                  NSYNC
4CCUjYJPbSXLL23BFeBVbI                  NSYNC
60R2v9lheAu3lwZwAFxMZK    "Weird Al" Yankovic
5r96TaQquRrlo3Ym3ZlSL2    "Weird Al" Yankovic
1SGnWl33MUNd9QHYAoqJtW    "Weird Al" Yankovic
5eZaT21ZVGyGHJ8kcwaNxA    "Weird Al" Yankovic
4JqQWAr47pGEoaMArpA7Z3    "Weird Al" Yankovic
1nXCcO9Fp1mE0rzw39qOPY    "Weird Al" Yankovic
74sUbOF9Zm8LdGUJjxleTl    "Weird Al" Yankovic
0WJTdVboKc2KI1DzzAWyYM    "Weird Al" Yankovic
4ZJGobiy4ayWSdKfoqMRlX    "Weird Al" Yankovic
1ZqKCseNe8FIZRDyl2E2Ac    "Weird Al" Yankovic
3pO37BXsjMC2wApALxGbuB               10 Years
0uyDAijTR0tOuH24hxDhE5  