# Matching Credits to Song Data

In [90]:
import json
import numpy as np
import pandas as pd
import pickle
import re
from sqlalchemy import create_engine
import time

In [2]:
with open('../data/genius_song_details_1.json', 'r') as f:
    genius_song_details_1 = json.load(f)

In [None]:
with open('../data/genius_song_details_2.json', 'r') as f:
    genius_song_details_2 = json.load(f)

In [None]:
with open('../data/genius_song_details_3.json', 'r') as f:
    genius_song_details_3 = json.load(f)

In [None]:
with open('../data/genius_song_details_4.json', 'r') as f:
    genius_song_details_4 = json.load(f)

In [3]:
with open('../data/genius_song_details_5.json', 'r') as f:
    genius_song_details_5 = json.load(f)

In [4]:
with open('../data/genius_song_details_6.json', 'r') as f:
    genius_song_details_6 = json.load(f)

In [96]:
with open('../data/genius_song_details_7.json', 'r') as f:
    genius_song_details_7 = json.load(f)

In [133]:
with open('../data/genius_song_details_8.json', 'r') as f:
    genius_song_details_8 = json.load(f)

### What Information do I Want to Model on?

For sure I need `writer_artists`, but beyond that, I don't know.

In [82]:
# genius_song_details_1[0]['response']['song']['writer_artists']                       #for songwriter info
# genius_song_details_1[0]['response']['song']['writer_artists'][0]['name']            #for writer name
# genius_song_details_1[0]['response']['song']['writer_artists'][0]['id']              #for writer id
# genius_song_details_1[0]['response']['song']['id']                                   #for song id
# genius_song_details_1[0]['response']['song']['title']                                #for song title
# genius_song_details_1[0]['response']['song']['full_title']                           #for full song title
# genius_song_details_1[0]['response']['song']['producer_artists']                     #for credited producer
# genius_song_details_1[0]['response']['song']['primary_artist']                       #for primary artist details
# genius_song_details_1[0]['response']['song']['primary_artist']['name']               #for primary artist name
# genius_song_details_1[0]['response']['song']['primary_artist']['id']                 #for primary artist id

1

### How Many Writers are on a Song?

In [3]:
def writer_counter(song_details_list):
    count_list = []
    for num, entry in enumerate(song_details_list):
        if isinstance(entry, dict):
            if 'response' in entry.keys():
                count_list.append((len(entry['response']['song']['writer_artists']),  num))
    return max(count_list)

In [2]:
writer_counter(genius_song_details_4)

A max of 70(!) writers per song is going to give me a lot of columns in this combined table

### Tossing Song Details into DataFrame

I'm going to end up creating three separate tables, which will all eventually make their way unto psql:
- `genius_songwriters` - A DataFrame with every songwriter + songwriter id
- `genius_song_details` - A DataFrame with a row for every song, inclusive of song title, artist, and requsite id's
- `genius_song_details_writers` - A DataFrame listing each writer that appears on a given song

#### Genius Songwriters (Table 1)

##### Parsing Songwriters into Separate Lists

In [5]:
def songwriters_tolist(song_details):
    '''
    Take Genius song details listing, and extract each writer associated with a song id into a new list
    '''
    writer_list = []
    for entry in song_details:
        if isinstance(entry, dict):
            if 'response' in entry.keys():
                g_song_id = entry['response']['song']['id']
                for writer in entry['response']['song']['writer_artists']:
                    writer_list.append({'g_song_id':g_song_id, 'writer_name':writer['name'], 'writer_id':writer['id']})
    return writer_list

##### `genius_song_details_1`

In [72]:
genius_songwriters_1 = songwriters_tolist(genius_song_details_1)

In [73]:
songwriter_df_1 = pd.DataFrame(genius_songwriters_1)
songwriter_df_1.head()

Unnamed: 0,g_song_id,writer_id,writer_name
0,1,27663,The Heatmakerz
1,1,1,Cam’ron
2,3,9768,Irv Gotti
3,3,644832,H. Davis
4,3,214470,B. Bacharach


In [7]:
songwriter_df_1.shape

(12715, 3)

##### `genius_song_details_2`

In [8]:
genius_songwriters_2 = songwriters_tolist(genius_song_details_2)

In [9]:
songwriter_df_2 = pd.DataFrame(genius_songwriters_2)
songwriter_df_2.head()

Unnamed: 0,g_song_id,writer_id,writer_name
0,8776,27765,Salaam Remi
1,8776,56,Nas
2,8776,81,AZ
3,8785,998,B-Real
4,8874,170,Kurupt


`genius_song_details_3`

In [56]:
genius_songwriters_3 = songwriters_tolist(genius_song_details_3)

In [85]:
songwriter_df_3 = pd.DataFrame(genius_songwriters_3)
songwriter_df_3.head()

Unnamed: 0,g_song_id,writer_id,writer_name
0,149508,9536,Jimmy Page
1,149508,12999,Robert Plant
2,149625,1080,Walt Whitman
3,149629,85,T.I.
4,149629,853,B.o.B


In [44]:
songwriter_df_3.to_csv('../data/genius_songwriters_3.csv')

`genius_song_details_4`

In [11]:
genius_songwriters_4 = songwriters_tolist(genius_song_details_4)

In [86]:
songwriter_df_4 = pd.DataFrame(genius_songwriters_4)
songwriter_df_4.head()

Unnamed: 0,g_song_id,writer_id,writer_name
0,436873,660,Prince
1,436877,660,Prince
2,436879,660,Prince
3,436882,660,Prince
4,436888,660,Prince


In [15]:
songwriter_df_4.to_csv('../data/genius_songwriters_4.csv')

##### `genius_song_details_5`

In [6]:
genius_songwriters_5 = songwriters_tolist(genius_song_details_5)

In [8]:
songwriter_df_5 = pd.DataFrame(genius_songwriters_5)
songwriter_df_5.head(2)

Unnamed: 0,g_song_id,writer_id,writer_name
0,872360,35893,Gary Richrath
1,872363,157416,Matthew Thiessen


##### `genius_song_details_6`

In [7]:
genius_songwriters_6 = songwriters_tolist(genius_song_details_6)

In [9]:
songwriter_df_6 = pd.DataFrame(genius_songwriters_6)
songwriter_df_6.head(2)

Unnamed: 0,g_song_id,writer_id,writer_name
0,1216294,39170,Conor Oberst
1,1216294,598,Neil Young


##### `genius_song_details_7`

In [97]:
genius_songwriters_7 = songwriters_tolist(genius_song_details_7)

In [99]:
songwriter_df_7 = pd.DataFrame(genius_songwriters_7)

In [100]:
songwriter_df_7.head(2)

Unnamed: 0,g_song_id,writer_id,writer_name
0,1559503,567806,Tim Mensy
1,1559503,477987,Tony Haselden


##### `genius_song_details_8`

In [134]:
genius_songwriters_8 = songwriters_tolist(genius_song_details_8)
songwriter_df_8 = pd.DataFrame(genius_songwriters_8)
songwriter_df_8.head(2)

Unnamed: 0,g_song_id,writer_id,writer_name
0,1972199,370602,Tony Levin
1,1972199,125323,John Myung


#### Genius Song Details Writers (Table 2)

##### Flattening Songwriter Records

In [9]:
def flat_songwriter_df(songwriters):
    '''
    Flatten songwriter listing for merge back into song details DataFrame
    '''
    songwriters.set_index('g_song_id', inplace=True)
    songwriters.writer_id = songwriters.writer_id.astype('object')
    songwriters_new = songwriters.drop('writer_name', 1)
    dummies = pd.get_dummies(songwriters_new).reset_index().groupby('g_song_id').sum()
    return dummies

#### Concatenating Songwriter Listings & Saving as csv

In [15]:
songwriter_df = pd.concat([songwriter_df_1, songwriter_df_2])
songwriter_df.reset_index().drop('index', 1, inplace=True)
songwriter_df.to_csv('../data/genius_songwriters_1_2.csv')

#### Song Details per Song

In [103]:
def song_details_tolist(song_details):
    '''
    Take Genius song details listing, and extract all important metadata into a new list
    '''
    song_details_list = []
    for entry in song_details:
        if isinstance(entry, dict):
            if 'response' in entry.keys():
                song = entry['response']['song']
                song_details_list.append({'g_song_id':song['id'], 'g_song_title':song['title'], 'g_full_song_title':song['full_title'],
                                          'g_artist_name':song['primary_artist']['name'], 'g_artist_id':song['primary_artist']['id']})
    return song_details_list

##### `genius_song_details_1`

In [68]:
genius_song_details_1_list = song_details_tolist(genius_song_details_1)

In [69]:
song_details_df_1 = pd.DataFrame(genius_song_details_1_list)
song_details_df_1.set_index('g_song_id', inplace = True)

In [17]:
# song_details_df_1.head(20)

##### `genius_song_details_2`

In [171]:
genius_song_details_2_list = song_details_tolist(genius_song_details_2)

In [172]:
song_details_df_2 = pd.DataFrame(genius_song_details_2_list)
song_details_df_2.set_index('g_song_id', inplace = True)

#### Concatenating Song Detail Listings & Saving as csv

In [181]:
song_details_df = pd.concat([song_details_df_1, song_details_df_2])
song_details_df.to_csv('../data/genius_song_details_1_2.csv')

##### `genius_song_details_3`

In [45]:
genius_song_details_3_list = song_details_tolist(genius_song_details_3)
song_details_df_3 = pd.DataFrame(genius_song_details_3_list).set_index('g_song_id')
song_details_df_3.shape

(49999, 4)

In [43]:
song_details_df_3.to_csv('../data/genius_song_details_3.csv')

##### `genius_song_details_4`

In [105]:
# genius_song_details_4_list = song_details_tolist(genius_song_details_4)
# song_details_df_4 = pd.DataFrame(genius_song_details_4_list).set_index('g_song_id')
# song_details_df_4.shape

In [106]:
song_details_df_4 = pd.read_csv('../data/genius_song_details_4.csv', index_col = 'g_song_id')

In [107]:
song_details_df_4.head()

Unnamed: 0_level_0,g_artist_id,g_artist_name,g_full_song_title,g_song_title
g_song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
436873,660,Prince,The Holy River by Prince,The Holy River
436874,308,Naughty By Nature,What U Wanna Do by Naughty By Nature (Ft. P!nk),What U Wanna Do
436877,660,Prince,Let's Have A Baby by Prince,Let's Have A Baby
436878,16420,Mic the Microphone,Been Dreamin' by Mic the Microphone,Been Dreamin'
436879,660,Prince,Saviour by Prince,Saviour


## Attempting to Merge

### Merging Song Lists

I'm going to first see how merging the `genius_song_list.csv` and `spotify_song_list.csv` goes. My success there will dictate how merging the actual song records goes.

##### Mashing all of the Songwriter Tables that I Currently Have

In [10]:
songwriter_df_1_2 = pd.read_csv('../data/genius_songwriters_1_2.csv').drop(labels='Unnamed: 0', axis= 1)

In [11]:
songwriter_df_3 = pd.read_csv('../data/genius_songwriters_3.csv').drop(labels='Unnamed: 0', axis= 1)
songwriter_df_4 = pd.read_csv('../data/genius_songwriters_4.csv').drop(labels='Unnamed: 0', axis= 1)

In [7]:
songwriter_df_1_2.head(2)

Unnamed: 0,g_song_id,writer_id,writer_name
0,1,27663,The Heatmakerz
1,1,1,Cam’ron


In [135]:
songwriter_df = pd.concat([songwriter_df_1_2,
                           songwriter_df_3,
                           songwriter_df_4,
                           songwriter_df_5,
                           songwriter_df_6,
                           songwriter_df_7,
                           songwriter_df_8])

In [136]:
songwriter_df.shape

(309237, 3)

In [137]:
songs_w_writers = songwriter_df.g_song_id.unique()

In [138]:
with open('../pickle/songs_w_writers.pkl', 'wb')  as f:
    pickle.dump(songs_w_writers, f)

#### Retrieving Spotify & Genius song lists

In [20]:
engine = create_engine('postgresql://postgres:glide-mortuary-pod-cloy-belong@ec2-54-244-70-11.us-west-2.compute.amazonaws.com:5432/postgres')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x189bfa3c8>

In [139]:
spotify_songs = pd.read_sql('SELECT * FROM spotify_song_list', con=engine, index_col='s_song_id')
genius_songs = pd.read_sql('SELECT DISTINCT * FROM genius_song_list', con=engine, index_col='g_song_id')

In [140]:
genius_songs.shape, spotify_songs.shape

((410121, 3), (23888, 7))

In [141]:
genius_songs.reset_index(inplace=True)

#### Adding `songs_w_writers` to `genius_songs`

In [142]:
genius_songs['writers'] = genius_songs['g_song_id'].apply(lambda x: 1 if x in songs_w_writers else 0)

In [143]:
genius_songs.writers.sum()

139346

So far, roughly 1/3 of my songs have corresponding writer information

#### Creating non-formatted Genius Columns

In [34]:
strip = lambda x: x.strip("'/*")

In [144]:
genius_songs['g_artist_n'] = genius_songs['g_artist'].apply(lambda x: str(x).lower()).apply(lambda x: str(x).strip("''/*")) 
genius_songs['g_song_name_n'] = genius_songs['g_song_name'].apply(lambda x: str(x).lower())

##### Removing "feat." language in song titles

In [145]:
genius_songs['g_song_name_n'] = genius_songs['g_song_name_n'].apply(lambda x: re.sub(r'(\(feat.*)','', x))

##### Sanity check

In [146]:
genius_songs[genius_songs['g_song_name_n'] == 'both eyes closed']

Unnamed: 0,g_song_id,g_song_name,g_artist,g_artist_id,writers,g_artist_n,g_song_name_n
365167,3099481,Both Eyes Closed,Gucci Mane,13,0,gucci mane,both eyes closed


#### Creating non-formatted Spotify Columns

In [147]:
spotify_songs['artist_name_n'] = spotify_songs['artist_name'].apply(lambda x: str(x).lower()).apply(lambda x: str(x).strip("''/*")) 
spotify_songs['song_title_n'] = spotify_songs['song_title'].apply(lambda x: str(x).lower())

In [148]:
spotify_songs['song_title_n'] = spotify_songs['song_title_n'].apply(lambda x: re.sub(r'(\(feat.*)','', x))

In [149]:
spotify_songs.head()

Unnamed: 0_level_0,album_release_date,artist_id,artist_name,duration_ms,explicit,linked_album,song_title,artist_name_n,song_title_n
s_song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
62bOmKYxYg7dhrC6gH9vFn,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,200400.0,False,No Strings Attached,Bye Bye Bye,nsync,bye bye bye
46n2EGFnPC3tzWCN1Aqe26,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,284760.0,False,No Strings Attached,This I Promise You,nsync,this i promise you
2AW37v0bDyuOzGP3XnmFuA,2000-03-21,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,192426.0,False,No Strings Attached,It's Gonna Be Me,nsync,it's gonna be me
594M0rqYMOo8BhMGEdoi5C,1997-05-26,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,211000.0,False,'N Sync,Tearin' up My Heart - Radio Edit,nsync,tearin' up my heart - radio edit
0Jc8qF1mUPo1A96HE9QxZz,2001-07-24,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,238426.0,False,Celebrity,Pop,nsync,pop


In [150]:
spotify_songs.reset_index(inplace=True)

In [169]:
spotify_songs.shape

(23888, 10)

In [170]:
spotify_songs.drop_duplicates(inplace=True)
spotify_songs.shape

(23129, 10)

#### Checking my DF's in Excel to see What Other Regex Would be Best

In [151]:
genius_songs.to_csv('/Users/jonjohnson/Desktop/genius_songs.csv')

In [171]:
spotify_songs.to_csv('/Users/jonjohnson/Desktop/spotify_songs.csv')

#### Song List Merge

In [172]:
genify = pd.merge(spotify_songs, genius_songs, how='left', left_on=['artist_name_n', 'song_title_n'], right_on=['g_artist_n', 'g_song_name_n'])

In [173]:
genify.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23129 entries, 0 to 23128
Data columns (total 17 columns):
s_song_id             23129 non-null object
album_release_date    23129 non-null object
artist_id             23129 non-null object
artist_name           23129 non-null object
duration_ms           23129 non-null float64
explicit              23129 non-null bool
linked_album          23129 non-null object
song_title            23129 non-null object
artist_name_n         23129 non-null object
song_title_n          23129 non-null object
g_song_id             14603 non-null float64
g_song_name           14603 non-null object
g_artist              14603 non-null object
g_artist_id           14603 non-null float64
writers               14603 non-null float64
g_artist_n            14603 non-null object
g_song_name_n         14603 non-null object
dtypes: bool(1), float64(4), object(12)
memory usage: 3.0+ MB


In [185]:
genify_matches = genify[genify['writers']== 1][genify['g_song_id'].notna()]

  """Entry point for launching an IPython kernel.


In [187]:
genify_matches.shape

(8084, 17)

Out of the ~14603 current matches, only 8084 are not null (~55%)

#### Other Questions to Answer:

- Out of these matches, how many writers make up the songs?
- What is the breakdown of # of songs for each of those writers?

#### Potential New Directions for Project:

- Analysis of most prolific songwriters of past ## years, based on data collected
- Cluster writers based on what they've done, especially "raw talent" or new songwriters, and who maybe they most resemble
  - Make suggestions for what they can do to further their career

##### Number of Songwriters Per Song Matched

###### Average Number

In [68]:
project_songwriters.g_song_id.value_counts().mean()

2.8636363636363638

##### Number of Songwriters & Number of Songs per Songwriter

In [78]:
songs_per_writer = project_songwriters.writer_name.value_counts()

In [82]:
len(songs_per_writer)

2305

There's a total of 2305 songwriters that I have currently.

In [83]:
len(songs_per_writer[songs_per_writer < 4])

1926

Out of all of those songwriters, 1926 of them have written on less than 4 works. That leaves 379 "prolific" writers.

In [176]:
genify.to_csv('../data/genify.csv')

In [125]:
songwriter_df.head()

Unnamed: 0,g_song_id,writer_id,writer_name
0,1,27663,The Heatmakerz
1,1,1,Cam’ron
2,3,9768,Irv Gotti
3,3,644832,H. Davis
4,3,214470,B. Bacharach


In [208]:
songwriter_df.to_csv('../data/songwriter_df.csv')

In [188]:
songwriter_df['for_project'] = songwriter_df['g_song_id'].apply(lambda x: x in genify_matches['g_song_id'])

In [189]:
project_songwriters = songwriter_df[songwriter_df['for_project'] == True]

##### Number of Songwriters Per Song Matched

###### Average Number

In [192]:
project_songwriters.g_song_id.value_counts().mean()

2.8268923666560206

##### Number of Songwriters & Number of Songs per Songwriter

In [201]:
project_songwriters['for_project']

True    8851
Name: for_project, dtype: int64

In [193]:
songs_per_writer = project_songwriters.writer_name.value_counts()

In [194]:
len(songs_per_writer)

2486

There's a total of 2486 songwriters that I have currently.

In [198]:
len(songs_per_writer[songs_per_writer < 4])

2065

In [199]:
songs_per_writer[songs_per_writer > 4].head()

Eminem              130
2Pac                123
Lil Wayne           113
Ghostface Killah     90
JAY-Z                88
Name: writer_name, dtype: int64

Out of all of those songwriters, 2065 of them have written on less than 4 works. That leaves 421 "prolific" writers.

#### Analysis of Matches So Far

A ton of null values here. There should be far more matches.

Strategies for matching:
 - Pull out non-alphanumeric characters from artist names (+ potentially song names)

In [21]:
spotify_songs['artist_name'].apply(strip)

s_song_id
62bOmKYxYg7dhrC6gH9vFn                  NSYNC
46n2EGFnPC3tzWCN1Aqe26                  NSYNC
2AW37v0bDyuOzGP3XnmFuA                  NSYNC
594M0rqYMOo8BhMGEdoi5C                  NSYNC
0Jc8qF1mUPo1A96HE9QxZz                  NSYNC
5YTMRAT4yKgFrepF8Hi3mY                  NSYNC
72otaqywVqwyXaCjk75JKm                  NSYNC
1JbzBbwkf93dii20EC3EiZ                  NSYNC
6u5flhVFxKZrl9AApvf2SL                  NSYNC
4CCUjYJPbSXLL23BFeBVbI                  NSYNC
60R2v9lheAu3lwZwAFxMZK    "Weird Al" Yankovic
5r96TaQquRrlo3Ym3ZlSL2    "Weird Al" Yankovic
1SGnWl33MUNd9QHYAoqJtW    "Weird Al" Yankovic
5eZaT21ZVGyGHJ8kcwaNxA    "Weird Al" Yankovic
4JqQWAr47pGEoaMArpA7Z3    "Weird Al" Yankovic
1nXCcO9Fp1mE0rzw39qOPY    "Weird Al" Yankovic
74sUbOF9Zm8LdGUJjxleTl    "Weird Al" Yankovic
0WJTdVboKc2KI1DzzAWyYM    "Weird Al" Yankovic
4ZJGobiy4ayWSdKfoqMRlX    "Weird Al" Yankovic
1ZqKCseNe8FIZRDyl2E2Ac    "Weird Al" Yankovic
3pO37BXsjMC2wApALxGbuB               10 Years
0uyDAijTR0tOuH24hxDhE5  