# Querying Genius API for Song Credits

In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import requests
import time
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor, Json
from sqlalchemy import *

In [2]:
with open('../data/genius_ids.json', 'r') as f:
    genius_ids = json.load(f)

In [20]:
df = pd.read_csv('../data/spotify_artists_clean.csv', index_col=0)

In [23]:
df['name'].iloc[1]

'"Weird Al" Yankovic'

#### Gathering Artist IDs from `genius_ids`

In [2]:
# genius_ids[0]

In [12]:
# genius_ids[0] - for a given artist & their associated search details
# genius_ids[0]['*NSYNC']['response']['hits'] - the actual search results
# genius_ids[0]['*NSYNC']['response']['hits'][0]['result']['primary_artist'] - information for the artist
# genius_ids[0]['*NSYNC']['response']['hits'][0]['result']['primary_artist']['id'] - Genius Artist ID
# genius_ids[0]['*NSYNC']['response']['hits'][0]['result']['primary_artist']['name'] - Genius Artist name

{'api_path': '/artists/8625',
 'header_image_url': 'https://s3.amazonaws.com/rapgenius/1362260617_51058047.png',
 'id': 8625,
 'image_url': 'https://s3.amazonaws.com/rapgenius/1362260617_51058047.png',
 'is_meme_verified': False,
 'is_verified': False,
 'name': "'N Sync",
 'url': 'https://genius.com/artists/N-sync'}

In [120]:
genius_artist_ids = []
i = -1

for artist in genius_ids:
    ar = {}
    i += 1
    for result in artist[df['name'].iloc[i]]['response']['hits']:
        ar[result['result']['primary_artist']['name']] = result['result']['primary_artist']['id']
    genius_artist_ids.append(dict({df['name'].iloc[i] : ar}))

In [121]:
genius_artist_ids[0]

{'*NSYNC': {"'N Sync": 8625}}

In [63]:
len(list(genius_artist_ids[2].values())[0].keys())

4

##### Dumping `genius_artist_ids` into json file

In [35]:
with open('../data/genius_artist_ids.json', 'w') as f:
    json.dump(genius_artist_ids, f)

I'm really not sure what the best strategy for retrieving all of the songs I need, though I'm considering a scorched earth routine, where I just retrieve every song corresponding to every artist id I've retrieved.

In [14]:
with open('../data/genius_artist_ids.json', 'r') as f:
    genius_artist_ids = json.load(f)

In [26]:
len(genius_artist_ids)

2449

##### Looking at Artists -w- More Than One Result

In [15]:
multiple_ids = []

for artist in genius_artist_ids:
    if len(list(artist.values())[0].keys()) > 1:
        multiple_ids.append(artist)

There are a shit ton of multiple results. I wonder what is the best way to filter through all of these.

In [16]:
multiple_ids[:5]

[{'10 Years': {'10 Years': 89600,
   'Tennis': 11394,
   'Bullet For My Valentine': 37666,
   'Travis Mendes': 346989}},
 {'112': {'112': 504, 'Jay Rock': 1403, 'Slim of 112': 665236}},
 {'2 Chainz': {'2 Chainz': 14325,
   '2 Chainz & Wiz Khalifa': 51112,
   'ScHoolboy Q, 2 Chainz & Saudi': 1357232}},
 {'21 Savage': {'21 Savage': 430404,
   '21 Savage & Metro Boomin': 980465,
   '21 Savage, Offset & Metro Boomin': 1249916}},
 {'311': {'311': 10440, 'Emily Dickinson': 717, 'Wale': 396}}]

In [17]:
multiple_ids_2 = []

for artist in multiple_ids:
    for k, v in artist.items():
        for key, value in v.items():
            if k == key:
                multiple_ids_2.append(dict({k : dict({key:value})}))

In [18]:
len(multiple_ids_2)

793

In [19]:
multiple_ids_2[:5]

[{'10 Years': {'10 Years': 89600}},
 {'112': {'112': 504}},
 {'2 Chainz': {'2 Chainz': 14325}},
 {'21 Savage': {'21 Savage': 430404}},
 {'311': {'311': 10440}}]

##### Looking at Length of all the Genius ID files

In [10]:
len(genius_artist_ids), len(multiple_ids), len(multiple_ids_2)

(2449, 1058, 793)

##### Removing Result from `genius_artist_ids` if it's part of `multiple_ids`

In [128]:
for artist in multiple_ids:
    if artist in genius_artist_ids:
        genius_artist_ids.remove(artist)

In [130]:
len(genius_artist_ids)

1391

##### Adding the Filtered `multiple_ids_2` results back into `genius_artist_ids`

In [131]:
for artist in multiple_ids_2:
    genius_artist_ids.append(artist)

In [133]:
len(genius_artist_ids)

2184

##### Dumping singluar `genius_artist_ids` into json

In [152]:
with open('../data/genius_artist_ids_single.json', 'w') as f:
    json.dump(genius_artist_ids, f)

##### Retrieving Old `genius_artist_ids` listing to Filter Down `multiple_ids` Results

In [13]:
with open('../data/genius_artist_ids.json', 'r') as f:
    genius_artist_ids = json.load(f)

In [3]:
len(genius_artist_ids)

2449

In [11]:
len(multiple_ids)

1058

In [20]:
filtered_ids = [list(row.keys())[0] for row in multiple_ids_2]
len(filtered_ids)

793

In [29]:
for artist in multiple_ids:
    if list(artist.keys())[0] in filtered_ids:
        multiple_ids.remove(artist)

In [30]:
len(multiple_ids)

265

In [25]:
with open('../pickle/multiple_genius_artist_ids.pkl', 'wb+') as f:
    pickle.dump(multiple_ids, f)

In [47]:
list(multiple_ids[0].keys())[0]

'4 Hands 4 Him'

##### Verdict On Multple ID entries

I can't seem to accurately remove all of the entries I need to...therefore, I'll run through the single entry IDs first, and go scorched earth on the other ones afterwards.

##### Retrieving `genius_artist_ids_single` to Query Genius API

In [22]:
with open('../data/genius_artist_ids_single.json', 'r') as f:
    genius_artist_ids_single = json.load(f)

In [23]:
len(genius_artist_ids_single)

2184

My first pass-through of the Genius API song ids will be for this many songs

### Querying Genius API with all artist_ids

In [4]:
list(list(genius_artist_ids_single[0].values())[0].values())[0]

8625

In [36]:
access_token = '&access_token=IwvfcN2fO8fNm5ufyApn1c8Z9X3Nud-Z6wJhTeKocdLDR_Jtj6bW6POwfT81u6Of'

In [5]:
with open('../data/genius_song_ids.json', 'w') as f:
    json.dump([], f)

In [6]:
song_ids = []

In [7]:
'artists/16775/songs?sort=popularity&per_page=50&page=2'

'artists/16775/songs?sort=popularity&per_page=50&page=2'

In [18]:
genius_artist_ids_single[353]

{'DJ Magic Mike and DJ Magic Mike and the Royal Possethe Royal Posse': {}}

#### Single Artist ID Search

In [30]:
song_ids = []

for artist in genius_artist_ids_single:
    page_no = 1
    try:
        url = 'https://api.genius.com/artists/' + str(list(list(artist.values())[0].values())[0]) + '/songs/?per_page=50&page=' + str(page_no) + access_token
    except:
        continue
    try:
        r = requests.get(url)
        song_ids.append(r.json())
    except:
        time.sleep(1)
        r = requests.get(url)
        song_ids.append(r.json())     
    while isinstance(r.json()['response']['next_page'], int):
        page_no += 1
        url = 'https://api.genius.com/artists/' + str(list(list(artist.values())[0].values())[0]) + '/songs/?per_page=50&page=' + str(page_no) + access_token
        try:
            r = requests.get(url)
            song_ids.append(r.json())
        except:
            time.sleep(1)
            r = requests.get(url)
            song_ids.append(r.json())
with open('../data/genius_song_ids.json', mode='w', encoding='utf-8') as f:
    json.dump(song_ids, f)   

##### Ran into an error, (list index out of range)...need to fix

In [10]:
len(song_ids)

765

I'm including the first `try / except` because some of my singular id entries don't actually have an id. This seems strange. I'll have to investigate this after my loop is done processing.

In [29]:
genius_artist_ids_single[353]

{'DJ Magic Mike and DJ Magic Mike and the Royal Possethe Royal Posse': {}}

In [31]:
len(song_ids)

2178

After going through the list of artist ids, I lost about 6 entries. I'm going to forego looking further into this until I actually grab all the songs I need.

##### Grabbing Remaining Song IDs

In [30]:
multiple_ids[0]['4 Hands 4 Him']

{'Elijah Muhammad': 17108,
 'Mavis Staples': 1179,
 'Holy Bible (KJV)': 31651,
 'Simeon Wiley': 1159108,
 'Genius Transcribers': 1001987,
 'Country Genius': 214055,
 'Genius Users': 226635,
 'Vesuvius': 114359,
 'Johann Wolfgang von Goethe': 54936}

In [28]:
multiple_ids[0]['4 Hands 4 Him']['Elijah Muhammad']

17108

In [34]:
url = 'https://api.genius.com/artists/1179/songs/?sortsort=popularity&per_page=50' + access_token

In [3]:
# r = requests.get(url)
# # r.text

In [52]:
multiple_ids[:3]

[{'4 Hands 4 Him': {'Elijah Muhammad': 17108,
   'Mavis Staples': 1179,
   'Holy Bible (KJV)': 31651,
   'Simeon Wiley': 1159108,
   'Genius Transcribers': 1001987,
   'Country Genius': 214055,
   'Genius Users': 226635,
   'Vesuvius': 114359,
   'Johann Wolfgang von Goethe': 54936}},
 {'8 Ball & MJG & Mr. E of RPS Fam': {'N.W.A': 974,
   'Riley Reid': 641108,
   '8Ball & MJG': 1858,
   '8Ball': 4168}},
 {'98º': {'Public Enemy': 203,
   'Viru$ Weed': 494830,
   'Julian Sem Modos': 998542,
   '98°': 151905,
   'Big Pun': 161,
   'Big L': 103,
   'Baddiel, Skinner & The Lightning Seeds': 1511669,
   'JAY-Z': 2,
   'Boulcy': 19724}}]

In [35]:
for artist in multiple_ids[:5]:
    for k, v in artist.items():
        for key, val in v.items():
            print(str(val))

17108
1179
31651
1159108
1001987
214055
226635
114359
54936
974
641108
1858
4168
203
494830
998542
151905
161
103
1511669
2
19724
639521
1130142
38586
158709
64610
209028
181
586
24422
547449
25320
257


#### Multiple Artist ID Search

In [51]:
song_ids = []

for artist in multiple_ids[133:] :
    for k, v in artist.items():
        for key, val in v.items():
            try:
                page_no = 1
                url = 'https://api.genius.com/artists/' + str(val) + '/songs/?per_page=50&page=' + str(page_no) + access_token
            except:
                continue
            try:
                r = requests.get(url)
                song_ids.append(r.json())
            except:
                time.sleep(1)
                r = requests.get(url)
                song_ids.append(r.json())
            while isinstance(r.json()['response']['next_page'], int):
                page_no += 1
                url = 'https://api.genius.com/artists/' + str(val) + '/songs/?per_page=50&page=' + str(page_no) + access_token
                try:
                    r = requests.get(url)
                    song_ids.append(r.json())
                except:
                    time.sleep(1)
                    r = requests.get(url)
                    song_ids.append(r.json())
with open('../data/genius_song_ids_3.json', 'w') as f:
    json.dump(song_ids, f)

In [53]:
len(song_ids)

3351

In [2]:
# song_ids

The first time searching through `multiple_ids` only got through the first 82 records. I've saved them in `genius_song_ids_2.json`. I'm searching through the multiple ids a second time, just from index 83 onwards, and saving it into `genius_song_ids_3.json`.

In [29]:
fun_index = next((index for (index, d) in enumerate(multiple_ids) if list(d.keys())[0] == 'fun.'), None)
fun_index

82

In [33]:
multiple_ids[-1]

{'U.S.A. For Africa': {'USA For Africa': 370890,
  'Michael Jackson': 835,
  'Fernando Pereira': 533817,
  'Boogie Down Productions': 815,
  'Genius': 204611,
  'Sam Kinison': 381016,
  'TTC': 11471,
  'Bryan Adams': 32823,
  'Dead Kennedys': 19853}}

#### Merging all of the Song ID json files

In [2]:
with open('../data/genius_song_ids.json', 'r') as f:
    genius_song_ids = json.load(f)
with open('../data/genius_song_ids_2.json', 'r') as f:
    genius_song_ids_2 = json.load(f)
with open('../data/genius_song_ids_3.json', 'r') as f:
    genius_song_ids_3 = json.load(f)

In [12]:
type(genius_song_ids_2[0]['response']['next_page'])

NoneType

In [63]:
sum([len(genius_song_ids), len(genius_song_ids_2), len(genius_song_ids_3)])

3702

Combined length of all the files...just want to make sure that nothing gets screwed up in the merger.

In [3]:
genius_song_ids.extend(genius_song_ids_2)
genius_song_ids.extend(genius_song_ids_3)

In [6]:
len(genius_song_ids) # this was the full genius_song_ids

15826

Ending list looks great

In [58]:
with open('../data/genius_song_ids_full.json', 'w') as f:
    json.dump(genius_song_ids_full, f)

In [6]:
with open('../data/genius_song_ids_full.json', 'r') as f:
    genius_song_ids_full = json.load(f)

##### `genius_song_ids` heirarchy

In [7]:
# genius_song_ids_full[-1]                                                     inidividual search result for artist
# genius_song_ids_full[-1]['response']['songs']                                listing of songs for each search
# genius_song_ids_full[-1]['response']['songs'][0]                             individual song record

# will save the following into a separate master list to query songs:

# genius_song_ids_full[-1]['response']['songs'][0]['id']                        song id
# genius_song_ids_full[-1]['response']['songs'][0]['title']                     song title
# genius_song_ids_full[-1]['response']['songs'][0]['primary_artist']['name']    artist name

genius_song_ids[-1]['response']['songs'][0]

{'annotation_count': 1,
 'api_path': '/songs/1532655',
 'full_title': 'Lie Detector by\xa0Dead\xa0Kennedys',
 'header_image_thumbnail_url': 'https://images.genius.com/c62cba45fb8816f0da545a0a08177ab5.300x300x1.jpg',
 'header_image_url': 'https://images.genius.com/c62cba45fb8816f0da545a0a08177ab5.1000x1000x1.jpg',
 'id': 1532655,
 'lyrics_owner_id': 1549345,
 'lyrics_state': 'complete',
 'path': '/Dead-kennedys-lie-detector-lyrics',
 'pyongs_count': None,
 'song_art_image_thumbnail_url': 'https://images.genius.com/c62cba45fb8816f0da545a0a08177ab5.300x300x1.jpg',
 'stats': {'hot': False, 'unreviewed_annotations': 0},
 'title': 'Lie Detector',
 'title_with_featured': 'Lie Detector',
 'url': 'https://genius.com/Dead-kennedys-lie-detector-lyrics',
 'primary_artist': {'api_path': '/artists/19853',
  'header_image_url': 'https://images.genius.com/41c71a26cd2e2d9f904b8944171c3cb5.1000x508x1.jpg',
  'id': 19853,
  'image_url': 'https://images.genius.com/cc5ef4b1982a7351ba7e0aff1578b7fe.300x300x

##### Creating Song List to Easily Query Genius API with

In [8]:
genius_song_list = []

In [9]:
for record in genius_song_ids:
    for songs in record['response']['songs']:
        genius_song_list.append((songs['id'],
                                 songs['title'],
                                 songs['primary_artist']['name']))

In [10]:
len(genius_song_list)

670164

In [11]:
with open('../pickle/genius_song_list.pkl', 'wb+') as f:
    pickle.dump(genius_song_list, f)

This is the total number of songs I'm going to query the Genius API for underlying details on

#### Testing Song ID Against Genius Song Search

In [12]:
url_test = 'https://api.genius.com/songs/389'
headers = {'Authorization':'Bearer IwvfcN2fO8fNm5ufyApn1c8Z9X3Nud-Z6wJhTeKocdLDR_Jtj6bW6POwfT81u6Of'}

In [13]:
r_test = requests.get(url_test,
                      headers=headers)
test_json = r_test.json()

In [15]:
test_json['response']['song']['description'] # can remove this
test_json['response']['song']['current_user_metadata'] # can remove this
test_json['response']['song']['description_annotation'] # can remove this

{'_type': 'referent',
 'annotator_id': 7,
 'annotator_login': 'Maboo',
 'api_path': '/referents/3490126',
 'classification': 'accepted',
 'fragment': '2 of Amerikaz Most Wanted',
 'id': 3490126,
 'is_description': True,
 'path': '/3490126/2pac-2-of-amerikaz-most-wanted/2-of-amerikaz-most-wanted',
 'range': {'content': '2 of Amerikaz Most Wanted'},
 'song_id': 389,
 'url': 'https://genius.com/3490126/2pac-2-of-amerikaz-most-wanted/2-of-amerikaz-most-wanted',
 'verified_annotator_ids': [],
 'annotatable': {'api_path': '/songs/389',
  'client_timestamps': {'updated_by_human_at': 1537621545,
   'lyrics_updated_at': 1537621545},
  'context': '2Pac',
  'id': 389,
  'image_url': 'https://images.genius.com/e47c72fcb1f2756daafc49dc5167d814.1000x1000x1.jpg',
  'link_title': '2 of Amerikaz Most Wanted by\xa02Pac (Ft.\xa0Snoop\xa0Dogg)',
  'title': '2 of Amerikaz Most Wanted',
  'type': 'Song',
  'url': 'https://genius.com/2pac-2-of-amerikaz-most-wanted-lyrics'},
 'annotations': [{'api_path': '/an

In [78]:
# genius_song_list[15000:151628]

In [106]:
url = 'https://api.genius.com/songs/' + str(genius_song_list[0][0])

'https://api.genius.com/songs/1352728'

### Retrieving Song Details

In [2]:
headers = {'Authorization':'Bearer IwvfcN2fO8fNm5ufyApn1c8Z9X3Nud-Z6wJhTeKocdLDR_Jtj6bW6POwfT81u6Of'}

#### Using sqlalchemy + postgres

In [3]:
# DSN format for database connections:  [protocol / database  name]://[username]:[password]@[hostname / ip]:[port]/[database name here]
engine = create_engine('postgresql://postgres:glide-mortuary-pod-cloy-belong@ec2-54-244-70-11.us-west-2.compute.amazonaws.com:5432/postgres')

##### Verifying Connection

In [4]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f3826422fd0>

In [5]:
song_id_list = pd.read_sql("""
                            SELECT DISTINCT g_song_id FROM genius_song_list
                            """, con=engine)

In [6]:
for i in range(len(song_id_list.iloc[:5])):
    url = 'https://api.genius.com/songs/' + str(song_id_list.iloc[i].values[0])
    print(url)

https://api.genius.com/songs/1
https://api.genius.com/songs/3
https://api.genius.com/songs/4
https://api.genius.com/songs/5
https://api.genius.com/songs/6


In [7]:
song_id_list.shape

(410100, 1)

In [8]:
song_id_list.iloc[149999]

g_song_id    872354
Name: 149999, dtype: int64

In [10]:
with open('../data/genius_song_details_1.json', 'r') as f:
    genius_song_details_1 = json.load(f)

##### Separating Song Details into 10 Separate json's

In [None]:
song_details_5 = []

num = 149999
for i in range(len(song_id_list.iloc[150000:200000])):
    num += 1
    url = 'https://api.genius.com/songs/' + str(song_id_list.iloc[num].values[0])
    try:
        song_details_5.append(requests.get(url, headers=headers).json())
    except:
        time.sleep(1)
        song_details_5.append(requests.get(url, headers=headers).json())
    if num % 10000 == 0:
        print('finished pulling song_loc {}'.format(num))
with open('../data/genius_song_details_5.json', 'w') as f:
    json.dump(song_details_5, f)

finished pulling song_loc 150000
finished pulling song_loc 160000
finished pulling song_loc 170000


In [13]:
song_details_4[-1]

{'meta': {'status': 200},
 'response': {'song': {'annotation_count': 1,
   'api_path': '/songs/872354',
   'apple_music_id': None,
   'apple_music_player_url': None,
   'description': {'dom': {'tag': 'root',
     'children': [{'tag': 'p', 'children': ['?']}]}},
   'embed_content': "<div id='rg_embed_link_872354' class='rg_embed_link' data-song-id='872354'>Read <a href='https://genius.com/Bruce-springsteen-ballad-of-the-self-loading-pistol-lyrics'>“Ballad Of The Self-Loading Pistol” by Bruce\xa0Springsteen</a> on Genius</div> <script crossorigin src='//genius.com/songs/872354/embed.js'></script>",
   'featured_video': False,
   'full_title': 'Ballad Of The Self-Loading Pistol by\xa0Bruce\xa0Springsteen',
   'header_image_thumbnail_url': 'https://images.genius.com/d8d4a9abab290dea00e9e03bdc40fd91.300x300x1.jpg',
   'header_image_url': 'https://images.genius.com/d8d4a9abab290dea00e9e03bdc40fd91.468x468x1.jpg',
   'id': 872354,
   'lyrics_owner_id': 1549345,
   'lyrics_state': 'complete',


In [70]:
with open('../data/song_details_1.json', 'w') as f:
    json.dump(song_details_1, f)

##### A Look at Song Details

##### Song Details Heirarchy

In [102]:
# song_details[0]['response']['song']                                   actual song details
# song_details[0]['response']['song']['id']                             song id
# song_details[0]['response']['song']['writer_artists']                 credits!
# song_details[0]['response']['song']['writer_artists'][0]              index for individual writer

{'api_path': '/artists/1048332',
 'header_image_url': 'https://assets.genius.com/images/default_avatar_300.png?1538666595',
 'id': 1048332,
 'image_url': 'https://assets.genius.com/images/default_avatar_300.png?1538666595',
 'is_meme_verified': False,
 'is_verified': False,
 'name': 'Dana Calitri',
 'url': 'https://genius.com/artists/Dana-calitri'}