Load relevant libraries

In [6]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import sqlite3
from sqlite3 import Error

Define parameters you need

In [16]:
#parameters
myToken = " " ## put your token here!
search_terms=["Adele","Future"] ### pick whichever artists are of interest

### Getting artists and their id's

Using the search terms (artist names) specified we will return the artist ids to carry through the ETL

In [8]:
def get_artists_ids(search_terms, myToken):
    artist_ids = []
    for search_term in search_terms:
        genius_search_url=f"http://api.genius.com/search?q={search_term}&access_token={myToken}"
        response = requests.get(genius_search_url)
        json_data = response.json()
        for artist_id in json_data['response']['hits']:
            artist_ids.append([artist_id['result']['primary_artist']['id'],artist_id['result']['primary_artist']['name']])
    artist_ids_df = pd.DataFrame(artist_ids)
    artist_ids_df.columns = ['artist_id', 'artist_name']
    artist_ids_df=artist_ids_df.drop_duplicates(subset = "artist_id")
    return(artist_ids_df)
            


In [9]:
get_artists_ids(search_terms, myToken)

Unnamed: 0,artist_id,artist_name
0,2300,Adele
7,33242,Idina Menzel
9,20698,Davido
10,2197,Future
11,568186,Drake & Future
12,1403568,"Jay Rock, Kendrick Lamar, Future & James Blake"
13,1739,Odd Future
17,345077,Joyner Lucas
18,615550,Billie Eilish


We use the artist id table to get a list of all artist ids to pass through the remaining functions

In [13]:
artist_ids_df=get_artists_ids(search_terms, myToken)
artist_ids =  list(artist_ids_df["artist_id"])

Using the list of all artist ids of interest and passing them into the artist api call to get artist information

In [17]:
def get_artist_information(artist_ids, myToken):
    artist_information= []
    for artist_id in artist_ids:
        myUrl = f"https://api.genius.com/artists/{artist_id}"
        head = {'Authorization': 'Bearer {}'.format(myToken)}
        artist_response = requests.get(myUrl, headers=head)
        artist_info = artist_response.json()
        artist_information.append([artist_info['response']['artist']['id'],artist_info['response']['artist']['name'],artist_info['response']['artist']['header_image_url'],artist_info['response']['artist']['url'],artist_info['response']['artist']['facebook_name'],artist_info['response']['artist']['instagram_name'],artist_info['response']['artist']['twitter_name']])
        artist_df = pd.DataFrame(artist_information)  
        artist_df.columns = ['artist_id', 'artist_name','header_image_url','url','facebook_name','instagram_name','twitter_name']
    
    return artist_df

In [19]:
get_artist_information(artist_ids, myToken)

Unnamed: 0,artist_id,artist_name,header_image_url,url,facebook_name,instagram_name,twitter_name
0,2300,Adele,https://images.genius.com/87aa5d8c32965a10e0e7...,https://genius.com/artists/Adele,adele,adele,Adele
1,33242,Idina Menzel,https://images.genius.com/8dae7cbdcc7da145e8a1...,https://genius.com/artists/Idina-menzel,,idinamenzel,idinamenzel
2,20698,Davido,https://images.genius.com/a19831630fffbd21d451...,https://genius.com/artists/Davido,Davido,davidoofficial,iam_Davido
3,2197,Future,https://images.genius.com/63f23f8b6cc6a664e9d0...,https://genius.com/artists/Future,FutureOfficial,future,1future
4,568186,Drake & Future,https://images.genius.com/6fe1d961adc9ecff9787...,https://genius.com/artists/Drake-and-future,,,
5,1403568,"Jay Rock, Kendrick Lamar, Future & James Blake",https://images.genius.com/b5de44b4cd4074c83261...,https://genius.com/artists/Jay-rock-kendrick-l...,,,
6,1739,Odd Future,https://images.genius.com/6d8e82b376ba175cfb19...,https://genius.com/artists/Odd-future,,,
7,345077,Joyner Lucas,https://images.genius.com/e4a1f4f393a6f2895ef7...,https://genius.com/artists/Joyner-lucas,JoynerLucas,JoynerLucas,JoynerLucas
8,615550,Billie Eilish,https://images.genius.com/a3ed3b2ef89f2e09aff8...,https://genius.com/artists/Billie-eilish,billieeilish,billieeilish,billieeilish


# Getting songs for artists 

Because the artist song calls are paginated the below function collects how many pages the artist has to use in other functions. 

Making sure we collect all songs by ensuring we grab all pages of songs by artist

In [20]:
def get_artist_pages(artist_id,myToken):
    new_results = True
    page = 1
    pages_list=[]
    while new_results:
        myUrl = f"https://api.genius.com/artists/{artist_id}/songs?per_page=50&page={page}"
        head = {'Authorization': 'Bearer {}'.format(myToken)}
        songs_response = requests.get(myUrl, headers=head)
        songs_json=songs_response.json()
        new_results = isinstance(songs_json['response']['next_page'], int)
        pages_list.append(page)
        page += 1
        
    return (pages_list)

Ensuring we loop through all artists of interest and using the page function to loop through all pages for each artist

In [21]:
def get_artist_song_information(artist_ids,myToken):
    artist_song=[]
    for artist_id in artist_ids:
        pages=get_artist_pages(artist_id,myToken)
        for page in pages:
            myUrl = f"https://api.genius.com/artists/{artist_id}/songs?per_page=50&page={page}"
            head = {'Authorization': 'Bearer {}'.format(myToken)}
            songs_response = requests.get(myUrl, headers=head)
            songs_json=songs_response.json()
            for song in songs_json['response']['songs']:
                artist_song.append([artist_id,song['id'],song['title'],song['url'],song['primary_artist']['id'],song['release_date_components'],song['release_date_for_display']])
    
    artist_songs_df = pd.DataFrame(artist_song)
    artist_songs_df.columns = ['artist_id', 'song_id','title','url','primary_artist_id','date_components','date_display']
    return artist_songs_df

In [22]:
get_artist_song_information(artist_ids,myToken)

Unnamed: 0,artist_id,song_id,title,url,primary_artist_id,date_components,date_display
0,2300,2452146,19 [Booklet],https://genius.com/Adele-19-booklet-annotated,2300,"{'year': 2008, 'month': 1, 'day': 28}","January 28, 2008"
1,2300,2985039,2017 Grammy’s Song of the Year Speech,https://genius.com/Adele-2017-grammys-song-of-...,2300,"{'year': 2017, 'month': 2, 'day': 12}","February 12, 2017"
2,2300,2452149,21 [Booklet],https://genius.com/Adele-21-booklet-annotated,2300,"{'year': 2011, 'month': 1, 'day': 24}","January 24, 2011"
3,2300,2452122,25 [Booklet],https://genius.com/Adele-25-booklet-annotated,2300,"{'year': 2015, 'month': 11, 'day': 20}","November 20, 2015"
4,2300,2378169,25 Thank You Letter,https://genius.com/Adele-25-thank-you-letter-l...,2300,"{'year': 2015, 'month': 11, 'day': 20}","November 20, 2015"
...,...,...,...,...,...,...,...
3284,615550,4586636,You should see me in a crown (IIZI Remix),https://genius.com/Billie-eilish-you-should-se...,615550,"{'year': 2019, 'month': 2, 'day': 9}","February 9, 2019"
3285,615550,4587674,You should see me in a crown[killer frost remix],https://genius.com/Billie-eilish-you-should-se...,615550,"{'year': 2018, 'month': 8, 'day': 15}","August 15, 2018"
3286,615550,4586552,You should see me in a crown[live from Ellen],https://genius.com/Billie-eilish-you-should-se...,615550,"{'year': 2018, 'month': 10, 'day': 10}","October 10, 2018"
3287,615550,3983228,You should see me in a crown (Remix),https://genius.com/Childish-major-you-should-s...,33062,"{'year': 2018, 'month': 9, 'day': 26}","September 26, 2018"


The API does NOT include lyrics which were something we wanted for future analyses. Creating a basic scraper to scrape the urls, only some of the urls are lyrics links

In [25]:
def scrape_song_lyrics_from_url(URL):
        """Use BeautifulSoup to scrape song info off of a Genius song URL"""                                
        page = requests.get(URL)    
        html = BeautifulSoup(page.text, 'html.parser')
        
        # Scrape the song lyrics from the HTML
        lyrics = html.find('div', class_='Lyrics__Container-sc-1ynbvzw-6 YYrds').get_text().encode('utf-8', 'ignore').decode('utf-8')
        
        return lyrics

Because not all urls are lyrics creating a try catch function to pass all the urls that arent the relevant html pages for lyrics

In [26]:
def catch(url):
    try:
        return scrape_song_lyrics_from_url(url)
    except:
        pass

Creating song table from the artist songs pull

In [27]:
### get song table

def get_song_df(artist_ids,myToken):
    artist_songs=get_artist_song_information(artist_ids,myToken)
    songs=artist_songs[['song_id','title','url','date_display']]
    songs['lyrics']=songs.apply(lambda x: catch(x['url']), axis=1)
    song_df=songs.drop_duplicates(subset = "song_id")
    
    return songs

    

    

In [28]:
get_song_df(artist_ids,myToken)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs['lyrics']=songs.apply(lambda x: catch(x['url']), axis=1)


Unnamed: 0,song_id,title,url,date_display,lyrics
0,2452146,19 [Booklet],https://genius.com/Adele-19-booklet-annotated,"January 28, 2008",
1,2985039,2017 Grammy’s Song of the Year Speech,https://genius.com/Adele-2017-grammys-song-of-...,"February 12, 2017",[Adele]Thank you very much. Thank you. First o...
2,2452149,21 [Booklet],https://genius.com/Adele-21-booklet-annotated,"January 24, 2011",
3,2452122,25 [Booklet],https://genius.com/Adele-25-booklet-annotated,"November 20, 2015",
4,2378169,25 Thank You Letter,https://genius.com/Adele-25-thank-you-letter-l...,"November 20, 2015",Thank you for the time I've been givenThank yo...
...,...,...,...,...,...
3284,4586636,You should see me in a crown (IIZI Remix),https://genius.com/Billie-eilish-you-should-se...,"February 9, 2019","[Verse 1]Bite my tongue, bide my timeWearing a..."
3285,4587674,You should see me in a crown[killer frost remix],https://genius.com/Billie-eilish-you-should-se...,"August 15, 2018","[Verse 1]Bite my tongue, bide my timeWearing a..."
3286,4586552,You should see me in a crown[live from Ellen],https://genius.com/Billie-eilish-you-should-se...,"October 10, 2018","[Verse 1]Bite my tongue, bide my timeWearing a..."
3287,3983228,You should see me in a crown (Remix),https://genius.com/Childish-major-you-should-s...,"September 26, 2018","[Verse 1: Billie Eilish]Bite my tongue, bide m..."


Creating an artist song table from the artist song pull

In [796]:
## get artist song table

def get_artist_song_table(artist_ids,myToken):
    artist_songs=get_artist_song_information(artist_ids,myToken)
    artist_songs_df=artist_songs[['artist_id','song_id','primary_artist_id']]
    artist_songs_df.loc[artist_songs_df['artist_id']==artist_songs_df['primary_artist_id'], 'is_primary'] = '1'  
    artist_songs_df.loc[artist_songs_df['artist_id']!=artist_songs_df['primary_artist_id'], 'is_primary'] = '0'
    artist_songs_final_df=artist_songs_df[['artist_id','song_id','is_primary']] ## this flag will determine if the artist listed was the primary artist for the song id 
    return(artist_songs_final_df)


In [797]:
get_artist_song_table(artist_ids,myToken)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_songs_df.loc[artist_songs_df['artist_id']==artist_songs_df['primary_artist_id'], 'is_primary'] = '1'


Unnamed: 0,artist_id,song_id,is_primary
0,2300,2452146,1
1,2300,2985039,1
2,2300,2452149,1
3,2300,2452122,1
4,2300,2378169,1
...,...,...,...
304,1207590,3253625,1
305,1207590,3218543,1
306,1207590,3284985,1
307,1207590,4069990,1
