### Imports

In [2]:
import requests
from urllib.parse import quote

from bs4 import BeautifulSoup

import pickle as pkl 

### Load Data
Here you can load the data produced by the API requests and scrapes below without re-running them. A Genius API access token is only required if you want to run the request code yourself.

In [3]:
"""
Load artist names.
"""
# Unpickle the artist name list (only load pickle files you trust).
with open('data/artist_names.pkl', 'rb') as names_file:
    artists = pkl.load(names_file)
    

In [4]:
"""
Load artist API paths. 
"""
# Unpickle the stored artist API paths (only load pickle files you trust).
with open('data/all_artist_paths.pkl', 'rb') as artist_paths_file:
    all_artist_paths = pkl.load(artist_paths_file)

In [5]:
"""
Load song paths for every artist. 
"""
# Unpickle the stored per-artist song paths (only load pickle files you trust).
with open('data/all_song_paths.pkl', 'rb') as song_paths_file:
    all_song_paths = pkl.load(song_paths_file)

In [6]:
"""
Load all song lyrics.
"""
# Unpickle the stored lyrics (only load pickle files you trust).
with open('data/all_song_lyrics.pkl', 'rb') as lyrics_file:
    all_song_lyrics = pkl.load(lyrics_file)

### API Request and Scraping Functions

In [6]:
# base genius API endpoint
url_api = "https://api.genius.com"

# access token for API requests
# you will need to obtain your own access token from the Genius API to run these requests
with open('client_access_token.txt', 'r') as f:
    # strip surrounding whitespace: text files usually end with a newline, which
    # would otherwise become part of the Authorization header value below
    client_access_token = f.read().strip()

# headers for API requests, passed as `headers=` to every requests.get call against url_api
headers = {"Authorization": "Bearer " + client_access_token, "User-Agent":""}



In [7]:
def get_artist_profile(artist_name):
    """
    Search the Genius API for `artist_name` and return the decoded JSON response as-is.
    """
    # percent-encode the name so it can be embedded in the query string
    search_endpoint = url_api + "/search?q=" + quote(artist_name)
    return requests.get(search_endpoint, headers=headers).json()

In [14]:
def get_artist_path(artist_name):
    """
    get API path for an artist

    The Genius search endpoint only returns song objects, so the artist is
    recovered from the `primary_artist` of a search hit. The first hit whose
    primary artist name equals `artist_name` (ignoring case and surrounding
    whitespace) is preferred; if none matches, the primary artist of the first
    hit is used, which can be a different artist (e.g. when the searched artist
    only appears as a feature on the top result).

    artist_name: artist name to search for
    returns: the `api_path` string of the selected primary artist
    raises: IndexError if the search returns no hits
    """
    # percent-encode the artist name so it can be embedded in the query string
    url_search = "/search?q="
    querystring = url_api + url_search + quote(artist_name)
    # get API response
    response_artist = requests.get(querystring, headers=headers).json()

    hits = response_artist['response']['hits']
    if not hits:
        raise IndexError("Genius search returned no hits for {!r}".format(artist_name))

    # prefer a hit where the searched artist is the primary artist
    # assumes the primary_artist object carries a `name` field -- hits without one never match
    wanted = artist_name.strip().casefold()
    for hit in hits:
        primary_artist = (hit.get('result') or {}).get('primary_artist') or {}
        if (primary_artist.get('name') or '').strip().casefold() == wanted:
            return primary_artist['api_path']

    # no name match: fall back to the primary artist of the first search result
    return hits[0]['result']['primary_artist']['api_path']

In [20]:
def get_artist_url(artist_name):
    """
    get genius.com url for an artist

    The Genius search endpoint only returns song objects, so the artist is
    recovered from the `primary_artist` of a search hit. The first hit whose
    primary artist name equals `artist_name` (ignoring case and surrounding
    whitespace) is preferred; if none matches, the primary artist of the first
    hit is used, which can be a different artist (e.g. when the searched artist
    only appears as a feature on the top result).

    artist_name: artist name to search for
    returns: the `url` string of the selected primary artist
    raises: IndexError if the search returns no hits
    """
    # percent-encode the artist name so it can be embedded in the query string
    url_search = "/search?q="
    querystring = url_api + url_search + quote(artist_name)
    # get API response
    response_artist = requests.get(querystring, headers=headers).json()

    hits = response_artist['response']['hits']
    if not hits:
        raise IndexError("Genius search returned no hits for {!r}".format(artist_name))

    # prefer a hit where the searched artist is the primary artist
    # assumes the primary_artist object carries a `name` field -- hits without one never match
    wanted = artist_name.strip().casefold()
    for hit in hits:
        primary_artist = (hit.get('result') or {}).get('primary_artist') or {}
        if (primary_artist.get('name') or '').strip().casefold() == wanted:
            return primary_artist['url']

    # no name match: fall back to the primary artist of the first search result
    return hits[0]['result']['primary_artist']['url']

In [15]:
def get_song_list(url_artist, per_page=50):
    """
    get the first page of song objects for an artist, keeping only the songs
    where that artist is the primary artist (so fewer than `per_page` songs may
    be returned)
    **it should be noted that the Genius API lists essentially all content as a "song" object,
    meaning that this will return interview transcripts and other non-song content that must be dealt with in processing

    url_artist: artist API path, e.g. as returned by get_artist_path
    per_page: number of songs requested from the API (defaults to 50, the value originally hard-coded)
              NOTE(review): the Genius API is believed to cap per_page at 50 -- confirm before passing more
    returns: the decoded JSON response, with response['songs'] filtered as described
    """
    # request the first per_page songs listed for that artist
    querystring = url_api + url_artist + "/songs" + "?per_page=" + str(per_page)
    songs = requests.get(querystring, headers=headers).json()

    # reduce songs to only those where target artist is the primary artist
    songs['response']['songs'] = [
        song for song in songs['response']['songs']
        if song['primary_artist']['api_path'] == url_artist
    ]

    return songs

In [16]:
def get_song_paths(songs):
    """
    Collect the `api_path` of every song object in a song-list response.

    Each path can usually (not always) be appended to https://genius.com
    to reach that song's page.
    """
    return [entry['api_path'] for entry in songs['response']['songs']]


In [17]:
def get_lyrics(song_path):
    """
    get song lyrics for a given song URL

    song_path: song path (e.g. from get_song_paths); it is appended to https://genius.com
    returns: text of the first <p> inside the first <div class="lyrics"> on the page
    raises: IndexError if the page has no <div class="lyrics">,
            AttributeError if that div contains no <p>
    """
    url = 'https://genius.com' + song_path
    lyrics_page = requests.get(url).text
    # name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed, so results can differ between machines (and it warns)
    lyrics_soup = BeautifulSoup(lyrics_page, 'html.parser')
    lyrics_div = lyrics_soup.find('div', class_='lyrics')
    if lyrics_div is None:
        # same exception type as indexing an empty find_all() result, but with context
        raise IndexError("no <div class='lyrics'> found at {}".format(url))
    lyrics = lyrics_div.find('p').text
    return lyrics


### Scraping and API Requests

In [13]:
"""
Scraping/cleaning/storing artist names from Wikipedia.
"""

# The code below is intentionally commented out: the resulting list is loaded
# from a pickle in the "Load Data" section. Uncomment to re-run the scrape.

# # Scrape ~1400 popular hip hop musicians from wikipedia
# response = requests.get('https://en.wikipedia.org/wiki/List_of_hip_hop_musicians').text
# soup = BeautifulSoup(response)

# artist_tags = []
# for item in soup.find_all('li'): 
#     artist_tags.append(item.text)

# # NOTE(review): the [29:-61] slice presumably trims non-artist <li> items and
# # depends on the page layout at scrape time -- verify before re-running
# artists = artist_tags[29:-61]

# # remove annotation links (e.g. 'Drake[1]' -> 'Drake')
# # remove parenthesized annotations
# # NOTE: artist[:-3] drops exactly three characters, i.e. it assumes a
# # single-digit marker such as '[1]'
# annotations_list = set([artist for artist in artists if artist[-1] == ']'])
# drops = []
# for index, artist in enumerate(artists):
#     if artist in annotations_list: 
#         artists[index] = artist[:-3]
#     if artist == 'Torch (American)': 
#         artists[index] = 'Torch (Triple C)'
#     if artist == 'Torch (German)':
#         artists[index] = 'Torch'
#     if artist == 'Casanova (rapper)':
#         artists[index] = 'Casanova'
#     if artist == 'Alias (musician)' or artist == 'Juice (Đus)':
#         drops.append(index)

# # delete from the highest index down so earlier deletions do not shift later ones
# for index in sorted(drops, reverse=True): 
#     del artists[index]

# # Store as pickle
# # (opened in 'wb' mode: pkl.dump needs a writable binary file, 'rb' would fail)
# # NOTE: the load cell reads 'data/artist_names.pkl'; move the file there or adjust the path
# with open('artist_names.pkl', 'wb') as f: 
#     pkl.dump(artists, f)


'\nScraping/cleaning/storing artist names from Wikipedia.\n'

In [21]:
"""
Pull and store artist URL's

**note: this didn't check for primary_artist so it didn't pull properly for anyone who's not the primary artist on their first searched song
e.g. a bunch of people's names will link to Kanye
"""
# Commented out on purpose (one API request per artist). Uncomment to re-run.
# Artists whose lookup fails are stored as None.
# all_artist_urls = {}
# for artist in artists:
#     try:
#         all_artist_urls[artist] = get_artist_url(artist)
#     except Exception:  # not a bare except, so Ctrl-C still interrupts the loop
#         all_artist_urls[artist] = None    

# with open('all_artist_urls.pkl', 'wb') as f:
#     pkl.dump(all_artist_urls, f)

In [14]:
"""
Pull and store artist API paths. 
"""
# Commented out on purpose (one API request per artist). Uncomment to re-run.
# Artists whose lookup fails are stored as None.
# # Pull artist API path for each artist in list
# all_artist_paths = {}
# for artist in artists:
#     try:
#         all_artist_paths[artist] = get_artist_path(artist)
#     except Exception:  # not a bare except, so Ctrl-C still interrupts the loop
#         all_artist_paths[artist] = None    
#
# # NOTE: the load cell reads 'data/all_artist_paths.pkl'; move the file there or adjust the path
# with open('all_artist_paths.pkl', 'wb') as f:
#     pkl.dump(all_artist_paths, f)


'\nPull and store artist API paths. \n'

In [15]:
"""
Pull and store song paths by artist. 
"""
# Commented out on purpose (one API request per artist). Uncomment to re-run.
# An artist maps to None when the request fails, including when the artist's
# own API path is None (get_song_list then raises while building the URL).
# all_song_paths = {}
# for artist in artists:
#     path = all_artist_paths[artist]
#     try: 
#         songs = get_song_list(path)
#         all_song_paths[artist] = get_song_paths(songs)
#     except Exception:  # not a bare except, so Ctrl-C still interrupts the loop
#         all_song_paths[artist] = None

# # NOTE: the load cell reads 'data/all_song_paths.pkl'; move the file there or adjust the path
# with open('all_song_paths.pkl', 'wb') as f:
#     pkl.dump(all_song_paths, f)

'\nPull and store song paths by artist. \n'

In [16]:
"""
Scrape and store lyrics for each song
"""
# Commented out on purpose (one page request per song). Uncomment to re-run.
# Result shape: {artist: {song_path: lyrics}}; an artist keeps an empty dict
# when nothing could be scraped for them.
# NOTE: the try wraps the whole per-artist loop, so the first failing song
# (or a None entry in all_song_paths) skips that artist's remaining songs.
# all_song_lyrics = {}
# for artist in artists:
#     all_song_lyrics[artist] = {}
#     try:       
#         for song in all_song_paths[artist]:
#             lyrics = get_lyrics(song)
#             all_song_lyrics[artist][song] = lyrics
#     except Exception:  # not a bare except, so Ctrl-C still interrupts the loop
#         pass

# # NOTE: the load cell reads 'data/all_song_lyrics.pkl'; move the file there or adjust the path
# with open('all_song_lyrics.pkl', 'wb') as f:
#     pkl.dump(all_song_lyrics, f)

'\nScrape and store lyrics for each song\n'

In [7]:
"""
Identify artists who we failed to pull lyrics for
"""
# an artist counts as skipped when their lyrics mapping is an empty dict
skipped_artists = [
    artist
    for artist, lyrics_by_song in all_song_lyrics.items()
    if lyrics_by_song == {}
]

In [None]:
"""
Try scraping again in case it was due to some request error

**Preloaded data has already run this so it will throw an error. Uncommment for use on a fresh scrape.
"""
# Retries every song of each skipped artist; here the try is per song, so one
# failing song does not stop the remaining songs for that artist.
# skipped_song_lyrics = {}
# for artist in skipped_artists:
#     skipped_song_lyrics[artist] = {}  
#     # all_song_paths[artist] can be None when the song list request failed;
#     # `or []` keeps the loop from raising outside the try below
#     for song in all_song_paths[artist] or []:
#         try:
#             lyrics = get_lyrics(song)
#             skipped_song_lyrics[artist][song] = lyrics
#         except Exception:  # not a bare except, so Ctrl-C still interrupts the loop
#             pass

# # shallow copy, then replace the entries of the retried artists
# all_song_lyrics_v2 = all_song_lyrics.copy()
# for artist in skipped_song_lyrics.keys(): 
#     all_song_lyrics_v2[artist] = skipped_song_lyrics[artist]

Store our lyrics and head over to topic modeling.

In [43]:
# `all_song_lyrics_v2` is only created by the retry cell, which is commented out
# by default. On a fresh kernel there is nothing new to store, so skip the write
# instead of failing with a NameError.
if 'all_song_lyrics_v2' in globals():
    with open('data/all_song_lyrics_v2.pkl', 'wb') as f:
        pkl.dump(all_song_lyrics_v2, f)
else:
    print("all_song_lyrics_v2 is not defined (retry cell not run); nothing was written.")