### Imports

In [1]:
import requests
from urllib.parse import quote

from bs4 import BeautifulSoup

import pickle as pkl 

### Load Data

In [38]:
"""
Load artist names.
"""
# Unpickle the artist names saved in 'artist_names.pkl' into `artists`.
# NOTE: pickle can execute arbitrary code while loading -- only open
# pickle files that were produced locally by this notebook.
with open('artist_names.pkl', 'rb') as f: 
    artists = pkl.load(f)
    

In [None]:
"""
Load artist API paths. 
"""
# Unpickle the stored artist API paths into `all_artist_paths`.
# The commented-out "Pull and store artist API paths" cell builds this as a
# dict of artist name -> API path, with None where the lookup failed.
with open('all_artist_paths.pkl', 'rb') as f:
    all_artist_paths = pkl.load(f)

In [40]:
"""
Load song paths for every artist. 
"""
# Unpickle the stored song paths into `all_song_paths`.
# The commented-out "Pull and store song paths" cell builds this as a dict of
# artist name -> list of song API paths, with None where the lookup failed.
with open('all_song_paths.pkl', 'rb') as f: 
    all_song_paths = pkl.load(f)

### API Request and Scraping Functions

In [2]:
# base genius endpoint
url_api = "https://api.genius.com"

# access token for API queries
# Read the token BEFORE building the headers: the headers embed it, so on a
# fresh kernel the reverse order fails with a NameError.
# strip() removes any trailing newline/whitespace left in the token file,
# which would otherwise end up inside the Authorization header value.
with open('client_access_token.txt', 'r') as f:
    client_access_token = f.read().strip()

# headers for API requests
headers = {"Authorization": "Bearer " + client_access_token, "User-Agent": ""}


In [3]:
def get_artist_path(artist_name):
    """
    Search the API for artist_name and return the API path of the primary
    artist of the first search hit.

    Raises IndexError when the search returns no hits.
    """
    # percent-encode the name so spaces and special characters are URL-safe
    search_url = "{}/search?q={}".format(url_api, quote(artist_name))
    search_hits = requests.get(search_url, headers=headers).json()['response']['hits']
    # every search result is a song object, so the artist is taken from the
    # top hit -- this assumes they are that song's primary artist
    top_result = search_hits[0]['result']
    return top_result['primary_artist']['api_path']

In [15]:
def get_song_list(url_artist):
    """
    Request the first page (50 entries) of songs for the artist at url_artist
    and return the parsed JSON response, keeping only the songs whose primary
    artist is that same artist.
    """
    page_size = 50
    # plain concatenation on purpose: a non-string path (e.g. None) raises
    # TypeError here, before any request is sent
    songs_url = url_api + url_artist + "/songs" + "?per_page=" + str(page_size)
    payload = requests.get(songs_url, headers=headers).json()

    # filter the song list in place, dropping features / guest appearances
    song_entries = payload['response']['songs']
    song_entries[:] = [
        entry for entry in song_entries
        if entry['primary_artist']['api_path'] == url_artist
    ]

    return payload

In [5]:
def get_song_paths(songs):
    """
    Return the 'api_path' of every song in a songs response
    (songs['response']['songs']), in the same order.
    """
    return [entry['api_path'] for entry in songs['response']['songs']]


In [6]:
# def get_artist_songs(artist):
#     path = all_artist_paths[artist]
#     songs = get_song_list(path)
#     paths = get_song_paths(songs)

#     return paths

In [10]:
# def get_all_artist_songs(artist_names):
#     artist_songs = {}
#     for artist in artist_names:
#         try:
#             artist_songs[artist] = get_artist_songs(artist)
#         except:
#             artist_songs[artist] = None
#     return artist_songs



In [63]:
def get_lyrics(song_path):
    """
    Download the genius.com page for song_path and return the text of the
    first <p> inside the first <div class="lyrics">.

    song_path is the site-relative path appended to 'https://genius.com'.

    Raises IndexError when the page has no <div class="lyrics"> and
    AttributeError when that div contains no <p>.
    """
    url = 'https://genius.com' + song_path
    lyrics_page = requests.get(url).text
    # Name the parser explicitly: without it BeautifulSoup guesses from
    # whatever is installed, so results can differ between machines and a
    # warning is emitted on every call.
    lyrics_soup = BeautifulSoup(lyrics_page, 'html.parser')
    lyrics = lyrics_soup.find_all('div', class_='lyrics')[0].find('p').text
    return lyrics


### Scraping and API Requests

In [7]:
"""
Scraping/cleaning/storing artist names from Wikipedia.
"""

# # Scrape ~1400 popular hip hop musicians from wikipedia
# response = requests.get('https://en.wikipedia.org/wiki/List_of_hip_hop_musicians').text
# soup = BeautifulSoup(response)

# artist_tags = []
# for item in soup.find_all('li'): 
#     artist_tags.append(item.text)

# artists = artist_tags[29:-61]

# # remove annotation links (e.g. 'Drake[1]' -> 'Drake')
# # remove parenthesized annotations
# annotations_list = set([artist for artist in artists if artist[-1] == ']'])
# drops = []
# for index, artist in enumerate(artists):
#     if artist in annotations_list: 
#         artists[index] = artist[:-3]
#     if artist == 'Torch (American)': 
#         artists[index] = 'Torch (Triple C)'
#     if artist == 'Torch (German)':
#         artists[index] = 'Torch'
#     if artist == 'Casanova (rapper)':
#         artists[index] = 'Casanova'
#     if artist == 'Alias (musician)' or artist == 'Juice (Đus)':
#         drops.append(index)

# for index in sorted(drops, reverse=True): 
#     del artists[index]

# # Store as pickle
# with open('artist_names.pkl', 'wb') as f:
#     pkl.dump(artists, f)


In [18]:
"""
Pull and store artist API paths. 
"""
# # Pull artist API path for each artist in list
# all_artist_paths = {}
# for artist in artists:
#     try:
#         all_artist_paths[artist] = get_artist_path(artist)
#     except:
#         all_artist_paths[artist] = None    
#
# with open('all_artist_paths.pkl', 'wb') as f:
#     pkl.dump(all_artist_paths, f)


In [38]:
"""
Pull and store song paths by artist. 
"""
# all_song_paths = {}
# for artist in artists:
#     path = all_artist_paths[artist]
#     try: 
#         songs = get_song_list(path)
#         all_song_paths[artist] = get_song_paths(songs)
#     except:
#         all_song_paths[artist] = None

# with open('all_song_paths.pkl', 'wb') as f:
#     pkl.dump(all_song_paths, f)

In [67]:
"""
Scrape and store lyrics for each song
"""
# Build {artist: {song_path: lyrics}} for every artist.
all_song_lyrics = {}
for artist in artists:
    all_song_lyrics[artist] = {}
    # Artists whose song lookup failed are stored as None (or may be absent);
    # treat both as "no songs" and leave their entry empty.
    song_paths = all_song_paths.get(artist) or []
    for song in song_paths:
        # Guard each song on its own so that one bad page does not discard
        # the rest of this artist's songs.
        try:
            lyrics = get_lyrics(song)
        except Exception as err:
            # `except Exception` (not a bare except) keeps Ctrl-C working
            # during the long scrape; report the failure and move on.
            print("Skipping {} ({}): {!r}".format(song, artist, err))
            continue
        all_song_lyrics[artist][song] = lyrics

In [72]:
# Persist the scraped lyrics ({artist: {song_path: lyrics}}) to
# 'all_song_lyrics.pkl' so the scrape does not have to be repeated.
with open('all_song_lyrics.pkl', 'wb') as f:
    pkl.dump(all_song_lyrics, f)

12