In [1]:
# Import libraries.
import pandas as pd
import numpy as np
import requests
import copy
from bs4 import BeautifulSoup, Comment, NavigableString
import sys, codecs, json

In [2]:
# Store personal API key as variable.
API_KEY = '______'

Since the free MusixMatch API key I was granted only allows for 2000 API calls, I broke the artists for whom song data will be pulled into two sets.  Artists were deliberately selected as being very synonymous with particular genres.

In [4]:
# Imbedded dictionaries of genre: artist: albums: songs.
artist_albums = {'heavy metal': {'metallica': {},
                                 'slayer': {},
                                 'judas priest': {}},
                 'rap': {'wu-tang clan': {},
                         'nas': {},
                         '2pac': {}},
                 'country': {'johnny cash': {},
                             'patsy cline': {},
                              'willie nelson': {}},
                 'pop': {'taylor swift': {},
                         'katy perry': {},
                         'lady gaga': {}},
                 'rock': {'def leppard': {},
                          'poison': {},
                          'mötley crüe': {}}}

In [63]:
# # Imbedded dictionaries of genre: artist: albums: songs.
# artist_albums = {'heavy metal': {'megadeth': {},
#                                  'iron maiden': {},
#                                  'lamb of god': {}},
#                  'rap': {'dr. dre': {},
#                          'eminem': {},
#                          'the notorious b.i.g.': {}},
#                  'country': {'garth brooks': {},
#                              'toby keith': {}},
#                  'pop': {'backstreet boys': {},
#                          'britney spears': {},
#                          '*nsync': {}},
#                  'rock': {'ac/dc': {},
#                           'van halen': {},
#                           'led zeppelin': {}}}

Since the MusixMatch API only provides 30% of a song's lyrics, I found another site suitable for scraping and combined it with the song data pulled from MusixMatch.  This segment pulls all the data from MusixMatch.

In [7]:
# Format artist names for Musixmatch API search.
artists = [artist for genre in artist_albums for artist in artist_albums[genre]]

# Retrieve all Musixmatch API artist ID's 
artist_ids = [requests.get('https://api.musixmatch.com/ws/1.1/artist.search?format=json&callback=callback&q_artist=' + artist + '&apikey=' + str(API_KEY)).json()['message']['body']['artist_list'][0]['artist']['artist_id'] 
              for artist in artists]

# Use Musixmatch API artist ID's to retrieve album list for each artist.
albums = [requests.get('https://api.musixmatch.com/ws/1.1/artist.albums.get?format=json&callback=callback&artist_id=' + str(ID) + '&page_size=100&apikey=' + str(API_KEY)).json()['message']['body']['album_list'] 
          for ID in artist_ids]

# Pair each album with its corresponding Musixmatch API album ID.
album_id_list = [(albums[i][j]['album']['artist_name'].lower(),
                  albums[i][j]['album']['album_name'].lower(),
                  albums[i][j]['album']['album_id']) 
                 for i in range(len(albums)) 
                 for j in range(len(albums[i]))]

# Store all albums under their respective artist in 'artist_albums' dictionary.
for genre in artist_albums:
    for artist in artist_albums[genre]:
        for artist_, album_, ID in album_id_list:
            if artist_ == artist:
                artist_albums[genre][artist][album_] = {}
                
# Use Musixmatch API album ID's to retrieve track list for each album and pair track with album title and remove duplicates.
album_tracks = []

for artist,album, ID in album_id_list:
    api_track_info = requests.get('https://api.musixmatch.com/ws/1.1/album.tracks.get?format=json&callback=callback&album_id=' + str(ID) + '&apikey=' + str(API_KEY))
    track_list = api_track_info.json()['message']['body']['track_list']
    for i in range(len(track_list)):
        if track_list[i]['track']['instrumental'] == 0:
            track = track_list[i]['track']['track_name'].lower().split('-')[0].rstrip()
            if (artist,album,track) not in album_tracks:
                album_tracks.append((artist,album,track))
                
# Save track lists under each album in 'artist_album' dictionary.
for genre in artist_albums:
    for artist in artist_albums[genre]:
        for album in artist_albums[genre][artist]:
            for artist_, album_, track in album_tracks:
                if album_ == album:
                    artist_albums[genre][artist][album][track] = ''

This segment searches 'lyrics.wikia.com' for the artist and tracks and scrapes the lyrics.

In [23]:
# Scrap lyrics to all tracks in 'artist_album' dictionary and save to csv's.
no_lyrics = []

for genre in artist_albums:
    for artist in artist_albums[genre]:
        for album in artist_albums[genre][artist]:
            for track in artist_albums[genre][artist][album]:
                if artist == '*nsync':
                    singer = artist.upper()
                else: 
                    singer = artist.replace(' ', '_')
                
                song = track.replace(' ', '_')
                
                r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(singer,song))
                s = BeautifulSoup(r.text)
                
                # Get main lyrics holder
                lyrics = s.find("div",{'class':'lyricbox'})
                
                # Collect songs for which lyrics were not found.
                if lyrics is None:
                    no_lyrics.append((artist, track))
                else:
                    # Remove Scripts
                    [s.extract() for s in lyrics('script')]
                    
                    # Remove Comments
                    comments = lyrics.findAll(text=lambda text:isinstance(text, Comment))
                    [comment.extract() for comment in comments]    
                    
                    # Remove unecessary tags
                    for tag in ['div','i','b','a']:
                        for match in lyrics.findAll(tag):
                            match.replaceWithChildren()
                            
                    # Get output as a string and remove non unicode characters and replace <br> with newlines
                    output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode("utf-8").replace('\n','').replace('<br/>',' ')
                    
                    try:
                        artist_albums[genre][artist][album][track] = output
                    except:
                        artist_albums[genre][artist][album][track] = str(output.encode('utf-8'))

In [24]:
# Create DataFrame with lyrics, song, album, artist and genre.
df_list = []

for genre in artist_albums:
    for artist in artist_albums[genre]:
        for album in artist_albums[genre][artist]:
            for track in artist_albums[genre][artist][album]:
                data = [artist_albums[genre][artist][album][track], track, album, artist, genre]
                df_list.append(data)

master = pd.DataFrame(df_list, columns=['lyrics', 'song', 'album', 'artist', 'genre'])
master.drop_duplicates(subset='lyrics', inplace=True)
master = master.dropna(axis=0, how='any')

In [25]:
# Calculate the difference between songs collected from MusixMatch API and those with lyrics scraped from 'lyrics.wikia.com'
artists_ = []
album_sums = []

for genre in artist_albums:
    for artist in artist_albums[genre]:
        artists_.append(artist)
        track_totals = []
        for album in artist_albums[genre][artist]:
            album_tracks_ = []
            for track in artist_albums[genre][artist][album]:
                album_tracks_.append(track)
            track_count = len(album_tracks_)
            track_totals.append(track_count)
        sums = sum(track_totals)
        album_sums.append(sums)
        
after_lyrics = master.artist.value_counts().sort_index()

before_lyrics = pd.Series(album_sums, index=artists_).sort_index()

track_diff = []
for ind in before_lyrics.index:
    diff = before_lyrics[ind] - after_lyrics[ind]
    track_diff.append(diff)
    
lyrics_difference = pd.Series(track_diff, index=before_lyrics.index)

DF = pd.concat([before_lyrics,after_lyrics,lyrics_difference], axis=1)
DF.columns=['total tracks', 'tracks w/ lyrics', 'difference']
DF

Unnamed: 0,total tracks,tracks w/ lyrics,difference
2pac,374,120,254
def leppard,249,123,126
johnny cash,353,178,175
judas priest,168,145,23
katy perry,109,45,64
lady gaga,122,62,60
metallica,141,96,45
mötley crüe,220,83,137
nas,357,134,223
patsy cline,703,95,608


In [28]:
#Save Dataframe to csv.
master.to_csv('master_lyrics_data.csv', index=False)

A look at the songs that did not have lyrics shows that most have weird tags like "live" or "edit".

In [32]:
no_lyrics[:15]

[('metallica', 'battery (live)'),
 ('metallica', 'the thing that you should not be (live)'),
 ('metallica', 'damage inc.'),
 ('metallica', '…and justice for all'),
 ('metallica', '…and justice for all (edit)'),
 ('metallica', '…and justice for all (lp version)'),
 ('metallica', 'manunkind'),
 ('metallica', 'am i savage?'),
 ('metallica', 'the unforgiven ii'),
 ('metallica', 'devil’s dance'),
 ('metallica', 'the unforgiven iii'),
 ('metallica', 'un3'),
 ('metallica', 'the unforgiven iii'),
 ('metallica',
  "mercyful fate: satan's fall / curse of the pharaohs / a corpse without soul / into the coven / evil"),
 ('metallica', 'ain’t my bitch')]