In [1]:
import codecs
import copy
import json
import os
import sys

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment, NavigableString

In [2]:
# Musixmatch API key. Prefer the MUSIXMATCH_API_KEY environment variable so the
# real key never has to be saved inside the notebook; the placeholder below is
# only a fallback and must be replaced (or the variable set) before any API call.
API_KEY = os.environ.get('MUSIXMATCH_API_KEY', '__________________')

In [91]:
# Nested dictionaries of genre -> artist -> album.
# Every album starts out mapped to an empty-string placeholder; a later cell
# overwrites that placeholder with the album's list of track names.
# All names are lowercase because album names returned by the API are
# lowercased before being compared against these keys.
artist_albums = {'heavy metal': {'metallica': {'ride the lightning':'', 
                                                'master of puppets':'', 
                                                '...and justice for all':''},
                                  'slayer': {'reign in blood':'', 
                                             'south of heaven':'', 
                                             'seasons in the abyss':''},
                                  'judas priest': {'british steel':'', 
                                                   'screaming for vengeance':'', 
                                                   'painkiller':''}},
                 'rap': {'wu tang clan': {'enter the wu-tang: 36 chambers':'', 
                                           'the w':'', 
                                           'iron flag':''},
                          'nas': {'illmatic':'', 
                                  'it was written':'', 
                                  'i am...':''},
                          '2pac': {'strictly 4 my n.i.g.g.a.z...':'', 
                                   'me against the world':'', 
                                   'all eyez on me':''}},
                 'country': {'johnny cash': {'man in black':'', 
                                              'at san quentin':'', 
                                              'at folsom prison':''},
                              'patsy cline': {'patsy cline':'', 
                                              'showcase':'', 
                                              'sentimentally yours':''},
                              'willie nelson': {'stardust':'', 
                                                'shotgun willie':'', 
                                                'red headed stranger':''}},
                 'pop': {'taylor swift': {'fearless':'', 
                                           'speak now':'', 
                                           '1989':''},
                          'katy perry': {'teenage dream':'', 
                                         'prism':'', 
                                         'witness':''},
                          'ariana grande': {'dangerous woman':'', 
                                            'sweetener':'', 
                                            'thank u, next':''}},
                 'rock': {'def leppard': {"high 'n' dry":'', 
                                           'pyromania':'', 
                                           'hysteria':''},
                           'poison': {'open up and say...ahh! - 20th anniversary edition':'', 
                                      'look what the cat dragged in':'', 
                                      'flesh & blood':''},
                           'motley crue': {'shout at the devil':'', 
                                           'girls, girls, girls':'', 
                                           'dr. feelgood':''}}}

In [95]:
# Build the artist names used in the Musixmatch search URL: walk every genre's
# artists in dictionary order and swap each space for its percent-encoded form.
artist_no_spaces = [
    name.replace(' ', '%20')
    for performers in artist_albums.values()
    for name in performers
]

artist_no_spaces

['metallica',
 'slayer',
 'judas%20priest',
 'wu%20tang%20clan',
 'nas',
 '2pac',
 'johnny%20cash',
 'patsy%20cline',
 'willie%20nelson',
 'taylor%20swift',
 'katy%20perry',
 'ariana%20grande',
 'def%20leppard',
 'poison',
 'motley%20crue']

In [6]:
# Look up the Musixmatch artist ID for every formatted artist name, keeping the
# first entry of the 'artist_list' returned by the artist.search endpoint.
search_url = 'https://api.musixmatch.com/ws/1.1/artist.search?format=json&callback=callback&q_artist='
key_param = '&apikey=' + str(API_KEY)

artist_ids = []
for encoded_name in artist_no_spaces:
    response = requests.get(search_url + encoded_name + key_param)
    matches = response.json()['message']['body']['artist_list']
    # Only the first search hit is used for each artist.
    artist_ids.append(matches[0]['artist']['artist_id'])

artist_ids

[64,
 2683,
 767,
 13882826,
 1156,
 99,
 225,
 1531,
 3849,
 259675,
 190034,
 13958599,
 145,
 191238,
 12605]

In [7]:
# For each Musixmatch artist ID, request that artist's albums (page_size=100)
# from the artist.albums.get endpoint and keep one 'album_list' per artist.
albums_url_prefix = 'https://api.musixmatch.com/ws/1.1/artist.albums.get?format=json&callback=callback&artist_id='
albums_url_suffix = '&page_size=100&apikey=' + str(API_KEY)

albums = []
for artist_id in artist_ids:
    reply = requests.get(albums_url_prefix + str(artist_id) + albums_url_suffix)
    albums.append(reply.json()['message']['body']['album_list'])

In [8]:
# Flatten the per-artist album lists into (lowercased album name, album ID)
# tuples, preserving the order in which the albums were returned.
album_id_list = [
    (entry['album']['album_name'].lower(), entry['album']['album_id'])
    for artist_album_list in albums
    for entry in artist_album_list
]

In [9]:
# Preview the first five (album name, album ID) pairs.
album_id_list[:5]

[('metallica', 10276730),
 ('metallica', 10294108),
 ('metallica', 10938929),
 ('metallica', 13762536),
 ('metallica', 13763879)]

In [97]:
# Restrict the (album name, album ID) pairs to albums named in the
# 'artist_albums' dictionary, ordered by the dictionary's genre/artist/album
# nesting. Matching is by album name only.
id_list = [
    name_id
    for artists in artist_albums.values()
    for wanted_albums in artists.values()
    for wanted in wanted_albums
    for name_id in album_id_list
    if name_id[0] == wanted
]

In [98]:
# Preview the first five pairs that survived the album-name filter.
id_list[:5]

[('ride the lightning', 10276803),
 ('ride the lightning', 10667105),
 ('ride the lightning', 10753511),
 ('ride the lightning', 11344509),
 ('ride the lightning', 13768873)]

In [72]:
# For every selected album ID, fetch its track list from the album.tracks.get
# endpoint and record unique (album title, lowercased track name) pairs,
# ignoring tracks whose 'instrumental' field is non-zero.
tracks_url_prefix = 'https://api.musixmatch.com/ws/1.1/album.tracks.get?format=json&callback=callback&album_id='
tracks_url_suffix = '&apikey=' + str(API_KEY)

album_tracks = []

for album_name, album_id in id_list:
    reply = requests.get(tracks_url_prefix + str(album_id) + tracks_url_suffix)
    for entry in reply.json()['message']['body']['track_list']:
        info = entry['track']
        if info['instrumental'] != 0:
            continue
        candidate = (album_name, info['track_name'].lower())
        # The same album name can appear under several IDs, so skip repeats.
        if candidate not in album_tracks:
            album_tracks.append(candidate)

In [73]:
# Preview the first five (album title, track name) pairs.
album_tracks[:5]

[('ride the lightning', 'fight fire with fire'),
 ('ride the lightning', 'ride the lightning'),
 ('ride the lightning', 'for whom the bell tolls'),
 ('ride the lightning', 'fade to black'),
 ('ride the lightning', 'trapped under ice')]

In [100]:
# Overwrite each album's value in the 'artist_albums' dictionary with the list
# of track names that 'album_tracks' holds for that album title.
for artists in artist_albums.values():
    for album_map in artists.values():
        for album_title in album_map:
            album_map[album_title] = [
                track_name
                for owner_album, track_name in album_tracks
                if owner_album == album_title
            ]

In [103]:
# Inspect the album -> track list mapping stored for Metallica.
artist_albums['heavy metal']['metallica']

{'ride the lightning': ['fight fire with fire',
  'ride the lightning',
  'for whom the bell tolls',
  'fade to black',
  'trapped under ice',
  'escape',
  'creeping death',
  'for whom the bell tolls - live',
  'creeping death - live',
  'for whom the bell tolls (live version)',
  'creeping death (live version)'],
 'master of puppets': ['battery',
  'master of puppets',
  'the thing that should not be',
  'welcome home (sanitarium)',
  'disposable heroes',
  'leper messiah',
  'damage, inc.',
  'battery (live)',
  'the thing that should not be (live)',
  'the thing that you should not be (live)',
  'welcome home (sanitarium) - sanitarium',
  'damage inc.'],
 '...and justice for all': ['blackened',
  '...and justice for all',
  'eye of the beholder',
  'one',
  'the shortest straw',
  'harvester of sorrow',
  'the frayed ends of sanity',
  'to live is to die',
  'dyers eve',
  'one - live']}

In [16]:
# Scrape lyrics for every track in the 'artist_albums' dictionary from
# lyrics.wikia.com and save each song to '<song>_lyrics.txt' in the working
# directory. Tracks whose page has no 'lyricbox' div are skipped.
# NOTE(review): files are named by track title only, so two artists' songs with
# the same title share one file; the cell that reads these files relies on the
# same naming, so both would need to change together.
for genre in artist_albums:
    for artist in artist_albums[genre]:
        for album in artist_albums[genre][artist]:
            for track in artist_albums[genre][artist][album]:
                singer = artist.replace(' ', '_')
                song = track.replace(' ', '_')
                r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(singer,song))
                soup = BeautifulSoup(r.text)
                # Get main lyrics holder
                lyrics = soup.find("div",{'class':'lyricbox'})
                if lyrics is None:
                    continue
                # Remove scripts
                for script in lyrics('script'):
                    script.extract()
                # Remove comments
                for comment in lyrics.findAll(text=lambda text:isinstance(text, Comment)):
                    comment.extract()
                # Remove unnecessary tags but keep their text
                for tag in ['div','i','b','a']:
                    for match in lyrics.findAll(tag):
                        match.replaceWithChildren()
                # Get output as a string, replace characters that cannot be
                # encoded as UTF-8, and turn <br/> into newlines. The [22:-6]
                # slice drops the 22-character opening tag and the 6-character
                # '</div>'; it assumes the holder renders exactly as
                # '<div class="lyricbox">' (TODO confirm for all pages).
                output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode("utf-8").replace('\n','').replace('<br/>','\n')
                lyrics_path = song + '_lyrics.txt'
                try:
                    # 'with' guarantees the handle is flushed and closed.
                    with open(lyrics_path, 'w') as file_lyrics:
                        file_lyrics.write(output)
                except UnicodeEncodeError:
                    # The platform's default encoding cannot represent these
                    # lyrics; rewrite the file as UTF-8 instead.
                    with open(lyrics_path, 'w', encoding='utf-8') as file_lyrics:
                        file_lyrics.write(output)

In [106]:
# Create DataFrame with one row per non-empty lyric line, tagged with its song,
# album, artist and genre. Lyrics are read from the '<song>_lyrics.txt' files in
# the working directory; tracks without a readable file are skipped.
df_list = []

for genre in artist_albums:
    for artist in artist_albums[genre]:
        for album in artist_albums[genre][artist]:
            for track in artist_albums[genre][artist][album]:
                try:
                    # 'with' closes the file even if reading fails part-way.
                    with open(track.replace(' ', '_') + '_lyrics.txt', 'r') as f:
                        x = f.readlines()
                except (OSError, UnicodeDecodeError):
                    # No lyrics file (or an undecodable one) for this track.
                    continue
                # Strip the single trailing newline that readlines() keeps.
                clean = [line[:-1] if line[-1:] == '\n' else line for line in x]
                # Drop empty lines before building the frame rather than using
                # a chained in-place replace on the column, which does not
                # modify the frame under pandas Copy-on-Write.
                lyric_lines = [line for line in clean if line != '']
                df = pd.DataFrame(lyric_lines, columns=['lyrics'])
                df['song'] = track
                df['album'] = album
                df['artist'] = artist
                df['genre'] = genre
                df_list.append(df)

if df_list:
    master = pd.concat(df_list).reset_index(drop=True)
else:
    # pd.concat raises ValueError on an empty list; return an empty frame with
    # the expected columns so downstream cells still run.
    master = pd.DataFrame(columns=['lyrics', 'song', 'album', 'artist', 'genre'])

In [107]:
# Preview the first rows of the combined lyrics DataFrame.
master.head()

Unnamed: 0,lyrics,song,album,artist,genre
0,Do unto others as they've done to you,fight fire with fire,ride the lightning,metallica,heavy metal
1,But what the hell is this world coming to?,fight fire with fire,ride the lightning,metallica,heavy metal
2,Blow the universe into nothingness,fight fire with fire,ride the lightning,metallica,heavy metal
3,Nuclear warfare shall lay us to rest,fight fire with fire,ride the lightning,metallica,heavy metal
4,Fight fire with fire,fight fire with fire,ride the lightning,metallica,heavy metal


In [77]:
# Save the combined lyrics DataFrame to 'master_lyrics_data.csv' in the working
# directory, without writing the row index as a column.
master.to_csv('master_lyrics_data.csv', index=False)