<img src="header.png">

## Setting up a discography dictionary of album information - to be cross referenced against web data in order to include data not available in the scrape 

In [74]:
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

In [75]:
# Three independent, initially empty accumulators for the scrape results.
albums, year, tracks = [], [], []

In [76]:
#couple of functions that I'll bake into the scraping script that will clean and parse at source
def albumsplitter(x):
    """Split a scraped title block into an ``(album, year)`` pair.

    The text is cut on newline characters: the first piece is returned as the
    album and the second piece as the year. Any further lines are ignored.

    Raises ``IndexError`` when ``x`` contains no newline.
    """
    pieces = x.split("\n")
    release_year = pieces[1]
    return pieces[0], release_year

In [50]:
# Scrape the Metallica discography page into three parallel lists:
# `albums`, `year` and `tracks` get one entry per release found on the page,
# with np.nan standing in for any field that could not be read.
# NOTE(review): the find_element(s)_by_class_name helpers used below are not
# available in newer Selenium 4 releases - confirm the pinned Selenium version.
driver = webdriver.Chrome('./chromedriver')

albums = []
year = []
tracks = []

try:
    driver.get('http://www.metalstorm.net/bands/discography.php?band_id=176&bandname=Metallica')

    items = driver.find_elements_by_class_name('discography-album ')

    for each in items:
        title = each.find_element_by_class_name('album-title')
        # Parse the title block once. If it cannot be split into album and
        # year, record both as missing so the three lists stay aligned.
        # `except Exception` (rather than a bare `except`) keeps Ctrl-C working.
        try:
            album_name, release_year = albumsplitter(title.text)
        except Exception:
            album_name, release_year = np.nan, np.nan
        albums.append(album_name)
        year.append(release_year)

        t = each.find_element_by_class_name('album-tracklist')
        try:
            tracks.append(t.text)
        except Exception:
            tracks.append(np.nan)
finally:
    # Everything needed has been copied into plain Python strings by now, so
    # always shut the browser down - even when the scrape fails part-way.
    driver.quit()

In [51]:
#This is fairly clean
albums

["Kill 'Em All",
 'Ride The Lightning',
 'Creeping Death [EP]',
 'Master Of Puppets',
 'Garage Days Re-Revisited [EP]',
 'Harvester Of Sorrow [EP]',
 '...And Justice For All',
 'Metallica',
 'Live At Wembley Stadium [EP]',
 'Load',
 'Re-Load',
 'Garage Inc.',
 'S&M [Live]',
 'St. Anger',
 'The Unnamed Feeling [EP]',
 'Some Kind Of Monster [EP]',
 'Live From Live Earth [EP]',
 'Death Magnetic',
 'Six Feet Down Under [EP]',
 'Six Feet Down Under Part II [EP]',
 "Live At Grimey's [EP]",
 'Lulu [Collaboration]',
 'Beyond Magnetic [EP]',
 'Through The Never [Live]',
 'Liberté, Égalité, Fraternité, Metallica! [Live]',
 'Hardwired... To Self-Destruct']

In [52]:
#as we can see, the track scrape is a little messy - we need to get rid of some line breaks, the track numbers
# (I don't think they're needed) and some superfluous text, e.g. mentions of reviews and lyrics
tracks[0:2]

['01. Hit The Lights\n02. The Four Horsemen\n03. Motorbreath\n04. Jump In The Fire\n05. (Anesthesia)-Pulling Teeth\n06. Whiplash\n07. Phantom Lord\n08. No Remorse\n09. Seek & Destroy\n10. Metal Militia\n11. Am I Evil? [Diamond Head cover] [Elektra reissue bonus]\n12. Blitzkrieg [Blitzkrieg cover] [Elektra reissue bonus]\n13. The Four Horsemen [live] [digital download bonus]\n14. Whiplash [live] [digital download bonus]\n\nTop 20 albums of 1983: 5\nReview Lyrics (12)',
 '01. Fight Fire With Fire\n02. Ride The Lightning\n03. For Whom The Bell Tolls\n04. Fade To Black\n05. Trapped Under Ice\n06. Escape\n07. Creeping Death\n08. The Call Of Ktulu\n\nTop 20 albums of 1984: 1\nTop 200 albums of all time: 3\nReviews (2) Lyrics (8)']

In [53]:
import regex as re
def cleaner(x):
    """Turn one raw scraped tracklist string into a list of lines.

    Steps, in order:
      1. drop every digit (via ``removenumbers``);
      2. delete every occurrence of ``'. '`` - this removes what is left of the
         track-number prefix, but also any ``'. '`` inside a title;
      3. delete the literal ``'Top  albums of :'`` (what a "Top N albums of
         YEAR:" line looks like once its digits are gone);
      4. split the remaining text on newlines.
    """
    text = removenumbers(x)
    for leftover in ('. ', 'Top  albums of :'):
        text = text.replace(leftover, '')
    return text.split('\n')

def removenumbers(x):
    """Return ``x`` with every digit character removed.

    Only the digits themselves are dropped; surrounding punctuation and
    whitespace are left untouched (``'01. Hit'`` becomes ``'. Hit'``).
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence, which Python warns about and intends to reject eventually.
    return re.sub(r"\d", "", x)

In [54]:
# Run every raw tracklist through cleaner() and stack the results into a data
# frame: one row per scraped release, one column per line of its cleaned
# tracklist. This gets things a little tidier before the manual editing below.
clean_tracks = pd.DataFrame(list(map(cleaner, tracks)))

In [55]:
#looks a little better - in dataframe format it'll be easier for me to do some editing in order to get a tight and clean discography
clean_tracks.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,Hit The Lights,The Four Horsemen,Motorbreath,Jump In The Fire,(Anesthesia)-Pulling Teeth,Whiplash,Phantom Lord,No Remorse,Seek & Destroy,Metal Militia,...,,,,,,,,,,
1,Fight Fire With Fire,Ride The Lightning,For Whom The Bell Tolls,Fade To Black,Trapped Under Ice,Escape,Creeping Death,The Call Of Ktulu,,,...,,,,,,,,,,


In [56]:
# Give the rows for the releases I'm interested in (basically the main studio
# albums) a readable label, keyed by their row position in the scrape.
# Rows left with their integer label: 2, 4, 5, 8, 12, 14, 15, 16 and 18-24.
studio_labels = {
    0: "Kill 'em All",
    1: 'Ride The Lightning',
    3: 'Master of Puppets',
    6: '...And Justice For All',
    7: 'Metallica',
    9: 'Load',
    10: 'Re-Load',
    11: 'Garage Inc.',
    13: 'St.Anger',
    17: 'Death Magnetic',
    25: 'Hardwired...To Self Destruct',
}
clean_tracks.rename(index=studio_labels, inplace=True)

In [57]:
clean_tracks

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
Kill 'em All,Hit The Lights,The Four Horsemen,Motorbreath,Jump In The Fire,(Anesthesia)-Pulling Teeth,Whiplash,Phantom Lord,No Remorse,Seek & Destroy,Metal Militia,...,,,,,,,,,,
Ride The Lightning,Fight Fire With Fire,Ride The Lightning,For Whom The Bell Tolls,Fade To Black,Trapped Under Ice,Escape,Creeping Death,The Call Of Ktulu,,,...,,,,,,,,,,
2,EP version:,Creeping Death,Am I Evil? [Diamond Head cover],Blitzkrieg [Blitzkrieg cover],,CD version:,Creeping Death,Am I Evil? [Diamond Head cover],Blitzkrieg [Blitzkrieg cover],Jump In The Fire,...,,,,,,,,,,
Master of Puppets,Battery,Master Of Puppets,The Thing That Should Not Be,Welcome Home (Sanitarium),Disposable Heroes,Leper Messiah,Orion,"Damage, Inc.",,,...,,,,,,,,,,
4,Helpless [Diamond Head cover],The Small Hours [Holocaust cover],The Wait [Killing Joke cover],Crash Course In Brain Surgery [Budgie cover],Last Caress / Green Hell [The Misfits cover],,,,,,...,,,,,,,,,,
5,Harvester Of Sorrow,Breadfan [Budgie cover],The Prince [Diamond Head cover],,,,,,,,...,,,,,,,,,,
...And Justice For All,Blackened,...And Justice For All,Eye Of The Beholder,One,The Shortest Straw,Harvester Of Sorrow,The Frayed Ends Of Sanity,To Live Is To Die,Dyers Eve,The Prince [Japanese bonus],...,,,,,,,,,,
Metallica,Enter Sandman,Sad But True,Holier Than Thou,The Unforgiven,Wherever I May Roam,Don't Tread On Me,Through The Never,Nothing Else Matters,Of Wolf And Man,The God That Failed,...,,,,,,,,,,
8,Enter Sandman [live],Sad But True [live],Nothing Else Matters [live],,,,,,,,...,,,,,,,,,,
Load,Ain't My Bitch,X,The House Jack Built,Until It Sleeps,King Nothing,Hero Of The Day,Bleeding Me,Cure,Poor Twisted Me,Wasting My Hate,...,,,,,,,,,,


In [58]:
# Transpose so that every release becomes a column, then keep only the
# relabelled columns - this dumps the non-studio releases from the data frame.
studio_columns = [
    "Kill 'em All",
    'Ride The Lightning',
    'Master of Puppets',
    '...And Justice For All',
    'Metallica',
    'Load',
    'Re-Load',
    'Garage Inc.',
    'St.Anger',
    'Death Magnetic',
    'Hardwired...To Self Destruct',
]
clean_tracks = clean_tracks.T[studio_columns]

In [59]:
#One of the albums is an album of covers and each track has some more information about the band they covered. I don't need this, let
#us get rid of this using regex
def nobracket(x):
    """Delete the bracketed part of ``x`` and return the rest.

    The match is greedy: everything from the first ``[`` to the last ``]`` on
    the same line is removed, including any text between two bracket groups.
    Text around the match (such as a trailing space) is kept as-is.
    """
    bracketed = re.compile(r'\[.*\]')
    return bracketed.sub('', x)
# Rows 1-35 of the Garage Inc. column with the bracketed cover credits removed.
temp = list(map(nobracket, clean_tracks['Garage Inc.'].iloc[1:36]))

In [60]:
# Build a separate track list per album. The iloc slices keep only the rows
# wanted for each album and leave the empty padding rows behind.
# NOTE(review): St.Anger, Death Magnetic and Hardwired skip row 0, and the
# Garage Inc. and Hardwired lists are stitched from several row ranges -
# presumably to step over heading lines in the scrape; confirm against the data.

kea = clean_tracks["Kill 'em All"].iloc[:10].tolist()
rtl = clean_tracks["Ride The Lightning"].iloc[:8].tolist()
mop = clean_tracks["Master of Puppets"].iloc[:8].tolist()
ajfa = clean_tracks["...And Justice For All"].iloc[:9].tolist()
met = clean_tracks["Metallica"].iloc[:12].tolist()
load = clean_tracks["Load"].iloc[:14].tolist()
reload = clean_tracks["Re-Load"].iloc[:13].tolist()
ginc = temp[:7] + temp[13:17] + temp[19:37]
st = clean_tracks["St.Anger"].iloc[1:12].tolist()
dm = clean_tracks["Death Magnetic"].iloc[1:11].tolist()
hsd_1 = clean_tracks["Hardwired...To Self Destruct"].iloc[1:7].tolist()
hsd_2 = clean_tracks["Hardwired...To Self Destruct"].iloc[9:15].tolist()
hsd = hsd_1 + hsd_2

In [61]:
# A few minor album-by-album tweaks: swap a mangled or over-annotated title
# for the one I want. Each entry is
# (track list, title to remove, position for the replacement, replacement).
title_fixes = [
    (load, ' X ', 1, '2X4'),
    (reload, 'The Memory Remains [featMarianne Faithfull]', 1, 'The Memory Remains'),
    (reload, "Low Man's Lyric [featBernardo Bigalli & David Miles]", 10, "Low Man's Lyric"),
    (st, 'StAnger', 1, 'St.Anger'),
]
for tracklist, unwanted_title, position, wanted_title in title_fixes:
    tracklist.remove(unwanted_title)
    tracklist.insert(position, wanted_title)

In [62]:
# Trim leading and trailing whitespace from every Garage Inc. title.
ginc = list(map(str.strip, ginc))

In [63]:
# Collect the per-album track lists into one list of lists. The order here is
# the order in which the lists are later paired with the album names.
clean_tracks = [
    kea,
    rtl,
    mop,
    ajfa,
    met,
    load,
    reload,
    ginc,
    st,
    dm,
    hsd,
]

In [64]:
#looking good
clean_tracks[0:2]

[['Hit The Lights',
  'The Four Horsemen',
  'Motorbreath',
  'Jump In The Fire',
  '(Anesthesia)-Pulling Teeth',
  'Whiplash',
  'Phantom Lord',
  'No Remorse',
  'Seek & Destroy',
  'Metal Militia'],
 ['Fight Fire With Fire',
  'Ride The Lightning',
  'For Whom The Bell Tolls',
  'Fade To Black',
  'Trapped Under Ice',
  'Escape',
  'Creeping Death',
  'The Call Of Ktulu']]

In [65]:
# Now, before we create the dictionary, let's get rid of the albums we don't
# need. list.remove raises ValueError if a name is not in `albums`.
for unwanted in (
    'Creeping Death [EP]',
    'Garage Days Re-Revisited [EP]',
    'Harvester Of Sorrow [EP]',
    'Live At Wembley Stadium [EP]',
    'S&M [Live]',
    'The Unnamed Feeling [EP]',
    'Some Kind Of Monster [EP]',
    'Live From Live Earth [EP]',
    'Six Feet Down Under [EP]',
    'Six Feet Down Under Part II [EP]',
    "Live At Grimey's [EP]",
    'Lulu [Collaboration]',
    'Beyond Magnetic [EP]',
    'Through The Never [Live]',
    'Liberté, Égalité, Fraternité, Metallica! [Live]',
):
    albums.remove(unwanted)


In [66]:
# We now have a list of albums and a list of their corresponding tracks, which
# we can zip together into a dictionary (album name -> list of track titles).
# zip() stops silently at the shorter input, so a length mismatch would drop
# albums without any error - check that the two lists line up first.
if len(albums) != len(clean_tracks):
    raise ValueError(
        "{} album names but {} track lists; they must pair up one-to-one".format(
            len(albums), len(clean_tracks)
        )
    )
met_dict = dict(zip(albums, clean_tracks))

In [67]:
#all the studio albums as keys
met_dict.keys()

dict_keys(["Kill 'Em All", 'Ride The Lightning', 'Master Of Puppets', '...And Justice For All', 'Metallica', 'Load', 'Re-Load', 'Garage Inc.', 'St. Anger', 'Death Magnetic', 'Hardwired... To Self-Destruct'])

In [68]:
#all the tracks as lists for each key(album)
met_dict.values()

dict_values([['Hit The Lights', 'The Four Horsemen', 'Motorbreath', 'Jump In The Fire', '(Anesthesia)-Pulling Teeth', 'Whiplash', 'Phantom Lord', 'No Remorse', 'Seek & Destroy', 'Metal Militia'], ['Fight Fire With Fire', 'Ride The Lightning', 'For Whom The Bell Tolls', 'Fade To Black', 'Trapped Under Ice', 'Escape', 'Creeping Death', 'The Call Of Ktulu'], ['Battery', 'Master Of Puppets', 'The Thing That Should Not Be', 'Welcome Home (Sanitarium)', 'Disposable Heroes', 'Leper Messiah', 'Orion', 'Damage, Inc.'], ['Blackened', '...And Justice For All', 'Eye Of The Beholder', 'One', 'The Shortest Straw', 'Harvester Of Sorrow', 'The Frayed Ends Of Sanity', 'To Live Is To Die', 'Dyers Eve'], ['Enter Sandman', 'Sad But True', 'Holier Than Thou', 'The Unforgiven', 'Wherever I May Roam', "Don't Tread On Me", 'Through The Never', 'Nothing Else Matters', 'Of Wolf And Man', 'The God That Failed', 'My Friend Of Misery', 'The Struggle Within'], ["Ain't My Bitch", '2X4', 'The House Jack Built', 'Unti

In [70]:
#let's see if it works - let's get the tracks for Master of Puppets... HELL YEAH. It does.
met_dict['Master Of Puppets']

['Battery',
 'Master Of Puppets',
 'The Thing That Should Not Be',
 'Welcome Home (Sanitarium)',
 'Disposable Heroes',
 'Leper Messiah',
 'Orion',
 'Damage, Inc.']

In [73]:
# Save the Metallica discography dictionary to disk so it can be used later on.
# NOTE(review): a dict is not an array, so NumPy presumably has to pickle it;
# reading it back is then expected to need
# np.load(..., allow_pickle=True).item() - confirm against the consumer.
np.save(file='metallica_discography.npy', arr=met_dict)

## Next Steps