# Retrieving RIAA Award-Winning Artists
Original Date of Extracted Elements: 9/22/18

In [2]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import requests
import splinter
import selenium
import time

%matplotlib inline

### Using Requests & BeautifulSoup

In [16]:
url = 'https://www.riaa.com/gold-platinum/?tab_active=awards_by_artist#search_section'
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0'}
res = requests.get(url, headers=headers)

In [1]:
# res.text

**Verdict:** Can't figure out a way to interact with the "Load More" button to retrieve more rows. I'll give Splinter a shot

### Using Splinter

In [3]:
from splinter import Browser
browse = Browser()
browse.visit('https://www.riaa.com/gold-platinum/?tab_active=awards_by_artist&col=artist&ord=asc#search_section')

**Found the "Load More" button. This generates more table rows**

In [4]:
load_more = browse.find_by_xpath('//*[@id="loadmore"]')
load_more.click()

**Testing `find_by` for Table Elements**

In [1]:
# [element.value for element in all_tables]

**Extracting all Artists in RIAA table**

In [13]:
# for i in range(200):
#     load_more.click()
#     time.sleep(2)

**Extracting all artist information by table row**

In [57]:
all_tables = browse.find_by_tag("tr")
artist_list = [element.value for element in all_tables]

In [60]:
len(artist_list)

2515

**Extracting all artist information by `td class=artists_cell`**

In [14]:
all_tds = browse.find_by_tag('td[class="artists_cell"]')
artist_list_td = [element.value for element in all_tds]

In [16]:
len(artist_list_td)

2514

Pickling Lists

In [63]:
with open('./pickle/artist_list.pkl', 'wb+') as f:
    pickle.dump(artist_list, f)

In [17]:
with open('./pickle/artist_list_td.pkl', 'wb+') as f:
    pickle.dump(artist_list_td, f)

Looks like there's a total of 2515 entries on the artist listing. There were 2547 nodes representing the table within the Firefox Inspector. I wonder if that means we're missing a few entries (32).

Update (9/29/18): I added this `artist_list_td`. I'm not sure if it's the same, but the length is 2514. This list will be much easier to query the spotify api with.

### Tossing Artist Listing into Array

In [64]:
artist_list[:5]

['  ARTIST CERTIFIED UNITS\n(In Millions) GOLD ® PLATINUM ® MULTI-PLATINUM ™ DIAMOND ® TYPE',
 "SHARE\n'N SYNC 28 4 4 4 2 Standard\nMORE DETAILS",
 'SHARE\n"WEIRD AL" YANKOVIC 8 10 6 0 0 All types\nMORE DETAILS',
 'SHARE\n10 YEARS 0.5 1 0 0 0 Standard\nMORE DETAILS',
 'SHARE\n10,000 MANIACS 8 4 4 3 0 Standard\nMORE DETAILS']

### Retreiving Pickle

In [4]:
with open('./pickle/artist_list.pkl', 'rb+') as f:
    artist_list = pickle.load(f)

### Cleaning Artist List

In [5]:
artist_list[:7]

['  ARTIST CERTIFIED UNITS\n(In Millions) GOLD ® PLATINUM ® MULTI-PLATINUM ™ DIAMOND ® TYPE',
 "SHARE\n'N SYNC 28 4 4 4 2 Standard\nMORE DETAILS",
 'SHARE\n"WEIRD AL" YANKOVIC 8 10 6 0 0 All types\nMORE DETAILS',
 'SHARE\n10 YEARS 0.5 1 0 0 0 Standard\nMORE DETAILS',
 'SHARE\n10,000 MANIACS 8 4 4 3 0 Standard\nMORE DETAILS',
 'SHARE\n112 5.5 4 3 2 0 Standard\nMORE DETAILS',
 'SHARE\n2 CHAINZ 1.5 2 1 0 0 All types\nMORE DETAILS']

In [6]:
artist_list[5]

'SHARE\n112 5.5 4 3 2 0 Standard\nMORE DETAILS'

In [7]:
artist_list[49]

'SHARE\nACE OF BASE 10 2 2 1 0 Standard\nMORE DETAILS'

In [8]:
del artist_list[0]

In [9]:
artist_list = [re.findall(r"[^(SHARE)\\n]\w+", element) for element in artist_list]

In [10]:
artist_list[0]

["'N",
 ' SYNC',
 ' 28',
 ' 4',
 ' 4',
 ' 4',
 ' 2',
 ' Standard',
 '\nMORE',
 ' DETAILS']

**Removing Spaces**

In [11]:
artist_list = [[element.lstrip() for element in entry] for entry in artist_list]

**Casting string numbers to integers**

In [12]:
artist_list_2 = []

for entry in artist_list:
    entry_list = []
    for element in entry:
        try:
            entry_list.append(int(element))
        except:
            try:
                entry_list.append(float(element))
            except:
                entry_list.append(element)
    artist_list_2.append(entry_list)

In [13]:
type(artist_list_2[0][2])

int

In [14]:
artist_list[0][1]

'SYNC'

In [16]:
artist_list_2[:20]

[["'N", 'SYNC', 28, 4, 4, 4, 2, 'Standard', 'MORE', 'DETAILS'],
 ['"WEIRD',
  'AL',
  'YANKOVIC',
  8,
  10,
  6,
  0,
  0,
  'All',
  'types',
  'MORE',
  'DETAILS'],
 [10, 'YEARS', 0, 0.5, 1, 0, 0, 0, 'Standard', 'MORE', 'DETAILS'],
 [10, ',000', 'MANIACS', 8, 4, 4, 3, 0, 'Standard', 'MORE', 'DETAILS'],
 [112, 5, 0.5, 4, 3, 2, 0, 'Standard', 'MORE', 'DETAILS'],
 [2, 'CHAINZ', 1, 0.5, 2, 1, 0, 0, 'All', 'types', 'MORE', 'DETAILS'],
 [2, 'LIVE', 'CREW', 3, 5, 1, 0, 0, 'Standard', 'MORE', 'DETAILS'],
 [2, 'PAC', 36, 0.5, 10, 9, 6, 2, 'Standard', 'MORE', 'DETAILS'],
 [2, 'PAC', 'OUTLAWZ', 1, 1, 1, 0, 0, 'Standard', 'MORE', 'DETAILS'],
 [2, 'UNLIMITED', 0, 0.5, 1, 0, 0, 0, 'Standard', 'MORE', 'DETAILS'],
 [21, 'SAVAGE', 0, 0.5, 1, 0, 0, 0, 'All', 'types', 'MORE', 'DETAILS'],
 [21,
  'SAVAGE',
  'METRO',
  'BOOMIN',
  0,
  0.5,
  1,
  0,
  0,
  0,
  'All',
  'types',
  'MORE',
  'DETAILS'],
 [3, 'DOORS', 'DOWN', 12, 5, 3, 2, 0, 'All', 'types', 'MORE', 'DETAILS'],
 [30,
  'SECONDS',
  'TO',

**Getting Rid of Words at the End of List**

In [284]:
artist_list_2 = [entry[:-3] for entry in artist_list_2]

In [285]:
artist_list_3 = list(map(lambda x: x[:-1] if x[-1] == 'All' else x, artist_list_2))

In [287]:
artist_list_3[:5]

[["'N", 'SYNC', 28, 4, 4, 4, 2],
 ['"WEIRD', 'AL', 'YANKOVIC', 8, 10, 6, 0, 0],
 [10, 'YEARS', 0, 0.5, 1, 0, 0, 0],
 [10, ',000', 'MANIACS', 8, 4, 4, 3, 0],
 [112, 5, 0.5, 4, 3, 2, 0]]

In [288]:
artist_list_3[0][-1]

2

**Extracting Elements to Make df**

In [291]:
diamond = []
multi_platinum = []
platinum = []
gold = []
units = []
for entry in artist_list_3:
    diamond.append(entry[-1])
    multi_platinum.append(entry[-2])
    platinum.append(entry[-3])
    gold.append(entry[-4])
    if isinstance(entry[-5], float):
        entry[-5] = entry[-6] + entry[-5]
        units.append(entry[-5])
        del entry[-6]
    else:
        units.append(entry[-5])

In [306]:
artist = []
for entry in artist_list_3:
    artist.append(entry[:-5])

In [308]:
with open('./pickle/artist.pkl', 'wb+') as f:
    pickle.dump(artist, f)

**Pickling Elements of df**

In [293]:
with open('./pickle/diamond.pkl', 'wb+') as f:
    pickle.dump(diamond, f)
with open('./pickle/multi_platinum.pkl', 'wb+') as f:
    pickle.dump(multi_platinum, f)
with open('./pickle/platinum.pkl', 'wb+') as f:
    pickle.dump(platinum, f)
with open('./pickle/gold.pkl', 'wb+') as f:
    pickle.dump(gold, f)
with open('./pickle/units.pkl', 'wb+') as f:
    pickle.dump(units, f)
with open('./pickle/artist_list_3.pkl', 'wb+') as f:
    pickle.dump(artist_list_3, f)

**Retreiving Pickle Elements** 

In [18]:
with open('./pickle/artist.pkl', 'rb+') as f:
    artist = pickle.load(f)

In [20]:
artist

[["'N", 'SYNC'],
 ['"WEIRD', 'AL', 'YANKOVIC'],
 [10, 'YEARS'],
 [10, ',000', 'MANIACS'],
 [112],
 [2, 'CHAINZ'],
 [2, 'LIVE', 'CREW'],
 [2, 'PAC'],
 [2, 'PAC', 'OUTLAWZ'],
 [2, 'UNLIMITED'],
 [21, 'SAVAGE'],
 [21, 'SAVAGE', 'METRO', 'BOOMIN'],
 [3, 'DOORS', 'DOWN'],
 [30, 'SECONDS', 'TO', 'MARS'],
 [311],
 ['3LW'],
 ['3RD', 'BASS'],
 [4, 'HIM'],
 [4, 'NON', 'BLONDES'],
 [5, 'SECONDS', 'OF', 'SUMMER'],
 [50, 'CENT'],
 [504, 'BOYZ'],
 [69, 'BOYZ'],
 ['6IX9INE'],
 ['6LACK'],
 [702],
 [8, 'BALL', 'MJG'],
 [98, 'DEGREES'],
 ['A', 'BOOGIE', 'WIT', 'DA', 'HOODIE'],
 ['A', 'DAY', 'TO', 'REMEMBER'],
 ['A', 'FLOCK', 'OF', 'SEAGULLS'],
 ['A', 'GREAT', 'BIG', 'WORLD'],
 ['A', 'PERFECT', 'CIRCLE'],
 ['A', 'TASTE', 'OF', 'HONEY'],
 ['A', 'TRIBE', 'CALLED', 'QUEST'],
 ['A', '-HA'],
 ['A', 'DORATI', 'MINNEAPOLIS', 'SYMPHONY'],
 ['A', '$AP', 'ROCKY'],
 ['AALIYAH'],
 ['AARON', 'CARTER'],
 ['AARON', 'HALL'],
 ['AARON', 'NEVILLE'],
 ['AARON', 'TIPPIN'],
 ['ABBA'],
 ['ABC'],
 ['AC', '/DC'],
 ['ACCEPT'],
 

In [8]:
list(map())

'["\'N", \'SYNC\'], [\'"WEIRD\', \'AL\', \'YANKOVIC\'], [10, \'YEARS\'], [10, \',000\', \'MANIACS\'], [112], [2, \'CHAINZ\'], [2, \'LIVE\', \'CREW\'], [2, \'PAC\'], [2, \'PAC\', \'OUTLAWZ\'], [2, \'UNLIMITED\']'