## Scraping non-tabular, multipage sites
Scrape the top 500 <a href="https://bestsellingalbums.org/decade/2010">best-selling albums of the 2010s</a>. Your data must include the following data points:

- Name of album
- Name of artist
- Number of albums sold 
- The link to the page that breaks down sales by country (found by clicking album title)



In [1]:
## create cells as needed
## import libraries

import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Download the decade ranking page that the rest of the notebook parses.
url = "https://bestsellingalbums.org/decade/2010"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page further down.
response.raise_for_status()

In [5]:
# Confirm the request succeeded (HTTP 200 means OK) before parsing the body.
response.status_code

200

In [7]:
# Inspect what kind of object requests.get returned.
type(response)

requests.models.Response

In [13]:
# Parse the downloaded HTML text with Python's built-in "html.parser" backend.
soup = BeautifulSoup(markup=response.text, features="html.parser")
# Echo the parsed document so the page structure can be inspected.
soup


<!DOCTYPE html>

<html class="no-js" lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width" name="viewport"/>
<link href="https://gmpg.org/xfn/11" rel="profile"/>
<link href="https://bestsellingalbums.org/xmlrpc.php" rel="pingback"/>
<!--[if lt IE 9]>
    <script src="https://bestsellingalbums.org/wp-content/themes/twentyfifteen/js/html5.js?ver=3.7.0"></script>
    <![endif]-->
<script>(function(html){html.className = html.className.replace(/\bno-js\b/,'js')})(document.documentElement);</script>
<!-- This site is optimized with the Yoast SEO plugin v14.5 - https://yoast.com/wordpress/plugins/seo/ -->
<title>Best-selling albums of 2010's</title>
<meta content="Best-selling albums of 2010's" name="description">
<meta content="index, follow" name="robots">
<meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name="googlebot"/>
<meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name=

In [17]:
# Inspect the type of the parsed document object.
type(soup)

bs4.BeautifulSoup

In [35]:
## Album title containers on page 1
## Each <div class="album"> wraps an <a> holding the title text and the album link.
album_names = soup.find_all(name="div", attrs={"class": "album"})
album_names

[<div class="album"><a href="https://bestsellingalbums.org/album/1034">21</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/1035">25</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/30524">CHRISTMAS</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/45488">1989</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/23318">PURPOSE</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/12876">DIVIDE</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/42961">FROZEN</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/23977">TEENAGE DREAM</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/12880">X</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/6777">DOO-WOPS &amp; HOOLIGANS</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/13756">RECOVERY</a></div

In [45]:
## Keep only the visible title text of every album div
## The resulting list is combined with the other columns later on
albumnames_fl = [title_div.get_text() for title_div in album_names]

albumnames_fl

['21',
 '25',
 'CHRISTMAS',
 '1989',
 'PURPOSE',
 'DIVIDE',
 'FROZEN',
 'TEENAGE DREAM',
 'X',
 'DOO-WOPS & HOOLIGANS',
 'RECOVERY',
 'NIGHT VISIONS',
 'IN THE LONELY HOUR',
 'UNORTHODOX JUKEBOX',
 'RED',
 '+',
 'VIEWS',
 'BEAUTY BEHIND THE MADNESS',
 'WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?',
 'BORN THIS WAY',
 'MAP OF THE SOUL: 7',
 'BEERBONGS & BENTLEYS',
 'TAKE CARE',
 'SPEAK NOW',
 'PRISM',
 'BORN TO DIE',
 'LOUD',
 'ANTI',
 'BLURRYFACE',
 "HOLLYWOOD'S BLEEDING",
 'SCORPION',
 'STONEY',
 'TAKE ME HOME',
 'THE GREATEST SHOWMAN',
 'BEYONCÉ',
 'THE TRUTH ABOUT LOVE',
 'REPUTATION',
 '?',
 'TRAVELLER',
 'STARBOY',
 'UP ALL NIGHT',
 'MIDNIGHT MEMORIES',
 'MAP OF THE SOUL: PERSONA',
 'GOODBYE & GOOD RIDDANCE',
 'A HEAD FULL OF DREAMS',
 'THE HEIST',
 'THE MARSHALL MATHERS LP 2',
 'LOVER',
 'WATCH THE THRONE',
 "THIS ONE'S FOR YOU"]

In [47]:
## Artist containers on the page
## Each <div class="artist"> wraps an <a> whose text is the artist name.
artist_names = soup.find_all(name="div", attrs={"class": "artist"})
artist_names

[<div class="artist"><a href="https://bestsellingalbums.org/artist/218" title="ADELE album sales">ADELE</a></div>,
 <div class="artist"><a href="https://bestsellingalbums.org/artist/218" title="ADELE album sales">ADELE</a></div>,
 <div class="artist"><a href="https://bestsellingalbums.org/artist/8822" title="MICHAEL BUBLÉ album sales">MICHAEL BUBLÉ</a></div>,
 <div class="artist"><a href="https://bestsellingalbums.org/artist/12748" title="TAYLOR SWIFT album sales">TAYLOR SWIFT</a></div>,
 <div class="artist"><a href="https://bestsellingalbums.org/artist/6646" title="JUSTIN BIEBER album sales">JUSTIN BIEBER</a></div>,
 <div class="artist"><a href="https://bestsellingalbums.org/artist/3645" title="ED SHEERAN album sales">ED SHEERAN</a></div>,
 <div class="artist"><a href="https://bestsellingalbums.org/artist/12207" title="SOUNDTRACK album sales">SOUNDTRACK</a></div>,
 <div class="artist"><a href="https://bestsellingalbums.org/artist/6828" title="KATY PERRY album sales">KATY PERRY</a></di

In [49]:
## Reduce every artist div to its text and collect the names in a list
artistnames_fl = [artist_div.get_text() for artist_div in artist_names]

artistnames_fl

['ADELE',
 'ADELE',
 'MICHAEL BUBLÉ',
 'TAYLOR SWIFT',
 'JUSTIN BIEBER',
 'ED SHEERAN',
 'SOUNDTRACK',
 'KATY PERRY',
 'ED SHEERAN',
 'BRUNO MARS',
 'EMINEM',
 'IMAGINE DRAGONS',
 'SAM SMITH',
 'BRUNO MARS',
 'TAYLOR SWIFT',
 'ED SHEERAN',
 'DRAKE',
 'THE WEEKND',
 'BILLIE EILISH',
 'LADY GAGA',
 'BTS (방탄소년단)',
 'POST MALONE',
 'DRAKE',
 'TAYLOR SWIFT',
 'KATY PERRY',
 'LANA DEL REY',
 'RIHANNA',
 'RIHANNA',
 'TWENTY ONE PILOTS',
 'POST MALONE',
 'DRAKE',
 'POST MALONE',
 'ONE DIRECTION',
 'SOUNDTRACK',
 'BEYONCÉ',
 'P!NK',
 'TAYLOR SWIFT',
 'XXXTENTACION',
 'CHRIS STAPLETON',
 'THE WEEKND',
 'ONE DIRECTION',
 'ONE DIRECTION',
 'BTS (방탄소년단)',
 'JUICE WRLD',
 'COLDPLAY',
 'MACKLEMORE & RYAN LEWIS',
 'EMINEM',
 'TAYLOR SWIFT',
 'JAY-Z & KANYE WEST',
 'LUKE COMBS']

In [51]:
## Sales containers on the page
## Each <div class="sales"> holds plain text such as "Sales: 30,000,000".
album_sales = soup.find_all(name="div", attrs={"class": "sales"})
album_sales

[<div class="sales">Sales: 30,000,000</div>,
 <div class="sales">Sales: 23,000,000</div>,
 <div class="sales">Sales: 15,000,000</div>,
 <div class="sales">Sales: 14,748,116</div>,
 <div class="sales">Sales: 14,000,000</div>,
 <div class="sales">Sales: 13,787,460</div>,
 <div class="sales">Sales: 12,632,083</div>,
 <div class="sales">Sales: 12,134,000</div>,
 <div class="sales">Sales: 11,879,785</div>,
 <div class="sales">Sales: 11,270,000</div>,
 <div class="sales">Sales: 10,873,795</div>,
 <div class="sales">Sales: 9,616,263</div>,
 <div class="sales">Sales: 9,321,352</div>,
 <div class="sales">Sales: 8,976,749</div>,
 <div class="sales">Sales: 8,889,124</div>,
 <div class="sales">Sales: 7,705,000</div>,
 <div class="sales">Sales: 7,687,247</div>,
 <div class="sales">Sales: 7,584,588</div>,
 <div class="sales">Sales: 7,256,516</div>,
 <div class="sales">Sales: 7,166,944</div>,
 <div class="sales">Sales: 7,130,621</div>,
 <div class="sales">Sales: 7,116,118</div>,
 <div class="sales">S

In [53]:
## Pull the text out of every sales div and collect it in a list
## NOTE(review): each value is still a string that keeps the "Sales: " label and
## the thousands separators (e.g. 'Sales: 30,000,000'); it is not parsed to a number here.
albumsales_fl = [sales_div.get_text() for sales_div in album_sales]

albumsales_fl

['Sales: 30,000,000',
 'Sales: 23,000,000',
 'Sales: 15,000,000',
 'Sales: 14,748,116',
 'Sales: 14,000,000',
 'Sales: 13,787,460',
 'Sales: 12,632,083',
 'Sales: 12,134,000',
 'Sales: 11,879,785',
 'Sales: 11,270,000',
 'Sales: 10,873,795',
 'Sales: 9,616,263',
 'Sales: 9,321,352',
 'Sales: 8,976,749',
 'Sales: 8,889,124',
 'Sales: 7,705,000',
 'Sales: 7,687,247',
 'Sales: 7,584,588',
 'Sales: 7,256,516',
 'Sales: 7,166,944',
 'Sales: 7,130,621',
 'Sales: 7,116,118',
 'Sales: 6,920,000',
 'Sales: 6,917,500',
 'Sales: 6,692,500',
 'Sales: 6,674,983',
 'Sales: 6,673,000',
 'Sales: 6,537,235',
 'Sales: 6,500,000',
 'Sales: 6,461,665',
 'Sales: 6,433,983',
 'Sales: 6,371,355',
 'Sales: 6,334,619',
 'Sales: 6,318,119',
 'Sales: 6,290,833',
 'Sales: 6,231,084',
 'Sales: 6,186,524',
 'Sales: 6,182,852',
 'Sales: 6,157,000',
 'Sales: 6,070,666',
 'Sales: 6,046,188',
 'Sales: 6,020,087',
 'Sales: 6,010,031',
 'Sales: 6,002,713',
 'Sales: 6,000,000',
 'Sales: 5,858,500',
 'Sales: 5,790,318',
 '

In [82]:
## Link to each album's own page (where sales are broken down by country)
## The href sits on the <a> nested inside every album div collected earlier.
## The loop variable is deliberately not named `url`, so the page URL string
## defined at the top of the notebook is not overwritten with a bs4 Tag.
countrysales_fl = [album_div.find("a").get("href") for album_div in album_names]

countrysales_fl

['https://bestsellingalbums.org/album/1034',
 'https://bestsellingalbums.org/album/1035',
 'https://bestsellingalbums.org/album/30524',
 'https://bestsellingalbums.org/album/45488',
 'https://bestsellingalbums.org/album/23318',
 'https://bestsellingalbums.org/album/12876',
 'https://bestsellingalbums.org/album/42961',
 'https://bestsellingalbums.org/album/23977',
 'https://bestsellingalbums.org/album/12880',
 'https://bestsellingalbums.org/album/6777',
 'https://bestsellingalbums.org/album/13756',
 'https://bestsellingalbums.org/album/19810',
 'https://bestsellingalbums.org/album/39978',
 'https://bestsellingalbums.org/album/6778',
 'https://bestsellingalbums.org/album/45494',
 'https://bestsellingalbums.org/album/12875',
 'https://bestsellingalbums.org/album/12457',
 'https://bestsellingalbums.org/album/47839',
 'https://bestsellingalbums.org/album/5207',
 'https://bestsellingalbums.org/album/25786',
 'https://bestsellingalbums.org/album/6859',
 'https://bestsellingalbums.org/album/36

In [84]:
from random import randrange
import time

In [86]:
# Sanity check: how many album links were collected from this page.
len(countrysales_fl)

50

In [88]:
# Combine the four parallel lists into one (title, artist, sales, link) tuple per album.
# zip stops at the shortest input, so the lists are expected to have equal length.
top_albums = list(zip(albumnames_fl, artistnames_fl, albumsales_fl, countrysales_fl))
top_albums

[('21',
  'ADELE',
  'Sales: 30,000,000',
  'https://bestsellingalbums.org/album/1034'),
 ('25',
  'ADELE',
  'Sales: 23,000,000',
  'https://bestsellingalbums.org/album/1035'),
 ('CHRISTMAS',
  'MICHAEL BUBLÉ',
  'Sales: 15,000,000',
  'https://bestsellingalbums.org/album/30524'),
 ('1989',
  'TAYLOR SWIFT',
  'Sales: 14,748,116',
  'https://bestsellingalbums.org/album/45488'),
 ('PURPOSE',
  'JUSTIN BIEBER',
  'Sales: 14,000,000',
  'https://bestsellingalbums.org/album/23318'),
 ('DIVIDE',
  'ED SHEERAN',
  'Sales: 13,787,460',
  'https://bestsellingalbums.org/album/12876'),
 ('FROZEN',
  'SOUNDTRACK',
  'Sales: 12,632,083',
  'https://bestsellingalbums.org/album/42961'),
 ('TEENAGE DREAM',
  'KATY PERRY',
  'Sales: 12,134,000',
  'https://bestsellingalbums.org/album/23977'),
 ('X',
  'ED SHEERAN',
  'Sales: 11,879,785',
  'https://bestsellingalbums.org/album/12880'),
 ('DOO-WOPS & HOOLIGANS',
  'BRUNO MARS',
  'Sales: 11,270,000',
  'https://bestsellingalbums.org/album/6777'),
 ('RE

In [90]:
# Build the table in a single step. Passing `columns=` to the constructor
# (instead of assigning df.columns afterwards) also works when top_albums is
# empty, where the later assignment would raise a length-mismatch error.
df = pd.DataFrame(top_albums, columns=["Title", "Artist", "Sales", "Link"])
df

Unnamed: 0,Title,Artist,Sales,Link
0,21,ADELE,"Sales: 30,000,000",https://bestsellingalbums.org/album/1034
1,25,ADELE,"Sales: 23,000,000",https://bestsellingalbums.org/album/1035
2,CHRISTMAS,MICHAEL BUBLÉ,"Sales: 15,000,000",https://bestsellingalbums.org/album/30524
3,1989,TAYLOR SWIFT,"Sales: 14,748,116",https://bestsellingalbums.org/album/45488
4,PURPOSE,JUSTIN BIEBER,"Sales: 14,000,000",https://bestsellingalbums.org/album/23318
5,DIVIDE,ED SHEERAN,"Sales: 13,787,460",https://bestsellingalbums.org/album/12876
6,FROZEN,SOUNDTRACK,"Sales: 12,632,083",https://bestsellingalbums.org/album/42961
7,TEENAGE DREAM,KATY PERRY,"Sales: 12,134,000",https://bestsellingalbums.org/album/23977
8,X,ED SHEERAN,"Sales: 11,879,785",https://bestsellingalbums.org/album/12880
9,DOO-WOPS & HOOLIGANS,BRUNO MARS,"Sales: 11,270,000",https://bestsellingalbums.org/album/6777


In [92]:
# Save the table as UTF-8 CSV; index=False keeps the row index out of the file.
df.to_csv("albumdata1.csv", index=False, encoding="UTF-8")