In [68]:
#Load setup
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
from tqdm import tqdm
pd.set_option('display.max_rows', None)

In [2]:
# Billboard Hot 100 chart page; the cells below download and parse this URL
url = "https://www.billboard.com/charts/hot-100"

In [3]:
# Connectivity check: request the page and show only the HTTP status code.
# The response itself is discarded here; the page is requested again in the next cell.
requests.get(url).status_code

200

In [4]:
# Download the chart page; the raw body (r.content) is parsed in the next cell
r = requests.get(url)

In [5]:
# Parse the downloaded HTML into a searchable tree.
# The parser name has to be given as `features` (the second constructor
# argument); `parser=` is not a BeautifulSoup parameter and therefore does not
# select html5lib. Requires the html5lib package to be installed.
soup = bs(r.content, features="html5lib")

### Create list with rank of songs

In [34]:
# All <span> elements whose class attribute marks a chart position
rank_help = soup.find_all(name='span', attrs={'class': 'chart-element__rank__number'})

In [33]:
# Turn the text of every rank element into an integer, in page order
ranks = [int(span.get_text()) for span in rank_help]

In [11]:
# Remove repeated ranks while keeping the order in which they were scraped.
# A set has no defined iteration order, so list(set(...)) could reorder the
# ranks relative to the song and artist lists built from the same page;
# dict.fromkeys keeps the first occurrence of each value in its original position.
ranks = list(dict.fromkeys(ranks))

### Create list with title of songs

In [32]:
# Song titles: collect every title <span>, then keep only the text of each one
song_help = soup.find_all(name='span', attrs={'class': 'chart-element__information__song text--truncate color--primary'})
songs = [str(span.get_text()) for span in song_help]

### Create list with names of artists

In [30]:
# Artist names: collect every artist <span>, then keep only the text of each one
artist_help = soup.find_all('span', {'class' : 'chart-element__information__artist text--truncate color--secondary'})
artists = [str(r.text) for r in artist_help]

### Create a dataframe out of list objects

In [15]:
# Put the three scraped lists side by side: one column per list, one row per chart entry
dict_billboard100 = dict(rank=ranks, song=songs, artist=artists)

billboard100 = pd.DataFrame(data=dict_billboard100)
billboard100

Unnamed: 0,rank,song,artist
0,1,Life Goes On,BTS
1,2,Mood,24kGoldn Featuring iann dior
2,3,Dynamite,BTS
3,4,Positions,Ariana Grande
4,5,I Hope,Gabby Barrett Featuring Charlie Puth
...,...,...,...
95,96,Popstar,DJ Khaled Featuring Drake
96,97,Bichota,Karol G
97,98,Happy Does,Kenny Chesney
98,99,Cover Me Up,Morgan Wallen


## Billboard Top Lists

In [160]:
# Charts to scrape: maps a human-readable chart name (later used as the
# `source` label of each row) to the URL of the chart page.
# The two commented-out entries are currently excluded from scraping.

dict_billboards = {'Billboard Hot 100': "https://www.billboard.com/charts/hot-100",
                   'Billboard Global 200': "https://www.billboard.com/charts/billboard-global-200",
                   'Billboard Global Excluding USA': "https://www.billboard.com/charts/billboard-global-excl-us",
                   #'Billboard Year-End 2019 Hot 100': "https://www.billboard.com/charts/year-end/2019/hot-100-songs",
                   #'Billboard Decade-End Hot 100 ': "https://www.billboard.com/charts/decade-end/hot-100",
                  }

In [161]:
#Script for scraping all billboard lists

# Accumulators filled chart by chart, in the order the entries appear on each page
ranks = []
songs = []
artists = []
source = []

for src, url in dict_billboards.items():
    # Download each chart once and reuse the response for both the status
    # report and the parsing step, instead of requesting the same page twice.
    r = requests.get(url)
    print(f"{url} - Status code: {r.status_code}")
    # The parser name must be passed as `features`; `parser=` is not a
    # BeautifulSoup parameter and therefore does not select html5lib.
    soup = bs(r.content, features="html5lib")
    ranks_help = [int(tag.text) for tag in soup.find_all('span', {'class' : 'chart-element__rank__number'})]
    songs_help = [str(tag.text) for tag in soup.find_all('span', {'class' : 'chart-element__information__song text--truncate color--primary'})]
    artists_help = [str(tag.text) for tag in soup.find_all('span', {'class' : 'chart-element__information__artist text--truncate color--secondary'})]
    # The three lists are combined row by row, so they must have the same
    # length; fail early and name the chart instead of building misaligned rows.
    if not (len(ranks_help) == len(songs_help) == len(artists_help)):
        raise ValueError(
            f"{src}: scraped {len(ranks_help)} ranks, {len(songs_help)} songs "
            f"and {len(artists_help)} artists - the lists cannot be aligned"
        )
    ranks.extend(ranks_help)
    songs.extend(songs_help)
    artists.extend(artists_help)
    # One source label per entry scraped from this chart
    source_help = [src] * len(ranks_help)
    source.extend(source_help)


# Note: `ranks` is collected above but is not a column of the resulting frame
dict_global = {'song': songs,
               'artist': artists,
               'source': source}

billboard = pd.DataFrame(dict_global)
billboard

https://www.billboard.com/charts/hot-100 - Status code: 200
https://www.billboard.com/charts/billboard-global-200 - Status code: 200
https://www.billboard.com/charts/billboard-global-excl-us - Status code: 200


Unnamed: 0,song,artist,source
0,Life Goes On,BTS,Billboard Hot 100
1,Mood,24kGoldn Featuring iann dior,Billboard Hot 100
2,Dynamite,BTS,Billboard Hot 100
3,Positions,Ariana Grande,Billboard Hot 100
4,I Hope,Gabby Barrett Featuring Charlie Puth,Billboard Hot 100
5,Holy,Justin Bieber Featuring Chance The Rapper,Billboard Hot 100
6,Laugh Now Cry Later,Drake Featuring Lil Durk,Billboard Hot 100
7,Monster,Shawn Mendes & Justin Bieber,Billboard Hot 100
8,Blinding Lights,The Weeknd,Billboard Hot 100
9,Lemonade,Internet Money & Gunna Featuring Don Toliver &...,Billboard Hot 100


In [162]:
# Number of source labels collected, one per scraped chart entry (500 when this was run)
len(source)

500

In [163]:
# Display the combined frame of all scraped Billboard charts
billboard

Unnamed: 0,song,artist,source
0,Life Goes On,BTS,Billboard Hot 100
1,Mood,24kGoldn Featuring iann dior,Billboard Hot 100
2,Dynamite,BTS,Billboard Hot 100
3,Positions,Ariana Grande,Billboard Hot 100
4,I Hope,Gabby Barrett Featuring Charlie Puth,Billboard Hot 100
5,Holy,Justin Bieber Featuring Chance The Rapper,Billboard Hot 100
6,Laugh Now Cry Later,Drake Featuring Lil Durk,Billboard Hot 100
7,Monster,Shawn Mendes & Justin Bieber,Billboard Hot 100
8,Blinding Lights,The Weeknd,Billboard Hot 100
9,Lemonade,Internet Money & Gunna Featuring Don Toliver &...,Billboard Hot 100


### Drop duplicates

In [166]:
# A track can appear on several charts: keep only the first row of every
# (song, artist) pair (keeping the first occurrence is the default),
# then report the remaining (rows, columns).
billboard = billboard.drop_duplicates(['song', 'artist'])
billboard.shape

(272, 3)

## Deutsche Charts

Work in Progress...

In [70]:
# Singles chart page on offiziellecharts.de that this section tries to scrape.
# NOTE(review): the numeric suffix looks like a millisecond timestamp selecting the chart date - confirm.
url = "https://www.offiziellecharts.de/charts/single/for-date-1606431600000"

In [92]:
# Connectivity check: request the page and show only the HTTP status code.
# The response itself is discarded here; the page is requested again in the next cell.
requests.get(url).status_code

200

In [78]:
# Download the chart page; the raw body (r.content) is parsed in the next cell
r = requests.get(url)

In [89]:
# Parse the downloaded HTML into a searchable tree.
# The parser name has to be given as `features` (the second constructor
# argument); `parser=` is not a BeautifulSoup parameter and therefore does not
# select html5lib. Requires the html5lib package to be installed.
soup = bs(r.content, features="html5lib")

### Create list with rank of songs

In [96]:
# Try to locate the chart entries via <div class="wrap"> and display the matches.
# NOTE: this returned an empty list when the cell was run, i.e. the selector
# matched nothing in the parsed page.
rank_help = soup.find_all('div', {'class': 'wrap'})
rank_help

[]

In [82]:
# Turn the text of every matched element into an integer rank and show the result
ranks = [int(node.get_text()) for node in rank_help]
ranks

[]

In [24]:
# Remove repeated ranks while keeping the order in which they were scraped.
# A set has no defined iteration order, so list(set(...)) could reorder the
# values; dict.fromkeys keeps the first occurrence of each value in place.
ranks = list(dict.fromkeys(ranks))

In [107]:
# Probe for the first <td> element with class 'ch-pos'
# (no output was recorded for this cell).
soup.find('td', {'class': 'ch-pos'})

## Get list of Eurodance top hits from Wikipedia

In [145]:
# Read the HTML tables of the Wikipedia page; pd.read_html returns a list of DataFrames
eurodance = pd.read_html("https://en.wikipedia.org/wiki/List_of_Eurodance_songs")

In [146]:
# Number of tables parsed from the page (5 when this was run)
len(eurodance)

5

In [147]:
# Stack the first five parsed tables into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to combine frames row-wise.
eurodancewiki = pd.concat([eurodance[i] for i in range(5)], ignore_index=True)

In [148]:
# Display the stacked Wikipedia tables
eurodancewiki

Unnamed: 0,Year,Artist,Origin,Song
0,1989,Black Box,Italy,"""Ride On Time""[1]"
1,1989,Technotronic,Belgium,"""Pump Up the Jam""[2][3]"
2,1990,Snap!,Germany,"""The Power""[4]"
3,1991,2 Unlimited,The Netherlands,"""Get Ready for This""[5]"
4,1991,Army of Lovers,Sweden,"""Crucified""[6]"
5,1991,Black Box,Italy,"""Strike It Up""[5]"
6,1992,2 Unlimited,The Netherlands,"""Twilight Zone""[7]"
7,1992,Army of Lovers,Sweden,"""Ride the Bullet""[6]"
8,1992,Captain Hollywood Project,Germany,"""More and More""[8]"
9,1992,DJ BoBo,Switzerland,"""Somebody Dance with Me""[9]"


In [149]:
# Tag every row with where it came from, using a constant `source` column
eurodancewiki = eurodancewiki.assign(source="Wikipedia List of Eurodance Songs")

In [150]:
# Display the frame again, now including the `source` column
eurodancewiki

Unnamed: 0,Year,Artist,Origin,Song,source
0,1989,Black Box,Italy,"""Ride On Time""[1]",Wikipedia List of Eurodance Songs
1,1989,Technotronic,Belgium,"""Pump Up the Jam""[2][3]",Wikipedia List of Eurodance Songs
2,1990,Snap!,Germany,"""The Power""[4]",Wikipedia List of Eurodance Songs
3,1991,2 Unlimited,The Netherlands,"""Get Ready for This""[5]",Wikipedia List of Eurodance Songs
4,1991,Army of Lovers,Sweden,"""Crucified""[6]",Wikipedia List of Eurodance Songs
5,1991,Black Box,Italy,"""Strike It Up""[5]",Wikipedia List of Eurodance Songs
6,1992,2 Unlimited,The Netherlands,"""Twilight Zone""[7]",Wikipedia List of Eurodance Songs
7,1992,Army of Lovers,Sweden,"""Ride the Bullet""[6]",Wikipedia List of Eurodance Songs
8,1992,Captain Hollywood Project,Germany,"""More and More""[8]",Wikipedia List of Eurodance Songs
9,1992,DJ BoBo,Switzerland,"""Somebody Dance with Me""[9]",Wikipedia List of Eurodance Songs


In [151]:
# Discard the Year and Origin columns; the original frame is left untouched
eurodancewikicut = eurodancewiki.drop(columns=['Year', 'Origin'])

In [152]:
# Display the reduced frame (Artist, Song, source)
eurodancewikicut

Unnamed: 0,Artist,Song,source
0,Black Box,"""Ride On Time""[1]",Wikipedia List of Eurodance Songs
1,Technotronic,"""Pump Up the Jam""[2][3]",Wikipedia List of Eurodance Songs
2,Snap!,"""The Power""[4]",Wikipedia List of Eurodance Songs
3,2 Unlimited,"""Get Ready for This""[5]",Wikipedia List of Eurodance Songs
4,Army of Lovers,"""Crucified""[6]",Wikipedia List of Eurodance Songs
5,Black Box,"""Strike It Up""[5]",Wikipedia List of Eurodance Songs
6,2 Unlimited,"""Twilight Zone""[7]",Wikipedia List of Eurodance Songs
7,Army of Lovers,"""Ride the Bullet""[6]",Wikipedia List of Eurodance Songs
8,Captain Hollywood Project,"""More and More""[8]",Wikipedia List of Eurodance Songs
9,DJ BoBo,"""Somebody Dance with Me""[9]",Wikipedia List of Eurodance Songs


### Change column names to lower case

In [156]:
# Replace every column label by its lower-case form
eurodancewikicut.columns = [str.lower(label) for label in eurodancewikicut.columns]

In [157]:
# Show the column labels as a plain list to confirm the renaming
eurodancewikicut.columns.tolist()

['artist', 'song', 'source']

### Change order of columns

In [158]:
# Reorder the columns to song, artist, source - the same column order as the
# `billboard` frame that this frame is combined with further below
eurodancewikicut = eurodancewikicut[['song', 'artist', 'source']]

In [159]:
# Display the frame with the reordered columns
eurodancewikicut

Unnamed: 0,song,artist,source
0,"""Ride On Time""[1]",Black Box,Wikipedia List of Eurodance Songs
1,"""Pump Up the Jam""[2][3]",Technotronic,Wikipedia List of Eurodance Songs
2,"""The Power""[4]",Snap!,Wikipedia List of Eurodance Songs
3,"""Get Ready for This""[5]",2 Unlimited,Wikipedia List of Eurodance Songs
4,"""Crucified""[6]",Army of Lovers,Wikipedia List of Eurodance Songs
5,"""Strike It Up""[5]",Black Box,Wikipedia List of Eurodance Songs
6,"""Twilight Zone""[7]",2 Unlimited,Wikipedia List of Eurodance Songs
7,"""Ride the Bullet""[6]",Army of Lovers,Wikipedia List of Eurodance Songs
8,"""More and More""[8]",Captain Hollywood Project,Wikipedia List of Eurodance Songs
9,"""Somebody Dance with Me""[9]",DJ BoBo,Wikipedia List of Eurodance Songs


## Merge Billboard Top lists with Eurodance top hits from Wikipedia

In [168]:
# Stack the Billboard rows and the Wikipedia Eurodance rows into one frame
# with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to combine frames row-wise.
tophits = pd.concat([billboard, eurodancewikicut], ignore_index=True)

In [170]:
# (rows, columns) of the merged table; (369, 3) when this was run
tophits.shape

(369, 3)