In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import random


# Lab | Web Scraping Single Page (GNOD part 1)

In [2]:
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [3]:
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [4]:
soup = BeautifulSoup(response.content, "html.parser")

In [5]:
# soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div ")

In [6]:
# Print titles of the songs
for title_element in soup.find_all(class_='title'):
    print(title_element.get_text())

Lovin On Me
Lil Boo Thang
I Remember Everything (feat. Kacey Musgraves)
White Horse
Need A Favor
Save Me (with Lainey Wilson)
Around Me the Cold Night
90s Rap Mashup
Cruel Summer
Next Winter I Will Wait for You Forever
Standing Next to You
Thinkin’ Bout Me
All I Want for Christmas Is You
Christmas / Sarajevo 12/24 (Instrumental)
3D
greedy
Even Though I Know You
Fast Car
Lose Control
Sin So Sweet
Now And Then
In the Sky a White Cloud Drifted Lazily
Water
Houdini
Standing Next to You (Band Version)
Rockin' Around the Christmas Tree (Single)
Standing Next to You (Future Funk Remix)
Standing Next to You (Slow Jam Remix)
Standing Next to You (Holiday Remix)
Standing Next to You (PBR&B Remix)
Standing Next to You (Latin Trap Remix)
Standing Next to You (Instrumental)
Last Night
Where the Wild Things Are
Now I Know I Know I've Lost You
Fairytale Of Philadelphia
Paint The Town Red
The Wind Brings Winter
Why Don't You Tell the Truth ?
Flyin' Pass (Single)
Still Not Fade the Footprints
Wrap Me U

In [7]:
songs = []
artists = []

for li in soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div "):
    for line in li.select('cite.title'):
        songs.append(line.get_text())
    for line in li.select('em.artist'):
        artists.append(line.get_text())

In [8]:
# each list becomes a column
top_chart_songs = pd.DataFrame({"song": songs, "artist": artists})
top_chart_songs

Unnamed: 0,song,artist
0,Lovin On Me,Jack Harlow
1,Lil Boo Thang,Paul Russell
2,I Remember Everything (feat. Kacey Musgraves),Zach Bryan
3,White Horse,Chris Stapleton
4,Need A Favor,Jelly Roll
...,...,...
95,Last Christmas (Single Version),Wham!
96,vampire,Olivia Rodrigo
97,White Christmas,Bing Crosby
98,World on Fire,Nate Smith


In [9]:
# Get User Input
user_input_song = input("Enter a song: ")

Enter a song: g6


In [10]:
# Convert the user input and song names to lowercase for case-insensitive comparison
user_input_song_lower = user_input_song.lower()
top_chart_songs_lower = top_chart_songs.apply(lambda x: x.str.lower())

In [11]:
# Check if the user-inputted song is in the top chart songs list (case-insensitive)
if user_input_song_lower in top_chart_songs_lower['song'].values:
    # Filter out the user-inputted song from the DataFrame
    top_chart_songs_without_input = top_chart_songs[
        top_chart_songs_lower['song'] != user_input_song_lower
    ]

    # Recommend a random song from the filtered DataFrame
    recommended_song = top_chart_songs_without_input.sample().squeeze()

    print(f"You might also like: {recommended_song['song']} by {recommended_song['artist']}")
else:
    print("We can't provide a recommendation right now.")

You might also like: Anti-Hero by Taylor Swift


# Lab | Web Scraping Multiple Pages


### Instructions Part 1
Expand the project
If you're done, you can try to expand the project on your own. Here are a few suggestions:

- Find other lists of hot songs on the internet and scrape them too: having a bigger pool of songs will be awesome!
- Apply the same logic to other "groups" of songs: the best songs from a decade or from a country / culture / language / genre.
- Wikipedia maintains a large collection of lists of songs: https://en.wikipedia.org/wiki/Lists_of_songs

In [12]:
url = "https://www.rollingstone.com/music/music-lists/best-songs-of-all-time-1224767/aretha-franklin-respect-2-1225337/"

In [13]:
response = requests.get(url)
response.status_code

200

In [14]:
soup = BeautifulSoup(response.content, "html.parser")

In [15]:
#soup.head()

In [16]:
#[<meta charset="utf-8"/>,
 #<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>,
 #<meta content="#ffffff" name="theme-color"/>,
 #<meta content="width=device-width, initial-scale=1.0" name="viewport">
 #<!--
 #		 _     _ _           ____          _          _____ _    ___
 #		| |   (_) | _____   / ___|___   __| | ___    | ____| |__|__ \
 #		| |   | | |/ / _ \ | |   / _ \ / _` |/ _ \   |  _| | '_ \ / /
 #		| |___| |   <  __/ | |__| (_) | (_| |  __/_  | |___| | | |_|
 #		|_____|_|_|\_\___|  \____\___/ \__,_|\___( ) |_____|_| |_(_)
 #												  |/


In [17]:
# prestab = soup.select("h2")
# prestab

In [18]:
# Print titles of the songs
for h2 in soup.find_all("h2"):
    print(h2.get_text())

Daddy Yankee, ‘Gasolina’
Lauryn Hill, ‘Doo Wop (That Thing)’
Radiohead, ‘Idioteque’
Elton John, ‘Tiny Dancer’
M.I.A., ‘Paper Planes’
Kendrick Lamar, ‘Alright’
Michael Jackson, ‘Billie Jean’
The Temptations, ‘My Girl’
Bob Marley and the Wailers, ‘Redemption Song’
Joy Division, ‘Love Will Tear Us Apart’
The Jimi Hendrix Experience, ‘All Along the Watchtower’
Outkast, ‘B.O.B.’
Otis Redding, ‘(Sittin’ On) the Dock of the Bay’
Prince and the Revolution, ‘When Doves Cry’
The White Stripes, ‘Seven Nation Army’
Little Richard, ‘Tutti-Frutti’
James Brown, ‘Papa’s Got a Brand New Bag’
Chuck Berry, ‘Johnny B. Goode’
Notorious B.I.G., ‘Juicy’
The Rolling Stones, ‘(I Can’t Get No) Satisfaction’
Lorde, ‘Royals’
Dr. Dre feat. Snoop Doggy Dogg, ‘Nuthin’ but a ‘G’ Thang’
Talking Heads, ‘Once in a Lifetime’
Bruce Springsteen, ‘Born to Run’
Joni Mitchell, ‘A Case of You’
Kanye West feat. Pusha T, ‘Runaway’
The Beatles, ‘A Day in the Life’
David Bowie, ‘Heroes’
The Ronettes, ‘Be My Baby’
Billie Holiday, ‘

In [19]:
import re

artists = []
songs = []

# This loop iterates over each h2 element in the HTML soup, excluding the last three elements.
for h2 in soup.find_all("h2")[:-3]:
    title_parts = h2.get_text().split(', ')

    # checks if the title_parts list has two elements (artist and song)
    if len(title_parts) == 2:
        artist, song = title_parts
        artists.append(artist)
        
        # Use regex to remove non-alphanumeric characters
        cleaned_song = re.sub(r'^\W+|\W+$', '', song)
        songs.append(cleaned_song)

# Print the lists
print("Artists:")
print(artists)

print("\nSongs:")
print(songs)


Artists:
['Daddy Yankee', 'Lauryn Hill', 'Radiohead', 'Elton John', 'M.I.A.', 'Kendrick Lamar', 'Michael Jackson', 'The Temptations', 'Bob Marley and the Wailers', 'Joy Division', 'The Jimi Hendrix Experience', 'Outkast', 'Otis Redding', 'Prince and the Revolution', 'The White Stripes', 'Little Richard', 'James Brown', 'Chuck Berry', 'Notorious B.I.G.', 'The Rolling Stones', 'Lorde', 'Dr. Dre feat. Snoop Doggy Dogg', 'Talking Heads', 'Bruce Springsteen', 'Joni Mitchell', 'Kanye West feat. Pusha T', 'The Beatles', 'David Bowie', 'The Ronettes', 'Billie Holiday', 'Robyn', 'John Lennon', 'Prince and the Revolution', 'Queen', 'Beyoncé feat. Jay-Z', 'The Beatles', 'The Kinks', 'The Rolling Stones', 'Stevie Wonder', 'The Beach Boys', 'Outkast', 'Fleetwood Mac', 'Missy Elliott', 'The Beatles', 'Marvin Gaye', 'Nirvana', 'Bob Dylan', 'Sam Cooke', 'Public Enemy', 'Aretha Franklin']

Songs:
['Gasolina', 'Doo Wop (That Thing', 'Idioteque', 'Tiny Dancer', 'Paper Planes', 'Alright', 'Billie Jean', '

In [20]:
# each list becomes a column
top_chart_songs2 = pd.DataFrame({"song": songs, "artist": artists})
top_chart_songs2

Unnamed: 0,song,artist
0,Gasolina,Daddy Yankee
1,Doo Wop (That Thing,Lauryn Hill
2,Idioteque,Radiohead
3,Tiny Dancer,Elton John
4,Paper Planes,M.I.A.
5,Alright,Kendrick Lamar
6,Billie Jean,Michael Jackson
7,My Girl,The Temptations
8,Redemption Song,Bob Marley and the Wailers
9,Love Will Tear Us Apart,Joy Division


In [21]:
# Concatenate
top_songs = pd.concat([top_chart_songs, top_chart_songs2])
top_songs = top_songs.reset_index(drop=True)
top_songs


Unnamed: 0,song,artist
0,Lovin On Me,Jack Harlow
1,Lil Boo Thang,Paul Russell
2,I Remember Everything (feat. Kacey Musgraves),Zach Bryan
3,White Horse,Chris Stapleton
4,Need A Favor,Jelly Roll
...,...,...
145,Smells Like Teen Spirit,Nirvana
146,Like a Rolling Stone,Bob Dylan
147,A Change Is Gonna Come,Sam Cooke
148,Fight the Power,Public Enemy


In [22]:
top_songs = top_songs.to_csv('top_songs.csv', index=False)

In [23]:
top_songs = pd.read_csv('top_songs.csv')

In [24]:
def remove_special_characters(text):
    # Remove special characters using a regular expression
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


def recommend_song(top_songs):
    # Get User Input
    user_input_song = input("Enter a song: ")
    
    # Convert the user input and song names to lowercase for case-insensitive comparison
    user_input_song_lower = user_input_song.lower()
    top_songs_lower = top_songs.apply(lambda x: x.str.lower().apply(remove_special_characters))
    
    # Check if the user-inputted song is in the list (case-insensitive)
    if user_input_song_lower in top_songs_lower['song'].values:
        # Filter out the user-inputted song from the DataFrame
        top_songs_without_input = top_songs[
            top_songs_lower['song'] != user_input_song_lower
        ]

        # Recommend a random song from the filtered df
        recommended_song = top_songs_without_input.sample().squeeze()

        print(f"You might also like: {recommended_song['song']} by {recommended_song['artist']}")
    else:
        print("We can't provide a recommendation right now.")


In [25]:
recommend_song(top_songs)

Enter a song: g6
You might also like: Weather Forecast Today Rain by The day you


In [26]:
# url = "https://en.wikipedia.org/wiki/List_of_most-streamed_songs_in_the_United_Kingdom"
# response = requests.get(url)
# soup = BeautifulSoup(response.content, "html.parser")
# table = soup.find('table', class_='wikitable')
# print(table)
# songs = []
# uk_artists = []
# row_count = 0
# for row in table.find_all('tr')[1:]:
#     columns = row.find_all(['td','th'])
#     songs.append(columns[1].text.strip())
#     uk_artists.append(columns[2].text.strip())
    
#     row_count+=1
#     if row_count >= 48 :
#         break

# uk_songs = pd.DataFrame({'Title':songs, 'Artists':uk_artists})
# uk_songs