## Lab | Web Scraping Single Page (GNOD part 1)
Scrape the current top 100 songs and their respective artists, and put the information into a pandas dataframe.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
import re

### find url and store it in a variable

In [2]:
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

### download html with a get request

In [3]:
response = requests.get(url)

In [4]:
response.status_code # 200 status code means OK!

200

In [5]:
response.content

b'<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><title>iTunes Top 100 Songs Chart 2024</title><meta name="viewport" content="width=device-width, initial-scale=1"><meta name="description" content="iTunes top 100 songs chart list. The most popular hit music and trending songs of 2024. Chart of today\'s current iTunes top 100 songs is updated daily."><meta property="og:title" content="iTunes Top 100 Songs Chart 2024"/><meta property="og:description" content="Chart of the top 100 songs on iTunes. Chart list of the top 100 song downloads of 2024 is updated daily."/><meta property="og:type" content="article"/><meta property="og:image" content="https://www.popvortex.com/images/logo-facebook.png"/><meta property="og:site_name" content="PopVortex"/><meta property="og:url" content="https://www.popvortex.com/music/charts/top-100-songs.php"/><meta property="fb:admins" content="100000239962942"/><meta property="fb:app_id" content="178831188827052"/><link rel="shortcut icon" href="/favi

### create a soup (and check the output if it is a html file)

In [6]:
soup = BeautifulSoup(response.content, "html.parser")
soup

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><title>iTunes Top 100 Songs Chart 2024</title><meta content="width=device-width, initial-scale=1" name="viewport"/><meta content="iTunes top 100 songs chart list. The most popular hit music and trending songs of 2024. Chart of today's current iTunes top 100 songs is updated daily." name="description"/><meta content="iTunes Top 100 Songs Chart 2024" property="og:title"><meta content="Chart of the top 100 songs on iTunes. Chart list of the top 100 song downloads of 2024 is updated daily." property="og:description"><meta content="article" property="og:type"><meta content="https://www.popvortex.com/images/logo-facebook.png" property="og:image"/><meta content="PopVortex" property="og:site_name"/><meta content="https://www.popvortex.com/music/charts/top-100-songs.php" property="og:url"/><meta content="100000239962942" property="fb:admins"/><meta content="178831188827052" property="fb:app_id"/><link href="/favicon.png" rel="shortcut

### retrieve/extract the desired info

In [7]:
# Creating a soup with titles

In [8]:
len(soup.select('div.chart-content.col-xs-12.col-sm-8 > p > cite'))

100

In [9]:
# list with titles

title = []  


for li in soup.select('div.chart-content.col-xs-12.col-sm-8 > p > cite'):
    title.append(li.get_text())  
print(title)

["TEXAS HOLD 'EM", 'Lose Control', 'Beautiful Things', "TEXAS HOLD 'EM", 'dont let me go', 'Lovin On Me', 'Flowers', "Don't Let the Old Man In", 'Selfish', 'I Remember Everything (feat. Kacey Musgraves)', 'Turn the Lights Back On', 'Beautiful Messes', 'Where the Wild Things Are', 'Made For Me', 'Fast Car', 'Save Me', '16 CARRIAGES', 'Houdini', 'Lil Boo Thang', 'Live Like You Were Dying', 'Powerful Women', 'Need a Favor', 'Fast Car', 'Spin You Around (1/24)', 'Save Me (with Lainey Wilson)', 'Spicy Margarita', 'Cruel Summer', 'Training Season', 'Yeah! (feat. Lil Jon & Ludacris)', 'Good Day', 'White Horse', 'Three Little Birds', "Let's Go", 'Wildflowers and Wild Horses (Single Version)', 'Thinkin’ Bout Me', "Should've Been a Cowboy", 'Murder On the Dancefloor (Radio Edit)', 'What Was I Made For? (From The Motion Picture "Barbie")', 'Stick Season', 'The Door', 'Pretty Little Poison', 'TRUCK BED', "Nothing's Gonna Stop Us Now", 'Standing Next to You', 'Lead Me Home', 'Devil You Know', 'Stan

In [10]:
len(title)

100

In [11]:
# 1. this was copied selector from the website:

#chart-position-4 > div.chart-content.col-xs-12.col-sm-8 > p > em

# 2. "chart-position-" was reapeating with each number
# 3. we delete "chart-position-" from the soup selection

In [12]:
len(soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em"))

100

In [13]:
artist = []  


for li in soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em"):
    artist.append(li.get_text())  
print(artist)

['Beyoncé', 'Teddy Swims', 'Benson Boone', 'Beyoncé', 'mgk', 'Jack Harlow', 'Miley Cyrus', 'Toby Keith', 'Justin Timberlake', 'Zach Bryan', 'Billy Joel', 'Hillary Scott & The Scott Family', 'Luke Combs', 'Muni Long', 'Luke Combs', 'Jelly Roll', 'Beyoncé', 'Dua Lipa', 'Paul Russell', 'Tim McGraw', 'Pitbull & Dolly Parton', 'Jelly Roll', 'Tracy Chapman', 'Morgan Wallen', 'Jelly Roll', 'Jason Derulo & Michael Bublé', 'Taylor Swift', 'Dua Lipa', 'USHER', 'Forrest Frank', 'Chris Stapleton', 'Bob Marley & The Wailers', 'Key Glock', 'Lainey Wilson', 'Morgan Wallen', 'Toby Keith', 'Sophie Ellis-Bextor', 'Billie Eilish', 'Noah Kahan', 'Teddy Swims', 'Warren Zeiders', 'HARDY', 'Starship', 'Jung Kook', 'Jamey Johnson', 'Tyler Braden', 'Jung Kook & USHER', 'Tyla', 'Chris Stapleton', 'Morgan Wallen', 'Stephen Sanchez', 'Disturbed', 'Dua Lipa', 'The Red Clay Strays', 'Jung Kook & USHER', 'Chayce Beckham', 'Lee Brice', 'Hank Williams, Jr.', 'YG Marley', 'Jung Kook', 'Taylor Swift', 'Jung Kook', 'Jung

In [14]:
len(artist)

100

In [15]:
top_100_songs = pd.DataFrame({"title":title,
                              "artist":artist
                              })

In [16]:
top_100_songs

Unnamed: 0,title,artist
0,TEXAS HOLD 'EM,Beyoncé
1,Lose Control,Teddy Swims
2,Beautiful Things,Benson Boone
3,TEXAS HOLD 'EM,Beyoncé
4,dont let me go,mgk
...,...,...
95,Can't Get Enough,Jennifer Lopez
96,Unstoppable,Sia
97,Until I Found You (Em Beihold Version),Stephen Sanchez & Em Beihold
98,greedy,Tate McRae


## GNOD part 2

STEPS 

1. Preparing database with hot songs: top_100_songs
    - checking the length of the input (if shorter than 3, then the user needs to correct the input)
    - all titles lowercase
    - all artists lowercase 

2. Optimalisation of the user_input
    - lowercase user_input
    - whitestrip
    - what if not the whole word was given? Checking only part of the matching
    - 
4. Checking matching if the song is in the database
    - Yes: Give another recommendation
    - No: No recommendation

#### 1. hot songs database (preparing for further steps)

In [17]:
top_100_songs["title"]

0                             TEXAS HOLD 'EM
1                               Lose Control
2                           Beautiful Things
3                             TEXAS HOLD 'EM
4                             dont let me go
                       ...                  
95                          Can't Get Enough
96                               Unstoppable
97    Until I Found You (Em Beihold Version)
98                                    greedy
99                        Don't Stop Praying
Name: title, Length: 100, dtype: object

In [18]:
def clean_whitestripe_lowcase(row):
    # Function to clean whitestrips and make values to lowercase
    row = row.lower()
    row = row.strip()    
    return row

In [19]:
top_100_songs["title"] = top_100_songs["title"].apply(clean_whitestripe_lowcase)
top_100_songs["artist"] = top_100_songs["artist"].apply(clean_whitestripe_lowcase)

In [32]:
top_100_songs.tail()

Unnamed: 0,title,artist
95,can't get enough,jennifer lopez
96,unstoppable,sia
97,until i found you (em beihold version),stephen sanchez & em beihold
98,greedy,tate mcrae
99,don't stop praying,matthew west


In [21]:
top_100_songs.dtypes

title     object
artist    object
dtype: object

In [22]:
top_100_songs.to_csv("top_100_songs.csv", index=False)

#### 2. Optimalisation of the user_input

In [23]:
def clean_user_input(user_input):
    # Function to clean whitestrips and make values to lowercase
    user_input = user_input.lower()
    user_input = user_input.strip()
    return user_input

In [28]:
user_input = input("Please give a name of your favourite song: ")

Please give a name of your favourite song: lose


In [29]:
while len(user_input) <= 3:
    print("Too short, put again")
    user_input = input("Please give a name of your favourite song: ")
if len(user_input)>3:
    user_input = clean_user_input(user_input)
    print(user_input)

lose


#### 3. Checking if currently hot

In [30]:
user_input

'lose'

In [31]:
result = top_100_songs["title"].apply(lambda x: user_input in x)

if (result == False).all() != True:
    print("new song recommendation for you: ")
    print(top_100_songs["title"][randint(0,99)]) 
else:
    print("No reccomendation at the moment")

new song recommendation for you: 
should've been a cowboy
