In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:


# Set up Selenium headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

data = []
def scrape_page(year):
    if year == 2020:
        return
    url = "https://eurovisionworld.com/eurovision/" + str(year)
    driver.get(url)
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Find the main winners table
    table = soup.find("table", class_="v_table")

    # Loop through each row, skip malformed or empty ones
    for row in table.find_all("tr"):
        cols = row.find_all("td")
        
        # Skip rows that don't have enough data
        if len(cols) < 4:
            continue

        # Parse year and skip 2020 (canceled year)
        

        country = cols[1].get_text(strip=True)

        # Handle song and artist (may be combined or malformed)
        song_artist_cell = cols[2]
        song = ""
        artist = ""

        # Some song cells are split into text + <span class="v_artist"> tag
        if song_artist_cell:
            parts = song_artist_cell.get_text(separator="|", strip=True).split("|")
            if len(parts) >= 2:
                song = parts[0]
                artist = parts[1]
            elif len(parts) == 1:
                song = parts[0]

        points = cols[3].get_text(strip=True)

        data.append([year, country, song, artist, points])
        print([year, country, song, artist, points])

for year in range(2024, 1955, -1):
    scrape_page(year)

driver.quit()
# Create DataFrame
df = pd.DataFrame(data, columns=["Year", "Country", "Song", "Artist", "Points"])
print(df.head())

# # Save to CSV
# df.to_csv("all_songs.csv", index=False)


[2024, 'Switzerland', 'The Code', 'Nemo', '591']
[2024, 'Croatia', 'Rim Tim Tagi Dim', 'Baby Lasagna', '547']
[2024, 'Ukraine', 'Teresa & Maria', 'alyona alyona & Jerry Heil', '453']
[2024, 'France', 'Mon amour', 'Slimane', '445']
[2024, 'Israel', 'Hurricane', 'Eden Golan', '375']
[2024, 'Ireland', 'Doomsday Blue', 'Bambie Thug', '278']
[2024, 'Italy', 'La noia', 'Angelina Mango', '268']
[2024, 'Armenia', 'Jako', 'Ladaniva', '183']
[2024, 'Sweden', 'Unforgettable', 'Marcus & Martinus', '174']
[2024, 'Portugal', 'Grito', 'Iolanda', '152']
[2024, 'Greece', 'Zari', 'Marina Satti', '126']
[2024, 'Germany', 'Always on the Run', 'Isaak', '117']
[2024, 'Luxembourg', 'Fighter', 'Tali', '103']
[2024, 'Lithuania', 'Luktelk', 'Silvester Belt', '90']
[2024, 'Cyprus', 'Liar', 'Silia Kapsis', '78']
[2024, 'Latvia', 'Hollow', 'Dons', '64']
[2024, 'Serbia', 'Ramonda', 'Teya Dora', '54']
[2024, 'United KingdomUK', 'Dizzy', 'Olly Alexander', '46']
[2024, 'Finland', 'No Rules!', 'Windows95man', '38']
[20

In [3]:
df.to_csv('all_songs.csv')

In [4]:
df

Unnamed: 0,Year,Country,Song,Artist,Points
0,2024,Switzerland,The Code,Nemo,591
1,2024,Croatia,Rim Tim Tagi Dim,Baby Lasagna,547
2,2024,Ukraine,Teresa & Maria,alyona alyona & Jerry Heil,453
3,2024,France,Mon amour,Slimane,445
4,2024,Israel,Hurricane,Eden Golan,375
...,...,...,...,...,...
1419,1956,Belgium,Le Plus Beau Jour De Ma Vie,Mony Marc,–
1420,1956,Germany,So Geht Das Jede Nacht,Freddy Quinn,–
1421,1956,France,Il Est Là,Dany Dauberson,–
1422,1956,Luxembourg,Les Amants De Minuit,Michèle Arnaud,–


In [2]:
df = pd.read_csv('all_songs.csv')

In [3]:
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

def scrape_lyrics(year, country):
    try:
        url = f'https://eurovisionworld.com/eurovision/{year}/{country}'
        driver.get(url)
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        data = soup.find('div', attrs={'class': 'lyrics_div', 'data-lyrics-version': 'English'})
        para = data.find_all('p')
        lyrics = '\n'.join([re.sub(r'<p[^>]*>', '', str(i).replace('<br/>', '\n').replace('</p>', '')) for i in para])
        return lyrics
    except:
        return 'none'

    # request = requests.get(url)
    # data = bs4.BeautifulSoup(request.text, 'html.parser')
    # l = data.find_all('div', attrs={'class':'top'}) #, 'data-lyrics-version': 'English'})
    # print(l)


lyrics_list = []
for i in range(len(df)):
    year = df.iloc[i]['Year']
    country = df.iloc[i]['Country']
    print(f'{year} {country}')
    lyrics_list.append(scrape_lyrics(year, country))

driver.quit()

print(lyrics_list)

df['Lyrics'] = pd.Series(lyrics_list)
print(df.head())
df.to_csv('all_songs.csv')

2024 Switzerland
2024 Croatia
2024 Ukraine
2024 France
2024 Israel
2024 Ireland
2024 Italy
2024 Armenia
2024 Sweden
2024 Portugal
2024 Greece
2024 Germany
2024 Luxembourg
2024 Lithuania
2024 Cyprus
2024 Latvia
2024 Serbia
2024 United KingdomUK
2024 Finland
2024 Estonia
2024 Georgia
2024 Spain
2024 Slovenia
2024 Austria
2024 Norway
2023 Sweden
2023 Finland
2023 Israel
2023 Italy
2023 Norway
2023 Ukraine
2023 Belgium
2023 Estonia
2023 Australia
2023 Czechia
2023 Lithuania
2023 Cyprus
2023 Croatia
2023 Armenia
2023 Austria
2023 France
2023 Spain
2023 Moldova
2023 Poland
2023 Switzerland
2023 Slovenia
2023 Albania
2023 Portugal
2023 Serbia
2023 United KingdomUK
2023 Germany
2022 Ukraine
2022 United KingdomUK
2022 Spain
2022 Sweden
2022 Serbia
2022 Italy
2022 Moldova
2022 Greece
2022 Portugal
2022 Norway
2022 Netherlands
2022 Poland
2022 Estonia
2022 Lithuania
2022 Australia
2022 Azerbaijan
2022 Switzerland
2022 Romania
2022 Belgium
2022 Armenia
2022 Finland
2022 Czechia
2022 Iceland
2022 F