# Eurovision Data Scraping
#### This code collects the Eurovision data and saves it in CSV format for further analysis.

## Importing Libraries

In [None]:
import grequests
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm, trange

import warnings
warnings.simplefilter("ignore", category=UserWarning)

## Supporting Functions

In [None]:
# Request headers with a browser-like User-Agent, passed to the requests/grequests calls below.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0',
}

In [None]:
# The function scrapes the events' metadata and URLs

def get_events(path, pages=5, timeout=30):
    """Scrape the events table and each event's detail URL from the history pages.

    Args:
        path: Site root (e.g. 'https://eurovision.tv'); '/history' is appended to it.
        pages: Number of history pages to request (page indices 0 .. pages-1).
            Defaults to 5, the page count this scraper was written for.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame with the columns Year, City, Winner, Participant, Song,
        Points and Url, where Url is `path` joined with the href of the
        row's 'More' link.

    Raises:
        requests.HTTPError: If a history page answers with an error status.
        ValueError: If the number of 'More' links does not match the number
            of table rows, so the URLs cannot be paired with the events.
    """
    tables = []
    urls = []

    for page in range(pages):
        search = {'search': '', 'page': page}
        response = requests.get(path + '/history', params=search,
                                headers=headers, timeout=timeout)
        # Fail here with a clear HTTP error instead of trying to parse an error page.
        response.raise_for_status()

        # The first table on the page holds the events.
        tables.append(pd.read_html(response.content, flavor='lxml')[0])

        # Every event row carries a 'More' link pointing at the event's own page.
        soup = BeautifulSoup(response.text, 'lxml')
        urls.extend(path + link.get('href') for link in soup.find_all('a', string='More'))

    # ignore_index gives the combined table a unique 0..n-1 index instead of
    # repeating each page's own row labels.
    events = pd.concat(tables, ignore_index=True)
    events.columns = ['Year', 'City', 'Winner', 'Participant', 'Song', 'Points', 'Url']

    if len(urls) != len(events):
        raise ValueError(
            f"Found {len(urls)} 'More' links for {len(events)} event rows; "
            "cannot pair URLs with events."
        )
    events['Url'] = urls

    return events

In [None]:
# The function determines the contest's format for a certain year - the function is used by the get_contests function

def event_format(year):
    """Return the list of contest-format slugs that applies to `year`.

    1956-2003 map to a single final, 2004-2007 to a grand final plus one
    semi-final, and every other year (2008 onwards, but also anything that
    falls outside the two ranges) to two semi-finals plus a grand final.
    """
    formats_by_period = (
        (range(1956, 2004), ['final']),
        (range(2004, 2008), ['grand-final', 'semi-final']),
    )
    for period, formats in formats_by_period:
        if year in period:
            return formats
    return ['first-semi-final', 'second-semi-final', 'grand-final']

In [None]:
# The function scrapes the contests' metadata.

def get_contests(events):
    """Scrape the contest table of every show of every event.

    `events` needs a 'Year' and a 'Url' column. For each event, one table is
    read per format returned by event_format(Year) from '<Url>/<format>';
    the tables are tagged with 'Year' and 'Format' columns and concatenated
    into the returned DataFrame.
    """
    tables = []

    with trange(len(events)) as progress:
        for _, event in events.iterrows():
            year = event['Year']
            progress.desc = f"{year}\t"     # show the year being scraped

            for stage in event_format(year):
                page_url = event['Url'] + '/' + stage
                table = pd.read_html(page_url, flavor='lxml')[0]
                tables.append(table.assign(Year=year, Format=stage))

            progress.update(1)

    return pd.concat(tables)

In [None]:
# The function scrapes the points given by jury members and televoters in the given contests.

def _fetch_responses(urls, batch_size):
    """Request all `urls` concurrently and return the grequests.map() result list."""
    with trange(len(urls)) as bar:
        bar.desc = 'Working ... Please hold...\t'    # Progress-bar
        pending = []
        for url in urls:
            pending.append(grequests.get(url, headers=headers))
            bar.update(1)     # Progress-bar update
        return grequests.map(pending, size=batch_size)


def _container_scores(containers, position):
    """Return the '|'-separated text tokens of one score container, or [] if it is missing."""
    if position >= len(containers):
        return []
    # The first token and the last two tokens are dropped, as in the original scraper.
    return containers[position].get_text("|", strip=True).split(sep='|')[1:-2]


def _scrape_scores(urls, batch_size, with_televoters, description):
    """Fetch and parse the score tokens of every results page in `urls`."""
    urls = list(urls)
    responses = _fetch_responses(urls, batch_size)
    rows = []
    failed = []

    with trange(len(urls)) as bar:
        bar.desc = description     # Progress-bar
        # grequests.map() keeps the request order, so each response is paired
        # with the URL that was asked for.
        for url, response in zip(urls, responses):
            jury_scores, televoters_scores = [], []
            if response is None or not response.ok:
                # A failed request is recorded with empty score lists instead of crashing.
                failed.append(url)
            else:
                soup = BeautifulSoup(response.content, 'lxml')
                containers = soup.find_all(attrs={'class': 'views-element-container'})
                if with_televoters:
                    # Pages with both score kinds list the televoters first, then the juries.
                    televoters_scores = _container_scores(containers, 0)
                    jury_scores = _container_scores(containers, 1)
                else:
                    jury_scores = _container_scores(containers, 0)
            # Key the row by the requested URL (not response.url, which may differ
            # after a redirect) so it can be joined back onto the caller's table.
            rows.append(dict(Url=url, Jury_scores=jury_scores,
                             Televoters_scores=televoters_scores))
            bar.update(1)     # Progress-bar update

    if failed:
        warnings.warn(f"{len(failed)} results page(s) could not be fetched; "
                      f"first one: {failed[0]}", RuntimeWarning)
    return rows


def get_scores(contests_urls):
    """Scrape the jury and televoter score tokens of the given results pages.

    Args:
        contests_urls: DataFrame with a 'Year' and a 'Url' column, one row per
            results page to scrape.

    Returns:
        A DataFrame with the columns Url, Jury_scores and Televoters_scores.
        Url is the requested URL; the two score columns hold lists of raw text
        tokens taken from the page. Pages that could not be fetched or that
        lack the expected container get empty lists.
    """
    batches = 20     # number of requests grequests sends concurrently

    # Before 2016 there are officially published scores only for the juries.
    # From 2016 there are officially published scores for juries and televoters.
    url_juries = contests_urls[contests_urls['Year'] < 2016]['Url']
    url_juries_and_televoters = contests_urls[contests_urls['Year'] >= 2016]['Url']

    rows = _scrape_scores(url_juries, batches, False,
                          'Scraping contests with only Juries scores\t')
    rows += _scrape_scores(url_juries_and_televoters, batches, True,
                           'Scraping contests with Juries and Televoters scores\t')

    # Explicit columns keep the frame's shape stable even when nothing was scraped.
    return pd.DataFrame(rows, columns=['Url', 'Jury_scores', 'Televoters_scores'])

In [None]:
# The function turns the scraped scores data into a new, tidy DataFrame (one row per score given)

def extract_score(scores_data):
    """Flatten the scraped score-token lists into one row per score given.

    Each token list alternates between a numeric token (a score) and the
    non-numeric tokens that follow it (the names that gave that score).

    Args:
        scores_data: DataFrame with the columns Year, Format, Country,
            Jury_scores and Televoters_scores; the two score columns hold
            lists of string tokens.

    Returns:
        A DataFrame with the columns Year, Format, Country, From, Score_type
        and Score. All 'Jury' rows come first, followed by all 'Televoters'
        rows. Score keeps the token's original string form.
    """
    columns = ['Year', 'Format', 'Country', 'From', 'Score_type', 'Score']
    sources = (('Jury_scores', 'Jury'), ('Televoters_scores', 'Televoters'))
    rows = []

    for column, score_type in sources:
        for _, row in scores_data.iterrows():
            tokens = row[column]
            # A cell with no scraped list (e.g. NaN after an unmatched join) has nothing to extract.
            if not isinstance(tokens, (list, tuple)):
                continue

            # Reset per row so a score never leaks over from the previous row.
            score = None
            for token in tokens:
                if token.isdigit():
                    score = token
                elif score is not None:
                    rows.append({'Year': row['Year'], 'Format': row['Format'],
                                 'Country': row['Country'], 'From': token,
                                 'Score_type': score_type, 'Score': score})
                # A name that appears before any score cannot be attributed and is skipped.

    # Explicit columns keep the frame's shape stable even when no rows were produced.
    return pd.DataFrame(rows, columns=columns)

## Extracting the data

### Getting 'Events' data
Scraping the metadata of all the Eurovision events over the years.

In [None]:
# Scrape the events table (with each event's Url) from the site's history pages.
df_events = get_events('https://eurovision.tv')
df_events.head()     # preview the first rows

In [None]:
# Save the events table to CSV, leaving out the DataFrame index.
df_events.to_csv('Eurovision events.csv',index=False)

### Getting 'Contests' data
Scraping the contests metadata including the inner contests in each event (semi-final, first & second semi-finals, grand-final)

In [None]:
# Reduce the events table to its distinct (Year, Url) pairs; the groupby counts are discarded.
events = df_events.groupby(by=['Year','Url']).count().reset_index()[['Year','Url']]
# Scrape the contest tables of every event.
df_contests = get_contests(events)
df_contests.head()     # preview the first rows

In [None]:
# Save the contests table to CSV, leaving out the DataFrame index.
df_contests.to_csv('Eurovision contests.csv',index=False)

### Getting 'Score' data
Scraping the Televoter's and Jury's scores in each contest

In [None]:
# Distinct (Year, Url) pairs of all events except 1956
# (presumably no per-country results exist for that year; confirm).
events = df_events[df_events['Year']!=1956].groupby(by=['Year','Url']).count().reset_index()[['Year','Url']]
# Attach every contest entry (Format, Country) to the Url of its event, matching on Year.
contests = events.join(df_contests[['Year','Format','Country']].set_index('Year'),on='Year')

# Lower-cased country names that need a different form inside the results URL.
# Names not listed here are used lower-cased, as they are.
replacements = {'united kingdom':'united-kingdom','bosnia & herzegovina':'bosnia-herzegovina','serbia & montenegro':'serbia-montenegro',
                'north macedonia':'north-macedonia','czech republic':'czech-republic','san marino':'san-marino','türkiye':'turkiye'}
contests['Url_country'] = (contests['Country'].str.lower()).replace(replacements)

# Build the results-page URL: <event url>/<format>/results/<country>
contests['Url'] = contests['Url'] + '/' + contests['Format'] + '/results/' + contests['Url_country']

In [None]:
# Scrape the jury/televoter score tokens of every results page.
data = get_scores(contests[['Year','Url']])
# Attach the scraped score lists to their contest rows, matching on the results-page Url.
contests = contests.set_index('Url').join(data.set_index('Url'))

In [None]:
# Flatten the score lists into one row per score given
# (extract_score already returns a DataFrame).
df_scores = pd.DataFrame(extract_score(contests))
df_scores.head()     # preview the first rows

In [None]:
# Save the scores table to CSV, leaving out the DataFrame index.
df_scores.to_csv('Eurovision scores.csv',index=False)