# FIFA World Rankings
## Part 1 - Data Acquisition
- Using Python's Selenium package to perform web scraping
___

#### Import necessary dependencies and start the web driver

In [1]:
from bs4 import BeautifulSoup
import urllib
import re
import time
import pandas as pd
import json
from datetime import datetime, date, timedelta
import numpy as np
import pandas as pd
from datetime import datetime as dt
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import validators

# Show up to 5000 rows when a DataFrame is displayed
pd.options.display.max_rows = 5000

home_page = 'https://www.fifa.com/fifa-world-ranking/ranking-table/men/'

# Timeout (in seconds) passed to the Selenium waits used in this notebook
waittime = 0.5

# Shut down any WebDriver left over from a previous run of this cell.
# quit() ends the whole browser session, whereas close() only closes the
# current window and can leave the driver process running.
try:
    driver.quit()
except Exception:
    # Best effort: `driver` is not defined yet on the first run (NameError),
    # and a previous session may already be gone.
    pass

# Set webdriver options
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('ignore-certificate-errors')

# Initiate webdriver
driver = webdriver.Chrome(options=options)

___
#### Load the home page and accept cookies

In [2]:
# Get driver to retrieve homepage
driver.get(home_page)

# Let element lookups keep retrying for up to `waittime` seconds before
# failing (this sets a lookup timeout; it does not pause by itself)
driver.implicitly_wait(waittime)

# Accept the cookie banner if one is shown. Uses find_element(By.XPATH, ...)
# because the find_element_by_* helpers were removed in Selenium 4.3.
try:
    driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
except Exception:
    # Best effort: no banner (or a banner that cannot be clicked) is fine.
    # `except Exception` rather than a bare `except` so Ctrl-C still works.
    pass

___
#### Get list of ranking dates and corresponding URLs

In [3]:
# Get list of dates where rankings are available
# (find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
# removed in Selenium 4.3)
date_elems = driver.find_elements(By.XPATH, "//li[@class='fi-ranking-schedule__nav__item']/a")

# One record per ranking date: the link text and the URL it points to.
# Built with a comprehension so no loop variable named `date` is left behind
# to shadow the `date` class imported from datetime.
date_list = [
    {'date': elem.get_attribute("text"), 'url': elem.get_attribute("href")}
    for elem in date_elems
]

date_df = pd.DataFrame(date_list)
date_df

Unnamed: 0,date,url
0,27 May 2021,https://www.fifa.com/fifa-world-ranking/rankin...
1,07 April 2021,https://www.fifa.com/fifa-world-ranking/rankin...
2,18 February 2021,https://www.fifa.com/fifa-world-ranking/rankin...
3,10 December 2020,https://www.fifa.com/fifa-world-ranking/rankin...
4,26 November 2020,https://www.fifa.com/fifa-world-ranking/rankin...
5,22 October 2020,https://www.fifa.com/fifa-world-ranking/rankin...
6,17 September 2020,https://www.fifa.com/fifa-world-ranking/rankin...
7,16 July 2020,https://www.fifa.com/fifa-world-ranking/rankin...
8,11 June 2020,https://www.fifa.com/fifa-world-ranking/rankin...
9,09 April 2020,https://www.fifa.com/fifa-world-ranking/rankin...


#### Define utility functions

In [4]:
# Get pagination pages of each date page
def get_pagination_pages():
    """Return the text of every pagination link on the page currently loaded.

    Reads from the module-level ``driver``. The links are the ``<a>`` elements
    directly under the ``<li>`` items of ``<ul class="pagination">``.

    Returns:
        list: The ``text`` attribute of each matched link (empty list when
        no pagination links are found).
    """
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3
    pagination_elems = driver.find_elements(By.XPATH, "//ul[@class='pagination']/li/a")
    return [elem.get_attribute("text") for elem in pagination_elems]

In [5]:
# Define function to scrape current page
def scrape_current_page(page_num, date):
    """Click a pagination link and scrape the table rows shown afterwards.

    Reads from the module-level ``driver``.

    Args:
        page_num: Link text of the pagination control to click.
        date: Value stored in the ``date`` column of every scraped row.

    Returns:
        pandas.DataFrame: Columns ``rank``, ``country``, ``points`` and
        ``date``, one row per ``<tr>`` found under a ``<tbody>``.

    Raises:
        IndexError: If the text of a row has fewer than three lines.
    """
    columns = ['rank', 'country', 'points', 'date']

    # find_element(By.LINK_TEXT, ...) / find_elements(By.XPATH, ...) replace
    # the find_element(s)_by_* helpers, which were removed in Selenium 4.3
    driver.find_element(By.LINK_TEXT, page_num).click()
    row_elems = driver.find_elements(By.XPATH, "//tbody/tr")

    rows = []
    for elem in row_elems:
        # Read and split the row text once instead of once per field
        fields = elem.text.split('\n')
        rows.append({
            'rank': fields[0],
            'country': fields[1],
            # Only the first space-separated token of the third line is kept
            'points': fields[2].split(' ')[0],
            'date': date,
        })

    # Build the frame in one step: DataFrame.append copied the whole frame on
    # every call and no longer exists in pandas 2.0
    return pd.DataFrame(rows, columns=columns)

___
#### Perform iterative web scraping on every page

In [6]:
# Get list of dates where rankings are available
# (find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
# removed in Selenium 4.3)
date_elems = driver.find_elements(By.XPATH, "//li[@class='fi-ranking-schedule__nav__item']/a")

# One record per ranking date: the link text and the URL it points to.
# Built with a comprehension so no loop variable named `date` is left behind
# to shadow the `date` class imported from datetime.
date_list = [
    {'date': elem.get_attribute("text"), 'url': elem.get_attribute("href")}
    for elem in date_elems
]

date_df = pd.DataFrame(date_list)
date_df

Unnamed: 0,date,url
0,27 May 2021,https://www.fifa.com/fifa-world-ranking/rankin...
1,07 April 2021,https://www.fifa.com/fifa-world-ranking/rankin...
2,18 February 2021,https://www.fifa.com/fifa-world-ranking/rankin...
3,10 December 2020,https://www.fifa.com/fifa-world-ranking/rankin...
4,26 November 2020,https://www.fifa.com/fifa-world-ranking/rankin...
5,22 October 2020,https://www.fifa.com/fifa-world-ranking/rankin...
6,17 September 2020,https://www.fifa.com/fifa-world-ranking/rankin...
7,16 July 2020,https://www.fifa.com/fifa-world-ranking/rankin...
8,11 June 2020,https://www.fifa.com/fifa-world-ranking/rankin...
9,09 April 2020,https://www.fifa.com/fifa-world-ranking/rankin...


In [7]:
# Build the master dataframe holding the rankings of every date
ranking_columns = ['rank', 'country', 'points', 'date']

# Per-page frames are collected here and concatenated once at the end;
# concatenating inside the loop would re-copy all earlier rows on every page
page_frames = []

# The implicit wait is a driver-wide setting, so set it once before the loop
driver.implicitly_wait(waittime)

for _, row in date_df.iterrows():
    # Named `ranking_date` so the `date` class imported from datetime is not shadowed
    ranking_date = row['date']

    driver.get(row['url'])

    # Accept the cookie banner if one is shown. Uses find_element(By.XPATH, ...)
    # because the find_element_by_* helpers were removed in Selenium 4.3.
    try:
        driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
    except Exception:
        # Best effort: no banner is fine. `except Exception` rather than a
        # bare `except` so Ctrl-C can still interrupt this long-running loop.
        pass

    # Wait for pagination load
    element_present = EC.presence_of_element_located((By.ID, 'rank-table_paginate'))
    WebDriverWait(driver, waittime).until(element_present)

    # Scrape for every pagination page
    for page_num in get_pagination_pages():
        page_frames.append(scrape_current_page(page_num, ranking_date))

if page_frames:
    # ignore_index=True yields the same 0..n-1 index as reset_index(drop=True)
    master_df = pd.concat(page_frames, ignore_index=True)
else:
    # Nothing scraped: keep an empty frame with the expected columns
    master_df = pd.DataFrame(columns=ranking_columns)

___
#### View output

In [10]:
# Total number of rows collected in master_df
len(master_df)

63064

In [59]:
# Preview the first 20 rows of master_df
master_df.head(20)

Unnamed: 0,rank,country,points,date
0,1,Belgium,1783.38,27 May 2021
1,2,France,1757.3,27 May 2021
2,3,Brazil,1742.65,27 May 2021
3,4,England,1686.78,27 May 2021
4,5,Portugal,1666.12,27 May 2021
5,6,Spain,1648.13,27 May 2021
6,7,Italy,1642.06,27 May 2021
7,8,Argentina,1641.95,27 May 2021
8,9,Uruguay,1639.08,27 May 2021
9,10,Denmark,1631.55,27 May 2021


#### Export data

In [58]:
# Both statements below are commented out in this notebook: the first writes
# master_df to a CSV file (without the index), the second reads that file back.
# master_df.to_csv('FIFA_World_Rankings_Extracted_20210610.csv', index=False)
# master_df = pd.read_csv('FIFA_World_Rankings_Extracted_20210610.csv')