# Intro
In this notebook we will create a model to predict match results for the UEFA Euro 2021 championship. The data set consists of historic results from past matches and the FIFA rankings.

# Data collection
Historic results are already available as a dataset. FIFA rankings will be scraped off the web using code from [this GitHub project](https://github.com/cnc8/fifa-world-ranking).   

Let's start by importing some libraries.

In [1]:
import datetime
import asyncio
import aiohttp
import requests as r
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, SoupStrainer

# for run asyncio in jupyter / https://github.com/jupyter/notebook/issues/3397
import nest_asyncio
nest_asyncio.apply()

In [2]:
date_id = 'id1'  
fifa_url = 'https://www.fifa.com/fifa-world-ranking/ranking-table/men/rank'

def get_dates_html():
    page_source = r.get(f'{fifa_url}/{date_id}/')
    page_source.raise_for_status()
    dates = BeautifulSoup(page_source.text, 
                          'html.parser', 
                          parse_only=SoupStrainer('li', attrs={'class': 'fi-ranking-schedule__nav__item'}))
    return dates


def create_dates_dataset(html_dates):
    date_ids = [li['data-value'] for li in html_dates]
    dates = [li.text.strip() for li in html_dates]
    dataset = pd.DataFrame(data={'date': dates, 'date_id': date_ids})
    
    # convert 'date' from str to datetime and sorting "old -> new"
    dataset['date'] = pd.to_datetime(dataset['date'], format='%d %B %Y')
    dataset.sort_values('date', ignore_index=True, inplace=True)
    assert dataset.date.min() == dataset.iloc[0].date, \
            "Incorrect dataset sorting"
    
    return dataset

dates_from_page = get_dates_html()
dates_dataset = create_dates_dataset(dates_from_page)

assert len(dates_from_page) == dates_dataset.shape[0], \
        "Number of dates in html and dataset don't match"

async def get_rank_page(date_id, session):
    async with session.get(f'{fifa_url}/{date_id}/') as response:
        page = await response.text()
        if response.status == 200:
            return {'page': page, 'id': date_id}
        else:
            print(f'Parse error, page: {response.url}')
            return False
        
        
def scrapy_rank_table(page, date):
    rows = BeautifulSoup(page, 
                          'html.parser', 
                          parse_only=SoupStrainer('tbody')).find_all('tr')
    table = []
    for row in rows:
        table.append({
            'id': int(row['data-team-id']), 
            'country_full': row.find('span', {'class': 'fi-t__nText'}).text, 
            'country_abrv': row.find('span', {'class': 'fi-t__nTri'}).text,
            'rank': int(row.find('td', {'class': 'fi-table__rank'}).text), 
            #'total_points': int(row.find('td', {'class': 'fi-table__points'}).text),
            'total_points': row.find('td', {'class': 'fi-table__points'}).text,
            #'previous_points': int(row.find('td', {'class': 'fi-table__prevpoints'}).text or 0),
            'previous_points': row.find('td', {'class': 'fi-table__prevpoints'}).text or 0,
            'rank_change': int(row.find('td', {'class': 'fi-table__rankingmovement'}).text.replace('-', '0')),
            'confederation': row.find('td', {'class': 'fi-table__confederation'}).text.strip('#'),
            'rank_date': date
        })
    return table
    

async def parse_ranks(pages_df):
    fifa_ranking = pd.DataFrame(columns=[
        'id', 'rank', 'country_full', 'country_abrv', 
        'total_points', 'previous_points', 'rank_change', 
        'confederation', 'rank_date'
    ])

    start_time = datetime.datetime.now()
    print("Start parsing.. ", datetime.datetime.now()-start_time)
    
    task_parse = []
    async with aiohttp.ClientSession() as session:
        for date_id in pages_df.date_id.to_list():
            task_parse += [asyncio.create_task(get_rank_page(date_id, session))]
    
        for task in asyncio.as_completed(task_parse):
            page = await task
            if not task:
                continue
            date_ranking = scrapy_rank_table(page['page'], 
                                             pages_df[pages_df.date_id == page['id']].date.iloc[0])
            fifa_ranking = fifa_ranking.append(date_ranking, ignore_index=True)

            if fifa_ranking.rank_date.nunique() % 50 == 0:
                print(f'Complite {fifa_ranking.rank_date.nunique()}/{pages_df.shape[0]} dates')
    
    fifa_ranking.sort_values('rank_date', ignore_index=True, inplace=True)
    print(f'Parsing complite. Time {datetime.datetime.now()-start_time}')
    return fifa_ranking


def data_correction(df):
    """ Handmade """
    # Lebanon has two abbreviations
    df.replace({'country_abrv': 'LIB'}, 'LBN', inplace=True)
    # Montenegro duplicates
    df.drop(df[df.id == 1903356].index, inplace=True)
    # North Macedonia has two full names
    df.replace({'country_full': 'FYR Macedonia'}, 'North Macedonia', inplace=True)
    # Cabo Verde has two full names
    df.replace({'country_full': 'Cape Verde Islands'}, 'Cabo Verde', inplace=True)
    # Saint Vincent and the Grenadines have two full names
    df.replace({'country_full': 'St. Vincent and the Grenadines'}, 'St. Vincent / Grenadines', inplace=True)
    # Swaziland has two full names
    df.replace({'country_full': 'Eswatini'}, 'Swaziland', inplace=True)
    # Curacao transform to Curaçao (with 'ç')
    df.replace({'country_full': 'Curacao'}, 'Curaçao', inplace=True)
    # São Tomé and Príncipe have three full names
    df.replace({'country_full': ['Sao Tome e Principe', 'São Tomé e Príncipe']}, 
               'São Tomé and Príncipe', inplace=True)
    return df


def check_data(ranks_df, dates_df):
    if ranks_df.rank_date.nunique() != dates_df.date.nunique():
        print("Warning! Numbers of rank dates don't match")
    if ranks_df.country_full.nunique() != ranks_df.country_abrv.nunique():
        print("Warning! Number of names and abbreviations does not match")
    if ranks_df.country_full.nunique() != ranks_df.id.nunique():
        print("Warning! Number of names and IDs does not match")
        

def save_as_csv(df):
    df.to_csv(
        f'fifa_ranking-{df.rank_date.max().date()}.csv',
        index=False, 
        encoding='utf-8'
    )
    print('Dataframe saved in currently folder')

HTTPError: 401 Client Error: Unauthorized for url: https://www.fifa.com/fifa-world-ranking/men

Now, get FIFA rankings using Beautiful Soup.

In [4]:
fifa_ranking_df = asyncio.run(parse_ranks(dates_dataset))
fifa_ranking_df = data_correction(fifa_ranking_df)
check_data(fifa_ranking_df, dates_dataset)
save_as_csv(fifa_ranking_df)

fifa_ranking_df.tail()

Start parsing..  0:00:00.000010
Complite 50/312 dates
Complite 100/312 dates
Complite 150/312 dates
Complite 200/312 dates
Complite 250/312 dates
Complite 300/312 dates
Parsing complite. Time 0:02:54.953653
Dataframe saved in currently folder


Unnamed: 0,id,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
63059,43829,139,Myanmar,MYA,1081.26,1081.26,0,AFC,2021-05-27
63060,43955,138,Latvia,LVA,1081.66,1081.66,0,UEFA,2021-05-27
63061,43886,137,Tanzania,TAN,1088.05,1088.05,0,CAF,2021-05-27
63062,43981,161,Tahiti,TAH,1014.27,1014.27,0,OFC,2021-05-27
63063,43942,4,England,ENG,1686.78,1686.78,0,UEFA,2021-05-27


## Define participants

In [5]:
teams = [['Italy', 'ITA', 'A'],
        ['Switzerland', 'SUI', 'A'],
        ['Turkey', 'TUR', 'A'],
        ['Wales', 'WAL', 'A'],
        ['Belgium', 'BEL', 'B'],
        ['Denmark', 'DEN', 'B'],
        ['Finland', 'FIN', 'B'],
        ['Russia', 'RUN', 'B'],
        ['Austria', 'AUT', 'C'],
        ['Netherlands', 'NED', 'C'],
        ['North Macedonia', 'MKD', 'C'],
        ['Ukraine', 'UKR', 'C'],
        ['Croatia', 'CRO', 'D'],
        ['Czech Republic', ' CZE', 'D'],
        ['England', 'ENG', 'D'],
        ['Scotland', 'SCO', 'D'],
        ['Poland', 'POL', 'E'],
        ['Slovakia', 'SVK', 'E'],
        ['Spain', 'ESP', 'E'],
        ['Sweden', 'SWE', 'E'],
        ['France', 'FRA', 'F'],
        ['Germany', 'GER', 'F'],
        ['Hungary', 'HUN', 'F'],
        ['Portugal', 'POR', 'F']]
teams_df = pd.DataFrame(teams, columns=['Country', 'Abrv', 'Group'])
teams_df.head(4)

Unnamed: 0,Country,Abrv,Group
0,Italy,ITA,A
1,Switzerland,SUI,A
2,Turkey,TUR,A
3,Wales,WAL,A


## Filter out relevant historic match data
There are several choices to be made here. Include only matches between participants, or use all matches with at least one participant country - here we go for the former. The FIFA ranking is from December 1992, so we will filter out matches before that date since ranking will be an important feature in the model we will create.

In [6]:
match_df = pd.read_csv('../input/international-football-results-from-1872-to-2017/results.csv')
# only matches from 1993
match_df = match_df[match_df.date > '1993-01-01']
# only matches between playing teams
match_df = match_df[match_df.home_team.isin(teams_df.Country.to_list())]
match_df = match_df[match_df.away_team.isin(teams_df.Country.to_list())]
match_df.reset_index(drop=True, inplace=True)
# drop city column
match_df.drop(labels='city', axis=1, inplace=True)
print('Number of matches after filtering: {}'.format(len(match_df)))

Number of matches after filtering: 1653


# Feature creation
Next we will add some features to the match data, which is FIFA ranking for the home and away teams. There are many other features we could add, such as historic weather data etc.   

For each match the latest rank for each team is added to the dataframe.

In [7]:
MATCHES = len(match_df)

home_rank = np.zeros(MATCHES, dtype=np.int)
away_rank = np.zeros(MATCHES, dtype=np.int)
home_total_points = np.zeros(MATCHES, dtype=np.float)
away_total_points = np.zeros(MATCHES, dtype=np.float)
for i in range(MATCHES):
    home_listing = fifa_ranking_df[((fifa_ranking_df.country_full == match_df.iloc[i].home_team) & 
                            (fifa_ranking_df.rank_date <= match_df.iloc[i].date))].sort_values(by='rank_date', ascending=False)
    
    try:
        home_rank[i] = int(home_listing.iloc[0]['rank'])
    except:
        home_rank[i] = 155
        
    away_listing = fifa_ranking_df[((fifa_ranking_df.country_full == match_df.iloc[i].away_team) & 
                            (fifa_ranking_df.rank_date <= match_df.iloc[i].date))].sort_values(by='rank_date', ascending=False)
        
    try:
        away_rank[i] = int(away_listing.iloc[0]['rank'])
    except:
        away_rank[i] = 155

Then we drop a few columns that will not be used.

In [8]:
match_df['home_rank'] = home_rank
match_df['away_rank'] = away_rank
match_df['friendly'] = (match_df.tournament == 'Friendly')
match_df.drop(labels=['tournament', 'date', 'country'], axis=1, inplace=True)
match_df.neutral = match_df.neutral.astype(int)
match_df.friendly = match_df.neutral.astype(int)

In [9]:
match_df.tail()

Unnamed: 0,home_team,away_team,home_score,away_score,neutral,home_rank,away_rank,friendly
1648,Netherlands,Scotland,2,2,1,16,44,1
1649,Italy,Czech Republic,4,0,0,7,40,0
1650,Spain,Portugal,0,0,0,6,5,0
1651,Austria,Slovakia,0,0,0,23,36,0
1652,Belgium,Croatia,1,0,0,1,14,0


## Convert categorical features to onehot
Country names are strings which are not suitable input to most models, so here we convert them to onehot format.

In [10]:
X = match_df[['home_team', 'away_team', 'neutral', 'home_rank', 'away_rank', 'friendly']]
y1 = match_df['home_score']
y2 = match_df['away_score']

onehot_columns = ['home_team', 'away_team']
onehot_df = X[onehot_columns]
onehot_df = pd.get_dummies(onehot_df, columns = onehot_columns)
match_onehot_drop = X.drop(onehot_columns, axis = 1)
match_onehot = pd.concat([match_onehot_drop, onehot_df], axis = 1)
match_onehot.head()

Unnamed: 0,neutral,home_rank,away_rank,friendly,home_team_Austria,home_team_Belgium,home_team_Croatia,home_team_Czech Republic,home_team_Denmark,home_team_England,...,away_team_Portugal,away_team_Russia,away_team_Scotland,away_team_Slovakia,away_team_Spain,away_team_Sweden,away_team_Switzerland,away_team_Turkey,away_team_Ukraine,away_team_Wales
0,0,7,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,33,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,23,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,34,19,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,9,15,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


# Train XGBoost model
We will train a XGBoost regressor model, or rather two of them, to predict match results. One model to predict the home team score and one model for the away team score. It is also possible to use [sklearn.multioutput.MultiOutputRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html) to combine the two models.

In [11]:
from xgboost import XGBRegressor

# home team score model
hmodel = XGBRegressor()
hmodel.fit(match_onehot.values, y1.values)
#away team score model
amodel = XGBRegressor()
amodel.fit(match_onehot.values, y2.values)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

# Predict
Now the fun part! Let's predict the match results for the first week's matches, starting tomorrow! We need to check careful where the matches are played, so we get the "neutral ground" feature set right, and we also put the home team first in the function below.

In [12]:
def predict(h_country, a_country, neutral=True):
    # create vector
    cols = ['neutral', 'home_rank', 'away_rank', 'friendly']
    for c in onehot_df.columns.to_list():
        cols.append(c)
    df = pd.DataFrame(np.zeros((1,len(cols)), dtype=int), columns=cols)
    if neutral:
        df.neutral.iloc[0] = 1
    else:
        df.neutral.iloc[0] = 0
    df.home_rank.iloc[0] = fifa_ranking_df[((fifa_ranking_df.rank_date == '2021-05-27') & (fifa_ranking_df.country_full == h_country))]['rank'].values[0]
    df.away_rank.iloc[0] = fifa_ranking_df[((fifa_ranking_df.rank_date == '2021-05-27') & (fifa_ranking_df.country_full == a_country))]['rank'].values[0]
    df['home_team_'+h_country].iloc[0] = 1
    df['away_team_'+a_country].iloc[0] = 1
    #df = df[hmodel.get_booster().feature_names]
    # predict
    hscore = int(hmodel.predict(df.iloc[0].to_numpy().reshape(1,52))[0])
    ascore = int(amodel.predict(df.iloc[0].to_numpy().reshape(1,52))[0])
    return hscore, ascore

In [13]:
predict('Italy', 'Turkey', False)

(1, 0)

In [14]:
predict('Wales', 'Switzerland')

(1, 1)

In [15]:
predict('Denmark', 'Finland', False)

(1, 0)

In [16]:
predict('Russia', 'Belgium', False)

(1, 3)

In [17]:
predict('England', 'Croatia', False)

(1, 1)

In [18]:
predict('Austria', 'North Macedonia')

(0, 0)

In [19]:
predict('Netherlands', 'Ukraine', False)

(2, 1)

In [20]:
predict('Scotland', 'Czech Republic', False)

(1, 0)

In [21]:
predict('Poland', 'Slovakia')

(0, 0)

In [22]:
predict('Spain', 'Sweden', False)

(3, 0)

In [23]:
predict('Hungary', 'Portugal', False)

(0, 2)

In [24]:
predict('Germany', 'France', False)

(0, 1)

# Summary
We have seen how a very simple match predicting model can be trained with XGBoost. I do not expect accuracy to be very high here, but it is a starting point for making more advanced models with more features.

## 