## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import sys
from matplotlib import pyplot as plt

## Data Cleansing

In [2]:
rankings = pd.read_csv("fifa_ranking.csv")
rankings = rankings.replace({"IR Iran": "Iran"})
rankings['weighted_points'] = rankings['cur_year_avg_weighted'] + rankings['two_year_ago_weighted'] + rankings['three_year_ago_weighted']
rankings['rank_date'] = pd.to_datetime(rankings['rank_date'])

In [3]:
matches = pd.read_csv("results.csv")
matches =  matches.replace({'Germany DR': 'Germany', 'China': 'China PR'})
matches['neutral'] =  matches['neutral'].replace({'TRUE': 1, 'FALSE': 0})
matches['date'] = pd.to_datetime(matches['date'])

In [4]:
world_cup = pd.read_csv("WorldCup2018Dataset.csv")
world_cup = world_cup.dropna(how='all')
world_cup = world_cup.replace({"IRAN": "Iran", 
                               "Costarica": "Costa Rica", 
                               "Porugal": "Portugal", 
                               "Columbia": "Colombia", 
                               "Korea" : "Korea Republic"})

In [5]:
rankings = rankings.set_index(['rank_date'])\
            .groupby(['country_full'], group_keys=False)\
            .resample('D').first()\
            .fillna(method='ffill')\
            .reset_index()

## Merging Dataframes

In [6]:
matches = matches.merge(rankings, 
                        left_on=['date', 'home_team'], 
                        right_on=['rank_date', 'country_full'])
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,rank_date,...,cur_year_avg,cur_year_avg_weighted,last_year_avg,last_year_avg_weighted,two_year_ago_avg,two_year_ago_weighted,three_year_ago_avg,three_year_ago_weighted,confederation,weighted_points
0,1993-08-08,Bolivia,Uruguay,3,1,FIFA World Cup qualification,La Paz,Bolivia,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONMEBOL,0.0
1,1993-08-08,Brazil,Mexico,1,1,Friendly,Maceió,Brazil,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONMEBOL,0.0
2,1993-08-08,Ecuador,Venezuela,5,0,FIFA World Cup qualification,Quito,Ecuador,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONMEBOL,0.0
3,1993-08-08,Guinea,Sierra Leone,1,0,Friendly,Conakry,Guinea,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CAF,0.0
4,1993-08-08,Paraguay,Argentina,1,3,FIFA World Cup qualification,Asunción,Paraguay,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONMEBOL,0.0


In [7]:
matches.head().T

Unnamed: 0,0,1,2,3,4
date,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00
home_team,Bolivia,Brazil,Ecuador,Guinea,Paraguay
away_team,Uruguay,Mexico,Venezuela,Sierra Leone,Argentina
home_score,3,1,5,1,1
away_score,1,1,0,0,3
tournament,FIFA World Cup qualification,Friendly,FIFA World Cup qualification,Friendly,FIFA World Cup qualification
city,La Paz,Maceió,Quito,Conakry,Asunción
country,Bolivia,Brazil,Ecuador,Guinea,Paraguay
neutral,False,False,False,False,False
rank_date,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00


In [8]:
matches = matches.merge(rankings, 
                        left_on=['date', 'away_team'], 
                        right_on=['rank_date', 'country_full'], 
                        suffixes=('_home', '_away'))
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,rank_date_home,...,cur_year_avg_away,cur_year_avg_weighted_away,last_year_avg_away,last_year_avg_weighted_away,two_year_ago_avg_away,two_year_ago_weighted_away,three_year_ago_avg_away,three_year_ago_weighted_away,confederation_away,weighted_points_away
0,1993-08-08,Bolivia,Uruguay,3,1,FIFA World Cup qualification,La Paz,Bolivia,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONMEBOL,0.0
1,1993-08-08,Brazil,Mexico,1,1,Friendly,Maceió,Brazil,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONCACAF,0.0
2,1993-08-08,Ecuador,Venezuela,5,0,FIFA World Cup qualification,Quito,Ecuador,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONMEBOL,0.0
3,1993-08-08,Guinea,Sierra Leone,1,0,Friendly,Conakry,Guinea,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CAF,0.0
4,1993-08-08,Paraguay,Argentina,1,3,FIFA World Cup qualification,Asunción,Paraguay,False,1993-08-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONMEBOL,0.0


In [9]:
matches['rank_difference'] = matches['rank_home'] - matches['rank_away']
matches['average_rank'] = (matches['rank_home'] + matches['rank_away'])/2
matches['point_difference'] = matches['weighted_points_home'] - matches['weighted_points_away']
matches['score_difference'] = matches['home_score'] - matches['away_score']
matches['is_won'] = matches['score_difference'] > 0 # take draw as lost
matches['is_stake'] = matches['tournament'] != 'Friendly'

matches.sample(5).T

Unnamed: 0,4327,6446,15200,940,11808
date,2000-06-25 00:00:00,2003-05-21 00:00:00,2014-01-04 00:00:00,1995-07-30 00:00:00,2009-12-05 00:00:00
home_team,New Zealand,Uganda,Namibia,Angola,Kenya
away_team,Solomon Islands,Sudan,Ghana,Botswana,Ethiopia
home_score,2,0,0,4,2
away_score,0,0,1,0,0
tournament,Oceania Nations Cup,Friendly,Friendly,African Cup of Nations qualification,CECAFA Cup
city,Papeete,Kampala,Windhoek,Luanda,Nairobi
country,French Polynesia,Uganda,Namibia,Angola,Kenya
neutral,True,False,False,False,False
rank_date_home,2000-06-25 00:00:00,2003-05-21 00:00:00,2014-01-04 00:00:00,1995-07-30 00:00:00,2009-12-05 00:00:00


## Checking Data

### Check whether there was any matches held in the cities which are the venues of the 2018 world cup.

In [10]:
world_cup_2018_cities = ["Moscow", "Saint Petersburg", "Sochi", "Rostov-on-Don", "Volgograd", "Nizhny Novgorod", "Kazan", "Samara", "Saransk", "Kaliningrad", "	Yekaterinburg"]
matches[matches["city"].isin(world_cup_2018_cities)].T

Unnamed: 0,330,473,513,728,792,1029,1077,1363,1387,1554,...,17825,17826,17834,17837,17849,18050,18101,18148,18302,18460
date,1994-05-29 00:00:00,1994-09-07 00:00:00,1994-10-12 00:00:00,1995-03-29 00:00:00,1995-05-06 00:00:00,1995-10-11 00:00:00,1995-11-15 00:00:00,1996-05-29 00:00:00,1996-06-02 00:00:00,1996-08-28 00:00:00,...,2017-06-25 00:00:00,2017-06-25 00:00:00,2017-06-28 00:00:00,2017-06-29 00:00:00,2017-07-02 00:00:00,2017-10-07 00:00:00,2017-10-10 00:00:00,2017-11-11 00:00:00,2018-03-23 00:00:00,2018-06-05 00:00:00
home_team,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia,...,Chile,Germany,Portugal,Germany,Portugal,Russia,Russia,Russia,Russia,Russia
away_team,Slovakia,Germany,San Marino,Scotland,Faroe Islands,Greece,Finland,United Arab Emirates,Poland,Brazil,...,Australia,Cameroon,Chile,Mexico,Mexico,Korea Republic,Iran,Argentina,Brazil,Turkey
home_score,2,0,4,0,3,2,3,1,2,2,...,1,3,0,4,2,4,1,0,0,1
away_score,1,1,0,0,0,1,1,0,0,2,...,1,1,0,1,1,2,1,1,3,1
tournament,Friendly,Friendly,UEFA Euro qualification,UEFA Euro qualification,UEFA Euro qualification,UEFA Euro qualification,UEFA Euro qualification,Friendly,Friendly,Friendly,...,Confederations Cup,Confederations Cup,Confederations Cup,Confederations Cup,Confederations Cup,Friendly,Friendly,Friendly,Friendly,Friendly
city,Moscow,Moscow,Moscow,Moscow,Moscow,Moscow,Moscow,Moscow,Moscow,Moscow,...,Moscow,Sochi,Kazan,Sochi,Moscow,Moscow,Kazan,Moscow,Moscow,Moscow
country,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia,...,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia,Russia
neutral,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,False,False,False,False,False
rank_date_home,1994-05-29 00:00:00,1994-09-07 00:00:00,1994-10-12 00:00:00,1995-03-29 00:00:00,1995-05-06 00:00:00,1995-10-11 00:00:00,1995-11-15 00:00:00,1996-05-29 00:00:00,1996-06-02 00:00:00,1996-08-28 00:00:00,...,2017-06-25 00:00:00,2017-06-25 00:00:00,2017-06-28 00:00:00,2017-06-29 00:00:00,2017-07-02 00:00:00,2017-10-07 00:00:00,2017-10-10 00:00:00,2017-11-11 00:00:00,2018-03-23 00:00:00,2018-06-05 00:00:00


In [11]:
matches[matches["rank_date_home"] != matches["rank_date_away"]]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,rank_date_home,...,three_year_ago_avg_away,three_year_ago_weighted_away,confederation_away,weighted_points_away,rank_difference,average_rank,point_difference,score_difference,is_won,is_stake


Hence, we drop one of the duplicates and rename the other one.

In [12]:
matches = matches.drop(columns = ["rank_date_away"])
matches.columns = matches.columns.str.replace('rank_date_home', 'rank_date')

In [13]:
matches[matches['country_full_home'] != matches['home_team']]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,rank_date,...,three_year_ago_avg_away,three_year_ago_weighted_away,confederation_away,weighted_points_away,rank_difference,average_rank,point_difference,score_difference,is_won,is_stake


In [14]:
matches[matches['country_full_away'] != matches['away_team']]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,rank_date,...,three_year_ago_avg_away,three_year_ago_weighted_away,confederation_away,weighted_points_away,rank_difference,average_rank,point_difference,score_difference,is_won,is_stake


In [15]:
matches = matches.drop(columns = ["country_full_home", "country_full_away"])

In [16]:
matches.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18469,18470,18471,18472,18473,18474,18475,18476,18477,18478
date,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-09 00:00:00,1993-08-11 00:00:00,1993-08-11 00:00:00,...,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00
home_team,Bolivia,Brazil,Ecuador,Guinea,Paraguay,Peru,Zimbabwe,Guinea,Faroe Islands,Sweden,...,Belgium,Belarus,Zambia,Lesotho,England,Uruguay,Portugal,Korea Republic,Iceland,India
away_team,Uruguay,Mexico,Venezuela,Sierra Leone,Argentina,Colombia,Swaziland,Sierra Leone,Norway,Switzerland,...,Egypt,Hungary,Madagascar,Zimbabwe,Costa Rica,Uzbekistan,Algeria,Bolivia,Ghana,New Zealand
home_score,3,1,5,1,1,0,2,4,0,1,...,3,1,1,0,2,3,3,0,2,1
away_score,1,1,0,0,3,1,0,0,7,2,...,0,1,0,0,0,0,0,0,2,2
tournament,FIFA World Cup qualification,Friendly,FIFA World Cup qualification,Friendly,FIFA World Cup qualification,FIFA World Cup qualification,Friendly,Friendly,Friendly,Friendly,...,Friendly,Friendly,COSAFA Cup,COSAFA Cup,Friendly,Friendly,Friendly,Friendly,Friendly,Friendly
city,La Paz,Maceió,Quito,Conakry,Asunción,Lima,Harare,Conakry,Toftir,Borås,...,Brussels,Brest,Polokwane,Polokwane,Leeds,Montevideo,Lisbon,Innsbruck,Reykjavík,Mumbai
country,Bolivia,Brazil,Ecuador,Guinea,Paraguay,Peru,Zimbabwe,Guinea,Faroe Islands,Sweden,...,Belgium,Belarus,South Africa,South Africa,England,Uruguay,Portugal,Austria,Iceland,India
neutral,False,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,False,False,True,False,False
rank_date,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-09 00:00:00,1993-08-11 00:00:00,1993-08-11 00:00:00,...,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00


In [17]:
def isOneToOne(df, col1, col2):
    first = df.drop_duplicates([col1, col2]).groupby(col1)[col2].count().max()
    second = df.drop_duplicates([col1, col2]).groupby(col2)[col1].count().max()
    return first + second == 2

In [18]:
isOneToOne(matches, "country_abrv_home", "home_team")

True

In [19]:
isOneToOne(matches, "country_abrv_away", "away_team")

True

In [20]:
matches = matches.drop(columns = ["country_abrv_home", "country_abrv_away"])

In [21]:
matches.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18469,18470,18471,18472,18473,18474,18475,18476,18477,18478
date,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-09 00:00:00,1993-08-11 00:00:00,1993-08-11 00:00:00,...,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00
home_team,Bolivia,Brazil,Ecuador,Guinea,Paraguay,Peru,Zimbabwe,Guinea,Faroe Islands,Sweden,...,Belgium,Belarus,Zambia,Lesotho,England,Uruguay,Portugal,Korea Republic,Iceland,India
away_team,Uruguay,Mexico,Venezuela,Sierra Leone,Argentina,Colombia,Swaziland,Sierra Leone,Norway,Switzerland,...,Egypt,Hungary,Madagascar,Zimbabwe,Costa Rica,Uzbekistan,Algeria,Bolivia,Ghana,New Zealand
home_score,3,1,5,1,1,0,2,4,0,1,...,3,1,1,0,2,3,3,0,2,1
away_score,1,1,0,0,3,1,0,0,7,2,...,0,1,0,0,0,0,0,0,2,2
tournament,FIFA World Cup qualification,Friendly,FIFA World Cup qualification,Friendly,FIFA World Cup qualification,FIFA World Cup qualification,Friendly,Friendly,Friendly,Friendly,...,Friendly,Friendly,COSAFA Cup,COSAFA Cup,Friendly,Friendly,Friendly,Friendly,Friendly,Friendly
city,La Paz,Maceió,Quito,Conakry,Asunción,Lima,Harare,Conakry,Toftir,Borås,...,Brussels,Brest,Polokwane,Polokwane,Leeds,Montevideo,Lisbon,Innsbruck,Reykjavík,Mumbai
country,Bolivia,Brazil,Ecuador,Guinea,Paraguay,Peru,Zimbabwe,Guinea,Faroe Islands,Sweden,...,Belgium,Belarus,South Africa,South Africa,England,Uruguay,Portugal,Austria,Iceland,India
neutral,False,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,False,False,True,False,False
rank_date,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-09 00:00:00,1993-08-11 00:00:00,1993-08-11 00:00:00,...,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00


In [22]:
matches[matches['date'] != matches['rank_date']]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,rank_date,...,three_year_ago_avg_away,three_year_ago_weighted_away,confederation_away,weighted_points_away,rank_difference,average_rank,point_difference,score_difference,is_won,is_stake


In [23]:
matches = matches.drop(columns = ["rank_date"])

In [24]:
cat_cols = ['date', 'home_team', 'away_team', 
            "tournament", "city", "country", 
            "confederation_home", "confederation_away", 
            'is_won', 'is_stake']

from sklearn.preprocessing import LabelEncoder


for col in cat_cols:
    encoder = LabelEncoder()
    if col == "home_team" or col == "away_team":
        encoder.fit(np.concatenate([matches["home_team"], matches["away_team"]]))
        matches["home_team_ordinal"] = encoder.transform(matches["home_team"])
        matches["away_team_ordinal"] = encoder.transform(matches["away_team"])
        continue
    encoder.fit(matches[col])
    matches[f"{col}_ordinal"] = encoder.transform(matches[f"{col}"])

In [25]:
matches.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18469,18470,18471,18472,18473,18474,18475,18476,18477,18478
date,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-08 00:00:00,1993-08-09 00:00:00,1993-08-11 00:00:00,1993-08-11 00:00:00,...,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-06 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00,2018-06-07 00:00:00
home_team,Bolivia,Brazil,Ecuador,Guinea,Paraguay,Peru,Zimbabwe,Guinea,Faroe Islands,Sweden,...,Belgium,Belarus,Zambia,Lesotho,England,Uruguay,Portugal,Korea Republic,Iceland,India
away_team,Uruguay,Mexico,Venezuela,Sierra Leone,Argentina,Colombia,Swaziland,Sierra Leone,Norway,Switzerland,...,Egypt,Hungary,Madagascar,Zimbabwe,Costa Rica,Uzbekistan,Algeria,Bolivia,Ghana,New Zealand
home_score,3,1,5,1,1,0,2,4,0,1,...,3,1,1,0,2,3,3,0,2,1
away_score,1,1,0,0,3,1,0,0,7,2,...,0,1,0,0,0,0,0,0,2,2
tournament,FIFA World Cup qualification,Friendly,FIFA World Cup qualification,Friendly,FIFA World Cup qualification,FIFA World Cup qualification,Friendly,Friendly,Friendly,Friendly,...,Friendly,Friendly,COSAFA Cup,COSAFA Cup,Friendly,Friendly,Friendly,Friendly,Friendly,Friendly
city,La Paz,Maceió,Quito,Conakry,Asunción,Lima,Harare,Conakry,Toftir,Borås,...,Brussels,Brest,Polokwane,Polokwane,Leeds,Montevideo,Lisbon,Innsbruck,Reykjavík,Mumbai
country,Bolivia,Brazil,Ecuador,Guinea,Paraguay,Peru,Zimbabwe,Guinea,Faroe Islands,Sweden,...,Belgium,Belarus,South Africa,South Africa,England,Uruguay,Portugal,Austria,Iceland,India
neutral,False,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,False,False,True,False,False
rank_home,59.0,8.0,35.0,65.0,67.0,70.0,50.0,65.0,111.0,4.0,...,3.0,79.0,78.0,150.0,12.0,14.0,4.0,57.0,22.0,97.0


In [26]:
matches.columns

Index(['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'rank_home',
       'total_points_home', 'previous_points_home', 'rank_change_home',
       'cur_year_avg_home', 'cur_year_avg_weighted_home', 'last_year_avg_home',
       'last_year_avg_weighted_home', 'two_year_ago_avg_home',
       'two_year_ago_weighted_home', 'three_year_ago_avg_home',
       'three_year_ago_weighted_home', 'confederation_home',
       'weighted_points_home', 'rank_away', 'total_points_away',
       'previous_points_away', 'rank_change_away', 'cur_year_avg_away',
       'cur_year_avg_weighted_away', 'last_year_avg_away',
       'last_year_avg_weighted_away', 'two_year_ago_avg_away',
       'two_year_ago_weighted_away', 'three_year_ago_avg_away',
       'three_year_ago_weighted_away', 'confederation_away',
       'weighted_points_away', 'rank_difference', 'average_rank',
       'point_difference', 'score_difference', 'is_won', 'is_stake',
       '

In [27]:
info_cont_feat = ['rank_difference', 'average_rank', 'point_difference', 'is_stake_ordinal']
info_cate_feat = ['tournament_ordinal', 'city_ordinal', 'country_ordinal']

home_cont_feat = ['rank_home', 'total_points_home',
                  'cur_year_avg_home', 'cur_year_avg_weighted_home', 'last_year_avg_home',
                  'last_year_avg_weighted_home', 'two_year_ago_avg_home',
                  'two_year_ago_weighted_home', 'three_year_ago_avg_home',
                  'three_year_ago_weighted_home', 'weighted_points_home']
home_cate_feat = ['home_team_ordinal', 'confederation_home_ordinal']

away_cont_feat = ['rank_away', 'total_points_away',
                  'cur_year_avg_away', 'cur_year_avg_weighted_away', 'last_year_avg_away',
                  'last_year_avg_weighted_away', 'two_year_ago_avg_away',
                  'two_year_ago_weighted_away', 'three_year_ago_avg_away',
                  'three_year_ago_weighted_away', 'weighted_points_away']
away_cate_feat = ['away_team_ordinal', 'confederation_away_ordinal']

In [28]:
X = [matches[info_cont_feat].values, 
     matches[info_cate_feat].values, 
     matches[home_cont_feat].values, 
     matches[home_cate_feat].values,
     matches[away_cont_feat].values,
     matches[away_cate_feat].values]
y = matches['is_won_ordinal'].values

In [29]:
len(matches['home_team_ordinal'].unique()) == len(matches['away_team_ordinal'].unique())

True

In [30]:
len(matches['confederation_home_ordinal'].unique()) == len(matches['confederation_away_ordinal'].unique())

True

In [31]:
n_teams = len(matches['home_team_ordinal'].unique())
n_confederations = len(matches['confederation_home_ordinal'].unique())
n_tournaments = len(matches['tournament_ordinal'].unique())
n_cities = len(matches['city_ordinal'].unique())
n_countries = len(matches['country_ordinal'].unique())

In [32]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

In [33]:
class TripleTowersModel(keras.Model):
    def __init__(self, n_teams, n_confederations, n_tournaments, n_cities, n_countries,
                 embedding_size=200, dense_size=128, embed_reg=1, dense_reg=1, fc_reg=1, **kwargs):
        super(TripleTowersModel, self).__init__(**kwargs)
        self.n_teams = n_teams
        self.n_confederations = n_confederations
        self.n_tournaments = n_tournaments
        self.n_cities = n_cities
        self.n_countries = n_countries
        
        ## Embedding layers
        self.home_embedding = layers.Embedding(
            self.n_teams,
            embedding_size,
            keras.initializers.he_normal(seed=None),
            embeddings_regularizer=keras.regularizers.l2(embed_reg),
        )
        self.away_embedding = layers.Embedding(
            self.n_teams,
            embedding_size,
            keras.initializers.he_normal(seed=None),
            embeddings_regularizer=keras.regularizers.l2(embed_reg),
        )
        self.home_confederations_embedding = layers.Embedding(
            self.n_confederations,
            embedding_size,
            keras.initializers.he_normal(seed=None),
            embeddings_regularizer=keras.regularizers.l2(embed_reg),
        )
        self.away_confederations_embedding = layers.Embedding(
            self.n_confederations,
            embedding_size,
            keras.initializers.he_normal(seed=None),
            embeddings_regularizer=keras.regularizers.l2(embed_reg),
        )
        self.tournaments_embedding = layers.Embedding(
            self.n_tournaments,
            embedding_size,
            keras.initializers.he_normal(seed=None),
            embeddings_regularizer=keras.regularizers.l2(embed_reg),
        )
        self.cities_embedding = layers.Embedding(
            self.n_cities,
            embedding_size,
            keras.initializers.he_normal(seed=None),
            embeddings_regularizer=keras.regularizers.l2(embed_reg),
        )
        self.countries_embedding = layers.Embedding(
            self.n_countries,
            embedding_size,
            keras.initializers.he_normal(seed=None),
            embeddings_regularizer=keras.regularizers.l2(embed_reg),
        )
        ##  Mapping layers
        self.home_dense = layers.Dense(dense_size, name='home_dense', activation='relu', 
                                       bias_regularizer=keras.regularizers.L2(dense_reg))
        self.away_dense = layers.Dense(dense_size, name='away_dense', activation='relu', 
                                       bias_regularizer=keras.regularizers.L2(dense_reg))
        self.info_dense = layers.Dense(dense_size, name='info_dense', activation='relu', 
                                       bias_regularizer=keras.regularizers.L2(dense_reg))
        self.joint_dense = layers.Dense(dense_size, name='joint_dense', activation='relu', 
                                       bias_regularizer=keras.regularizers.L2(dense_reg))
        self.fc1 = layers.Dense(round(dense_size), name='fc1', activation='relu', 
                                       bias_regularizer=keras.regularizers.L2(fc_reg))
        self.fc2 = layers.Dense(round(dense_size/2), name='fc2', activation='relu', 
                                       bias_regularizer=keras.regularizers.L2(fc_reg))
        self.fc3 = layers.Dense(round(dense_size/4), name='fc3', activation='relu', 
                                       bias_regularizer=keras.regularizers.L2(fc_reg))
        self.out_dense = layers.Dense(1, name='out_dense', activation='sigmoid')

    def call(self, inputs):
        ## cate/cont data
        info_cont_feat, info_cate_feat, home_cont_feat, home_cate_feat, away_cont_feat, away_cate_feat = inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5]

        ## embedding
        home_vec = self.home_embedding(home_cate_feat[:,0])
        home_conf_vec = self.home_confederations_embedding(home_cate_feat[:,1])
        away_vec = self.away_embedding(away_cate_feat[:,0])
        away_conf_vec = self.away_confederations_embedding(away_cate_feat[:,1])
        tour_vec = self.tournaments_embedding(info_cate_feat[:,0])
        city_vec = self.cities_embedding(info_cate_feat[:,1])
        cont_vec = self.countries_embedding(info_cate_feat[:,2])

        ## dense mapping
        home_all_vec = layers.Concatenate()([home_cont_feat, home_vec, home_conf_vec])
        item_all_vec = layers.Concatenate()([away_cont_feat, away_vec, away_conf_vec])
        info_all_vec = layers.Concatenate()([away_cont_feat, away_vec, away_conf_vec])

        home_dense_vec = self.home_dense(home_all_vec)
        away_dense_vec = self.away_dense(item_all_vec)
        info_dense_vec = self.info_dense(info_all_vec)

        ## joint dense
        joint_vec = layers.Concatenate()([home_dense_vec, away_dense_vec, info_dense_vec])
        fc1_vec = self.joint_dense(joint_vec)
        fc2_vec = self.fc1(fc1_vec)
        fc3_vec = self.fc2(fc2_vec)
        out = self.out_dense(fc3_vec)
        return out

In [34]:
from sklearn.model_selection import KFold
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

class TripleTowersModel_GridSearchKFoldCV(object):
    def __init__(self, n_teams, n_confederations, n_tournaments, n_cities, cv=5, 
                 embed_regs=[1e-3, 1e-2], 
                 dense_regs=[1e-3, 1e-2], 
                 fc_regs=[1e-3, 1e-2],
                 embedding_sizes=[150, 200, 250, 300], 
                 dense_sizes=[150, 200, 250, 300],
                 lrs=[1e-4, 1e-3, 1e-2], batches=[64,128,256]):
        self.n_teams = n_teams
        self.n_confederations = n_confederations
        self.n_tournaments = n_tournaments
        self.n_cities = n_cities
        self.n_countries = n_countries
        self.cv = cv
        self.embed_regs = embed_regs
        self.dense_regs = dense_regs
        self.fc_regs = fc_regs
        self.embedding_sizes = embedding_sizes
        self.dense_sizes = dense_sizes
        self.lrs = lrs
        self.batches = batches
        self.best_model = {}
        self.cv_result = {'embedding_size': [], 'dense_size': [], 
                          'embed_reg': [], 'dense_reg': [], 'fc_reg': [], 
                          'lr': [], 'batch': [], 'train_auc': [], 'valid_auc': []}

    def grid_search(self, train_input, train_rating):
        ## generate all combinations
        kf = KFold(n_splits=self.cv, shuffle=True)
        for (embedding_size, dense_size, embed_reg, dense_reg, fc_reg, lr, batch) in itertools.product(self.embedding_sizes, self.dense_sizes, self.embed_regs, self.dense_regs, self.fc_regs, self.lrs, self.batches):
            train_auc_tmp, valid_auc_tmp = 0., 0.
            for train_index, valid_index in kf.split(train_input[1]):
                # produce training/validation sets
                train_input_cv = []
                valid_input_cv = []
                for i in range(6):
                    train_input_cv.append(train_input[i][train_index])
                train_rating_cv = train_rating[train_index]
                for i in range(6):
                    valid_input_cv.append(train_input[i][valid_index])
                valid_rating_cv = train_rating[valid_index]
                # fit the model based on CV data
                model = TripleTowersModel(self.n_teams, self.n_confederations, self.n_tournaments, self.n_cities, self.n_countries, 
                                          embedding_size=embedding_size, dense_size=dense_size,
                                          embed_reg=embed_reg, dense_reg=dense_reg, fc_reg=fc_reg)
                metrics = [keras.metrics.AUC(name='auc')]

                model.compile(optimizer=keras.optimizers.Adam(lr), loss=tf.keras.losses.BinaryCrossentropy(), metrics=metrics)

                callbacks = [keras.callbacks.EarlyStopping(monitor='val_auc', min_delta=0, patience=5, verbose=1, 
                                                           mode='auto', baseline=None, restore_best_weights=True)]

                history = model.fit(x=train_input_cv, y=train_rating_cv, batch_size=batch, epochs=100, verbose=1, callbacks=callbacks, validation_data=(valid_input_cv, valid_rating_cv))

                train_auc_tmp_cv = history.history["auc"][-1]
                valid_auc_tmp_cv = history.history["val_auc"][-1]
                train_auc_tmp = train_auc_tmp + train_auc_tmp_cv / self.cv
                valid_auc_tmp = valid_auc_tmp + valid_auc_tmp_cv / self.cv
                print(f'{self.cv}-Fold CV for embedding size: {embedding_size}; dense size: {dense_size}; embed reg: {embed_reg}; dense reg: {dense_reg}; fc reg: {fc_reg}; learning rate: {lr}; batch size: {batch}, train_auc: {train_auc_tmp_cv}, valid_auc: {valid_auc_tmp_cv}')
            self.cv_result['embedding_size'].append(embedding_size)
            self.cv_result['dense_size'].append(dense_size)
            self.cv_result['embed_reg'].append(embed_reg)
            self.cv_result['dense_reg'].append(dense_reg)
            self.cv_result['fc_reg'].append(fc_reg)
            self.cv_result['lr'].append(lr)
            self.cv_result['batch'].append(batch)
            self.cv_result['train_auc'].append(train_auc_tmp)
            self.cv_result['valid_auc'].append(valid_auc_tmp)
        self.cv_result = pd.DataFrame.from_dict(self.cv_result)
        best_ind = self.cv_result['valid_auc'].argmin()
        self.best_model = self.cv_result.loc[best_ind]
        
    def plot_grid(self, data_source='valid'):
        sns.set_theme()
        if data_source == 'train':
            cv_pivot = self.cv_result.pivot("embedding_size", "dense_size", "embed_reg", "dense_reg", "fc_reg",
                                            "lr", "batch", "train_auc")
        elif data_source == 'valid':
            cv_pivot = self.cv_result.pivot("embedding_size", "dense_size", "embed_reg", "dense_reg", "fc_reg",
                                            "lr", "batch", "valid_auc")
        else:
            raise ValueError('data_source must be train or valid!')
        sns.heatmap(cv_pivot, annot=True, fmt=".3f", linewidths=.5, cmap="YlGnBu")
        plt.show()

In [None]:
ttm_cv = TripleTowersModel_GridSearchKFoldCV(n_teams, n_confederations, n_tournaments, n_cities, n_countries)

ttm_cv.grid_search(X, y)

2023-03-22 00:45:12.053177: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
212-Fold CV for embedding size: 150; dense size: 150; embed reg: 0.001; dense reg: 0.001; fc reg: 0.001; learning rate: 0.0001; batch size: 64, train_auc: 0.7447144985198975, valid_auc: 0.8243383765220642
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
212-Fold CV for embedding size: 150; dense size: 150; embed reg: 0.001; dense reg: 0.001; fc reg: 0.001; learning rate: 0.0001; batch size: 64, train_auc: 0.7376014590263367, valid_auc: 0.7194616794586182
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
212-Fold CV for embedding size: 150; dense size: 150; embed reg: 0.001; dense reg: 0.001; fc reg: 0.001; learning rate: 0.0001; batch size: 64, train_auc: 0.7464641332626343, valid_auc: 0.723023533821106
E

In [None]:
model = TripleTowersModel(n_teams, n_confederations, n_tournaments, n_cities, n_countries, 
                          embedding_size=150, dense_size=64, embed_reg=1e-1, dense_reg=1e-1, fc_reg=1e-1)

In [None]:
metrics = [keras.metrics.AUC(name='AUC')]

model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=metrics)

callbacks = [keras.callbacks.EarlyStopping(monitor='AUC', min_delta=0, patience=5, verbose=1, 
                                           mode='auto', baseline=None, restore_best_weights=True)]
history = model.fit(x=X, y=y, batch_size=128, epochs=100, verbose=1, callbacks=callbacks)


In [None]:
y_pred = model.predict(X)

In [None]:
def plot_roc(y_true, y_prob, n_threshold=20):
    thresholds = np.linspace(1, 0, n_threshold)
    tpr = []
    fpr = []
    for threshold in thresholds:
        tp = ((y_true == 1) & (y_prob >= threshold)).sum()
        fp = ((y_true == 0) & (y_prob >= threshold)).sum()
        tn = ((y_true == 0) & (y_prob < threshold)).sum()
        fn = ((y_true == 1) & (y_prob < threshold)).sum()
        tpr.append(tp / (tp + fn))
        fpr.append(fp / (fp + tn))
    print(f'{tpr}, {fpr}')
    auc = np.trapz(tpr,fpr)
    plt.plot(fpr,tpr,label='AUC = ' + str(round(auc,3)))
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()
    return tpr, fpr

In [None]:
plot_roc(y, y_pred)