# World cup 2018 model

Importerer pakker

In [75]:
import json
import requests
import re
import pickle
import itertools
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pymc3 as pm
import collections
from pathlib import Path
from typing import List
import math
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from IPython.display import HTML


import menon_styles



%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
pd.set_option('float_format', '{:.2f}'.format)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
menon_styles.menon_styles()

Definerer er test-parameter for å slå testene av og på

In [4]:
TEST = True

## Teams and fixtures

### Skraper fixture

In [108]:
def get_fixture(use_pickle=True):
    pickle_file = Path('../data/fixture.pkl')
    if use_pickle:
        try:
            games_df = pd.read_pickle(pickle_file)
        except FileNotFoundError:
            games_df = get_fixture(use_pickle=False)
    else:        
        r = requests.get('http://www.skysports.com/football/news/12098/11154890/world-cup-fixtures-the-full-schedule-for-russia-2018')
        soup = BeautifulSoup(r.text, 'lxml')
        outer = soup.find("div", {"class": "article__body article__body--lead"})
        games = [p.text for p in outer.findAll("p") if ' v ' in p.text]
        def get_info(game):
            info = {}
            game = game.replace('Tues', 'Tue')#.replace('South Korea', 'Korea')
            date, rest = game.split(':')
            teams, rest = rest.strip().split(' - ', maxsplit=1)
            home, teams_rest = teams.split(' v ')
            away = teams_rest.split(' (')[0]
            try:
                group = re.search(r'\((.*)\)', teams).group(1)
            except AttributeError:
                try:
                    group = re.findall(r'\((.*)\)', game)[-1]
                except IndexError:
                    group = ''
            #group = re.search(r'([(Group ?)|(Match ?)][A-Z\d]{1,2})', game).group(0)
            return dict(date=pd.to_datetime(date + ' 2018').date(), home=home, away=away, group=group)
        games_df = pd.DataFrame([get_info(game) for game in games])
        games_df.loc[games_df['group'] == 'Luzhniki), 3pm (Match 51', 'group'] = 'Match 51'
        games_df.loc[games_df['group'] == 'Spartak), 7pm (Match 56', 'group'] = 'Match 56'
        games_df.loc[60, 'group'] = 'Match 61'
        games_df.loc[61, 'group'] = 'Match 62'
        finals = [{'home': 'Loser match 61', 'away': 'Loser match 62', 'date': pd.to_datetime('Sat July 14 2018').date(), 'group': 'Match 63'},
                  {'home': 'Winner match 61', 'away': 'Winner match 62', 'date': pd.to_datetime('Sun July 15 2018').date(), 'group': 'Match 64'}]
        games_df = pd.concat([games_df, pd.DataFrame(finals)], ignore_index=True)
        games_df['match'] = games_df.index + 1
        games_df.loc[games_df['group'].str.contains('Match'), 'group'] = 'Playoffs'
        games_df['home_score'] = None
        games_df['away_score'] = None
        games_df.to_pickle(pickle_file)
    
    return games_df



In [109]:
fixture = get_fixture(use_pickle=False)
fixture

Unnamed: 0,away,date,group,home,match,home_score,away_score
0,Saudi Arabia,2018-06-14,Group A,Russia,1,,
1,Uruguay,2018-06-15,Group A,Egypt,2,,
2,Iran,2018-06-15,Group B,Morocco,3,,
3,Spain,2018-06-15,Group B,Portugal,4,,
4,Australia,2018-06-16,Group C,France,5,,
5,Iceland,2018-06-16,Group D,Argentina,6,,
6,Denmark,2018-06-16,Group C,Peru,7,,
7,Nigeria,2018-06-16,Group D,Croatia,8,,
8,Serbia,2018-06-17,Group E,Costa Rica,9,,
9,Mexico,2018-06-17,Group F,Germany,10,,


### Sjekker at vi har de lagene som trengs for simulering

In [7]:
teams = pd.read_pickle(Path('../data/teams.pkl'))
unique_teams = set(fixture.loc[:47, 'away'].values) | set(fixture.loc[:47, 'home'].values)
lacking = unique_teams - set(teams.team.values)
assert(len(lacking) == 0)
entering_teams = unique_teams & set(teams.team.values)

In [29]:
Path('../data/all_teams.json').write_text(json.dumps(list(teams.team.values)))

3155

### Lager grupper

In [8]:
groups = (fixture
          .append(
              fixture
              .drop(columns='home')
              .rename(columns={'away': 'home'})
              )
          .loc[fixture.group.str.contains('Group'), ['group', 'home']]
          .rename(columns={'home': 'team'})
          .drop_duplicates()
          .sort_values(by=['group', 'team'])
          .reset_index(drop=True)
          .apply(lambda col: col.str.replace('Group ', ''))
          .groupby('group').agg(lambda vals: list(vals))
          .to_dict()['team']
         )
Path('../data/groups.json').write_text(json.dumps(groups))
groups

{'A': ['Egypt', 'Russia', 'Saudi Arabia', 'Uruguay'],
 'B': ['Iran', 'Morocco', 'Portugal', 'Spain'],
 'C': ['Australia', 'Denmark', 'France', 'Peru'],
 'D': ['Argentina', 'Croatia', 'Iceland', 'Nigeria'],
 'E': ['Brazil', 'Costa Rica', 'Serbia', 'Switzerland'],
 'F': ['Germany', 'Mexico', 'South Korea', 'Sweden'],
 'G': ['Belgium', 'England', 'Panama', 'Tunisia'],
 'H': ['Colombia', 'Japan', 'Poland', 'Senegal']}

## The models

### Outcome class for storing game outcomes

In [116]:
class Outcome:
    def __init__(self, home, away, home_goals, away_goals, date=None, can_draw=True, to_overtime=False):
        self.home = home
        self.away = away
        self.home_goals = home_goals
        self.away_goals = away_goals
        self.date = date
        self.can_draw = can_draw
        self.to_overtime = to_overtime
        
        if home_goals > away_goals:
            self.winner, self.loser = self.home, self.away
        elif away_goals > home_goals:
            self.winner, self.loser = self.away, self.home
        else:
            self.winner = None
            
    
    def __repr__(self):
        return f"Outcome(home={self.home}, away={self.away}, home_goals={self.home_goals}, away_goals={self.away_goals}, can_draw={self.can_draw}, to_overtime={self.to_overtime})"
    
    @property
    def stats(self):
        winner = self.winner
        return {self.home: self.home_goals, self.away: self.away_goals, 'stats': {'winner': winner, 'date': self.date, 'can_draw': self.can_draw, 'to_overtime': self.to_overtime}}
    
        
    @staticmethod
    def _score_points(own_goals, other_goals):
        if own_goals > other_goals:
            return 3
        elif own_goals < other_goals:
            return 0
        return 1
    
    @property
    def teams(self):
        return [self.home, self.away]
    
    @property
    def home_stats(self):
        return stats(repr(self.home), self.home_goals, self.away_goals, self._score_points(self.home_goals, self.away_goals))
    
    @property
    def away_stats(self):
        return stats(repr(self.away), self.away_goals, self.home_goals, self._score_points(self.away_goals, self.home_goals))

        

### Groups

Read advancement rules here if in doubt: https://www.uefa.com/MultimediaFiles/Download/Regulations/uefaorg/Regulations/01/87/54/21/1875421_DOWNLOAD.pdf

In [129]:
class Group:
    def __init__(self, group_num, match):
        self.fixture = get_fixture().query("group == @group_num")
        self.match = match
        self.games = self.gen_fixture()
        self.ranking = self.play()

        
    
    def gen_fixture(self):
        out = []
        for game in self.fixture.to_dict(orient='index').values():
            if game['home_score'] == None: #Not yet played
                out.append(self.match(game['home'], game['away'], date=game['date'], can_draw=True))
            else: #Game is played, just recording the outcome
                out.append(Outcome(game['home'], game['away'], game['home_score'], game['away_score']))
        return out
    
    def play(self):
        results = []
        for g in self.games:
            results.append(g.home_stats)
            results.append(g.away_stats)
        self.results = pd.DataFrame(results).assign(goals_diff = lambda df: df.goals_scored - df.goals_admitted)
        
        
        return (self.results
                .groupby('team')
                .sum()
                .sort_values(by=['points', 'goals_scored', 'goals_diff'], ascending=[False, False, False])
                [['points', 'goals_scored', 'goals_diff', 'goals_admitted']]
                .reset_index()
               )
    
    @property
    def winner(self):
        return eval(self.ranking.loc[0, 'team'])
   
    @property
    def second(self):
        return eval(self.ranking.loc[1, 'team'])
        
    
if TEST:        
    g = Group('Group A', match=m)
    print(g.ranking)
    print(f'The group winner is {g.winner}')

             team  points  goals_scored  goals_diff  goals_admitted
0        'Russia'       7             7           5               2
1       'Uruguay'       6             3           0               3
2         'Egypt'       2             2          -1               3
3  'Saudi Arabia'       1             1          -4               5
The group winner is Russia


### Tournament

In [131]:
class Playoffs:
    """
    The World Cup 2018 playoff structure. To be used both in WC simulations, and when the WC is underway.
    The team dict has to have keys "A1" etc for every group (A, ..., H) and 1 and 2. Values are country names.
    """
    def __init__(self, team_dict, match, intercept: float = 0, home_advantage: float = 0):
        necessary_keys = [''.join([el[0], el[1]]) for el in itertools.product(list('ABCDEFGH'), list('12'))]
        for key in necessary_keys:
            assert key in team_dict.keys(), f"{key} is not in the team_dict passed on from the group play"
        self.team_dict = team_dict
        self.match = match
        self.intercept = intercept
        self.home_advantage = home_advantage
        
        self.play()
        
    def play(self):
        playoffs = {}
        # 1/8 finals
        playoffs[49] = self.match(self.team_dict['D2'], self.team_dict['C1'], can_draw=False)
        playoffs[50] = self.match(self.team_dict['B2'], self.team_dict['A1'], can_draw=False)
        playoffs[51] = self.match(self.team_dict['A2'], self.team_dict['B1'], can_draw=False)
        playoffs[52] = self.match(self.team_dict['C2'], self.team_dict['D1'], can_draw=False)
        playoffs[53] = self.match(self.team_dict['F2'], self.team_dict['C1'], can_draw=False)
        playoffs[54] = self.match(self.team_dict['H2'], self.team_dict['G1'], can_draw=False)
        playoffs[55] = self.match(self.team_dict['E2'], self.team_dict['F1'], can_draw=False)
        playoffs[56] = self.match(self.team_dict['G2'], self.team_dict['H1'], can_draw=False)
        
        # 1/4 finals
        playoffs[57] = self.match(playoffs[50].winner, playoffs[49].winner, can_draw=False)
        playoffs[58] = self.match(playoffs[54].winner, playoffs[53].winner, can_draw=False)
        playoffs[59] = self.match(playoffs[56].winner, playoffs[55].winner, can_draw=False)
        playoffs[60] = self.match(playoffs[52].winner, playoffs[51].winner, can_draw=False)
        
        # Semi finals
        playoffs[61] = self.match(playoffs[58].winner, playoffs[57].winner, can_draw=False)
        playoffs[62] = self.match(playoffs[60].winner, playoffs[59].winner, can_draw=False)
        
        # Bronze final
        playoffs[63] = self.match(playoffs[62].loser, playoffs[61].loser, can_draw=False)
        
        # Final
        playoffs[64] = self.match(playoffs[62].winner, playoffs[61].winner, can_draw=False)
        
        self.playoffs = playoffs
        
        self.winner = playoffs[64].winner

    def get_placement(self, team):
        if team == self.playoffs[64].winner:
            return 1
        elif team == self.playoffs[64].loser:
            return 2
        elif team == self.playoffs[63].winner:
            return 3
        elif team == self.playoffs[63].loser:
            return 4
        elif team in list(itertools.chain.from_iterable([self.playoffs[x].teams for x in [57, 58, 59, 60]])):
            return 8
        elif team in list(itertools.chain.from_iterable([self.playoffs[x].teams for x in [49, 50, 51, 52, 53, 54, 55, 56]])):
            return 16
        else:
            return None


        

In [134]:
class WorldCup:
    """
    Simulates the world cup. Requires a team_list, which is a list of 32 Team instances. 
    Every nation that is in the World Cup has to be represented here.
    """
    def __init__(self, match):
        grouping = json.loads(Path('../data/groups.json').read_text())
        self.team_list = [team for teams in grouping.values() for team in teams]
        self.match = match
        self.play()
        
    
    def group_play(self):
        groups = {}
        for group in list('ABCDEFGH'):
            group_name = f"Group {group}"
            groups[group_name] = Group(group_name, match=self.match)
        self.groups = groups

    def play(self):
        self.group_play()

        self.playoffs = Playoffs(self.advancement, match=self.match)
               
    @property
    def advancement(self):
        adv = {}
        for group_name in self.groups.keys():
            group = group_name.split()[-1]
            adv[group+'1'] = self.groups[group_name].winner
            adv[group+'2'] = self.groups[group_name].second
        return adv
    
    def get_placement(self, team):
        place = self.playoffs.get_placement(team)
        if place is not None:
            return place
        return 24
        
    @property
    def stats(self):
        outcome = [(self.get_placement(team), team) for team in self.team_list]
        outcome = pd.DataFrame(outcome, columns=['rank', 'team']).sort_values(by='rank').set_index('team', drop=True)
        return outcome

                
        
if TEST:            
    entry_teams = [Team(team_name, np.random.random(), np.random.random()) for team_name in entering_teams]
    wc = WorldCup(m)
    print(wc.stats)

              rank
team              
Russia           1
Costa Rica       2
Denmark          3
England          4
Germany          8
Uruguay          8
Belgium          8
Argentina        8
Sweden          16
Poland          16
Croatia         16
Senegal         16
Australia       16
Spain           16
Portugal        16
Tunisia         24
Colombia        24
Japan           24
South Korea     24
Mexico          24
Panama          24
Egypt           24
Serbia          24
Brazil          24
Iceland         24
Peru            24
France          24
Morocco         24
Iran            24
Saudi Arabia    24
Switzerland     24
Nigeria         24


### Simulation wrapper

In [61]:
class Simulation:
    """
    Class for simulating the World Cup 2018 with different parameters
    """
    def __init__(self, predictors: list, n: int = 1):
        """
        Runs n simulations for each set of parameters in pars, and returns the aggregate statistics
        """
        self.predictors = predictors
        self.n = n
        self.sim = self.simulate()
        
    def simulate(self):
        out = pd.concat([WorldCup(pred).stats.T for pred in self.predictors for _ in range(self.n)], axis=0).reset_index(drop=True)
        return (pd
                 .concat([
                     out.describe().T.sort_values(by='50%'),
                     out.agg(lambda col: np.mean(col==1)).rename('share wins'),
                     out.agg(lambda col: np.mean(col<=4)).rename('share top 4'),
                     out.agg(lambda col: np.mean(col<24)).rename('share playoff')
                         ], axis=1)
                 .drop('count', axis='columns')
                 .merge(right=pd.DataFrame([(group, country) for group in groups.keys() for country in groups[group]], columns = ['group', 'country']),
                        left_index=True,
                        right_on='country'
                       )
                 .sort_values(by='share wins', ascending=False)
                 .reset_index(drop=True)
                 [['country', 'group', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'share wins', 'share top 4', 'share playoff']]
                )

    @property
    def group(self):
        colors = []
        for _, color in menon_styles.menon_farger.items():
            colors.append(tuple([int(255*el) for el in color]))

        return (self.sim
                .sort_values(by=['group', 'share playoff'], ascending=[True, False])
                .reset_index(drop=True)
                .style
                 .apply(lambda row: [f"background-color: {'#%02x%02x%02x'%colors[(ord(row.group)-1)%8]}"]*len(row), axis=1)
                 #.bar(subset=['share wins', 'share top 4', 'share playoff'], align='left', color=['#5fba7d'])
                 .format({'share wins': "{:.0%}", 'share top 4': "{:.0%}", 'share playoff': "{:.0%}"})
               )
    
    @property
    def style(self):
        return (self.sim
                .style
                .bar(subset=['share wins', 'share top 4', 'share playoff'], align='left', color=['#5fba7d'])
                .format({'share wins': "{:.0%}", 'share top 4': "{:.0%}", 'share playoff': "{:.0%}"})
                .set_properties(**{'border-color': 'black'})
               )


## Estimated simulators

### Attack and defense estimation

#### Teams

In [9]:
class Team:
    __slots__ = ['name', 'atts', 'defs', '_hash']
    
    def __init__(self, name: str, atts: float = 0, defs: float = 0):
        self.name = name
        self.atts = atts
        self.defs = defs
        self._hash = hash((self.name, self.atts, self.defs))
    
    def __hash__(self):
        return self._hash
        
    def __str__(self):
        return f"{self.name} - atts: {self.atts}, defs: {self.defs}"
    
    def __repr__(self):
        return f"Team('{self.name}', {self.atts}, {self.defs})"
    
    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return hash(self) == hash(other)
        else:
            return False

if TEST:
    nor = Team(name='Norway', atts=.2, defs=.1)
    bra = Team('Brazil', atts=.8, defs=.5)
    den = Team('Denmark', atts=.4, defs=.15)
    aus = Team('Australia', atts=.6, defs=.3)
    print(nor)

Norway - atts: 0.2, defs: 0.1


In [123]:
stats = collections.namedtuple('stats', 'team goals_scored goals_admitted points')

In [119]:
class ConstantPar:
    def __init__(self, method='random', russian_home_advantage=True):
        if method == 'random':
            self.team_list, self.intercept, self.home_advantage = self.get_pars_random(russian_home_advantage)
        elif method == 'median':
            self.team_list, self.intercept, self.home_advantage = self.get_pars_median(russian_home_advantage)
        else:
            raise KeyError("Method has to be either 'random' or 'median'")
        

    @staticmethod
    def get_team_idx(team_name):
        teams = pd.read_pickle(Path('../data/teams.pkl'))
        return teams.loc[teams['team'] == team_name, 'team_idx'].values[0]
    
    
    def get_pars_random(self, seed=None, russian_home_advantage=True):
        data = pickle.loads(Path('../data/pymc_trace.pkl').read_bytes())
        model, trace = data['model'], data['trace']
        
        all_teams = json.loads(Path('../data/all_teams.json').read_text())
        
        np.random.seed(seed)
        idx = np.random.choice(trace['intercept'].shape[0])
        intercept = trace['intercept'][idx]
        home_advantage = trace['home'][idx]
        team_list = {}
        for team in all_teams:
            atts = trace['atts'][idx, self.get_team_idx(team)]
            defs = trace['defs'][idx, self.get_team_idx(team)]
            if (team == 'Russia') and russian_home_advantage:
                atts += home_advantage
            team_list[team] = Team(name=team, atts=atts, defs=defs)
        return team_list, intercept, home_advantage
    
    
    def get_pars_median(self, russian_home_advantage=True):
        data = pickle.loads(Path('../data/pymc_trace.pkl').read_bytes())
        model, trace = data['model'], data['trace']
        
        all_teams = json.loads(Path('../data/all_teams.json').read_text())
        
        intercept = pm.stats.quantiles(trace['intercept'])[50]
        home_advantage = pm.stats.quantiles(trace['home'])[50]
        atts = pm.stats.quantiles(trace['atts'])[50]
        defs = pm.stats.quantiles(trace['defs'])[50]
        team_list = {}
        for team in all_teams:
            atts_team = atts[self.get_team_idx(team)]
            defs_team = defs[self.get_team_idx(team)]
            if (team == 'Russia') and russian_home_advantage:
                atts_team += home_advantage
            team_list[team] = Team(name=team, atts=atts_team, defs=defs_team)
        return team_list, intercept, home_advantage
    
        
    
    def __call__(self, home_team, away_team, date=None, can_draw=True):
        home = self.team_list[home_team]
        away = self.team_list[away_team]
        
        home_theta = np.exp(self.intercept + home.atts + away.defs + self.home_advantage )
        away_theta = np.exp(self.intercept + away.atts + home.defs)

        # Ordinary time
        home_goals = np.random.poisson(home_theta)
        away_goals = np.random.poisson(away_theta)

        if (home_goals == away_goals) and (not can_draw): # Assumes golden goal
            to_overtime = True
            first_home_goal = np.random.exponential(1/home_theta)
            first_away_goal = np.random.exponential(1/away_theta)
            if first_home_goal < first_away_goal:
                home_goals += 1
            else:
                away_goals += 1
        else:
            to_overtime = False
        if home_goals > away_goals:
            winner, self = home_team, away_team
        elif away_goals > home_goals:
            winner, self = away_team, home_team
        else:
            winner = None

        return Outcome(home=home_team,
                       away=away_team,
                       home_goals=home_goals,
                       away_goals=away_goals,
                       date=date,
                       can_draw=can_draw,
                       to_overtime=to_overtime)
        
    
if TEST:
    m = ConstantPar()
    o = m('Norway', 'Brazil')
    print(o.stats)
    print(o.away_stats)

{'Norway': 2, 'Brazil': 2, 'stats': {'winner': None, 'date': None, 'can_draw': True, 'to_overtime': False}}
stats(team="'Brazil'", goals_scored=2, goals_admitted=2, points=1)


### Elo-based estimation

In [98]:
class TeamElo:
    """
    Elo based predictions. Based off of
    - Fivethirtyeight: https://github.com/fivethirtyeight/nfl-elo-game/blob/master/forecast.py
    - World Football Elo ratings: https://www.eloratings.net/about
    - This academic prediction paper: http://www.collective-behavior.com/publ/ELO.pdf
    """
    def __init__(self, team):
        self.team = team
        self.elos = []
        self.last_elo = 1500.0

    
    def add_elo(self, date, change):
        self.elos.append({'team': self.team,
                          'date': date,
                          'elo': self.last_elo,
                          'elo_new': self.last_elo + change})
        self.last_elo += change
        
    
    @property
    def df(self):
        return pd.DataFrame(self.elos).sort_values(by='date', ascending=True)
    
    
    def plot(self):
        df = self.df
        ax = df.plot(x='date', y='elo')
        return ax
    
class Elo:
    def __init__(self):
        """
        Class for generating Elo scores and predicting game outcomes. Has to be trained with a games df
        """
        teams = {}
        unique_teams = set(df.home_team.values) | set(df.away_team.values)
        for team in unique_teams:
            teams[team] = TeamElo(team)
            
        self.teams = teams
        self.HFA = 100.0    # Home field advantage 
        self.K = 20.0       # The speed at which Elo ratings change
        
        self.lr = LogisticRegression() # Logistic regression for predicting outcome of games



        
    def train(self, df=None):
        """ Generates win probabilities and estimates Elo scores for each country """
        if df is None:
            df = pd.read_pickle('../data/games_idx.pkl')
    
        games = df.sort_values(by='date', ascending=True).to_dict(orient='index')

        


        for _, game in games.items():
            home_team, away_team = game['home_team'], game['away_team']
            
            out = self.update(date=game['date'],
                              home_team=game['home_team'],
                              away_team=game['away_team'],
                              home_score=game['home_score'],
                              away_score=game['away_score'],
                              neutral=game['neutral']
                             )

            game['home_elo'], game['away_elo'], game['elo_home_win_prob'] = out['home_elo'], out['away_elo'], out['p_home']

        out = pd.DataFrame(games).T
        out['draw'] = 1-out['home_win'] - out['away_win']
        X = (out['home_elo'] - out['away_elo']).values
        y = out[['home_win', 'draw', 'away_win']].astype(int).values
        
        self.lr.fit(X.reshape(-1, 1), np.argwhere(y==1)[:,1])

        return out
    
    def __call__(self, home_team, away_team, can_draw=True):
        out = self.predict_update(home_team, away_team, can_draw=can_draw)
        return Outcome(home_team, away_team, out['home_score'], out['away_score'], can_draw, out['to_overtime'])
        
    
    def predict_update(self, home_team, away_team, date=None, neutral=True, can_draw=True):
        winner, to_overtime = self.pred(home_team, away_team, neutral, can_draw)
        if winner == home_team:
            home_score, away_score = 1, 0
        elif winner == away_team:
            home_score, away_score = 0, 1
        else:
            home_score, away_score = 0, 0
        
        out = self.update(date, home_team, away_team, home_score, away_score, neutral)
        return {**out, **{'home_score': home_score, 'away_score': away_score, 'to_overtime': to_overtime}}
    
    
    def update(self, date, home_team, away_team, home_score, away_score, neutral):
        home_elo, away_elo = self.teams[home_team].last_elo, self.teams[away_team].last_elo
        p_home = self._predict_from_elo(home_elo, away_elo, neutral)
        shift = self.elo_change(home_score - away_score, p_home)

        self.teams[home_team].add_elo(date, shift)
        self.teams[away_team].add_elo(date, -shift)
        
        return {'home_elo': home_elo, 'away_elo': away_elo, 'p_home': p_home}
        
    

    def pred(self, home_team, away_team, neutral=True, can_draw=True, to_overtime=False):
        home_elo, away_elo = self.teams[home_team].last_elo, self.teams[away_team].last_elo
        ar = self.lr.predict_proba(home_elo - away_elo)
        outcome = [home_team, 'draw', away_team][np.argmax(np.random.random() < ar.cumsum())]
        if outcome == 'draw' and can_draw == False:
            return self.pred(home_team, away_team, neutral, can_draw, to_overtime=True)
        return outcome, to_overtime        

    
    @property
    def teams_df(self):
        teams = (pd
                 .DataFrame
                 .from_records([val for team in teams.values() for val in self.team.elos])
                 .sort_values(by=['team', 'date'], ascending=[True, True])
                 .reset_index()
                )
        return teams

    
    def _predict_from_elo(self, home_elo, away_elo, neutral=True):
        """
        Given two teams, will predict probability for home win
        """
        elo_diff = home_elo - away_elo + (0 if neutral == 1 else self.HFA)

        p_home = 1.0 / (math.pow(10.0, (-elo_diff/400.0)) + 1.0)
        
        return p_home
    

    def elo_change(self, score_diff, estimated_p_home):
        if abs(score_diff) == 1:
            mult = 1
        elif abs(score_diff) == 2:
            mult = 1.5
        elif abs(score_diff) == 3:
            mult = 1.75
        else:
            mult = 1+ (abs(score_diff)-3)/8

        if score_diff == 0:
            result = .5
        elif score_diff > 0:
            result = 1
        else:
            result = 0

        # Elo shift based on K and the margin of victory multiplier
        shift = (self.K * mult) * (result - estimated_p_home)
        
        return shift





In [99]:
elo = Elo()
elo_teams = elo.train()
Path('../data/elo_pickle.pkl').write_bytes(pickle.dumps(elo))

4257828

In [100]:
elo = pickle.loads(Path('../data/elo_pickle.pkl').read_bytes())

In [106]:
sim = Simulation([elo for _ in range(100)], n=1)
sim.group

Unnamed: 0,country,group,mean,std,min,25%,50%,75%,max,share wins,share top 4,share playoff
0,Egypt,A,12.87,5.16721,2,8,16,16,16,0%,20%,100%
1,Russia,A,15.36,4.31937,4,16,16,16,24,0%,2%,90%
2,Saudi Arabia,A,23.2,2.41209,16,24,24,24,24,0%,0%,10%
3,Uruguay,A,24.0,0.0,24,24,24,24,24,0%,0%,0%
4,Iran,B,8.99,3.9119,1,8,8,8,24,1%,11%,99%
5,Morocco,B,9.64,7.38347,2,4,8,16,24,0%,46%,86%
6,Portugal,B,22.76,4.12707,4,24,24,24,24,0%,3%,10%
7,Spain,B,23.52,2.22238,8,24,24,24,24,0%,0%,5%
8,Australia,C,15.32,3.98198,4,16,16,16,24,0%,1%,92%
9,Denmark,C,18.56,4.0808,8,16,16,24,24,0%,0%,66%


In [107]:
sim.style

Unnamed: 0,country,group,mean,std,min,25%,50%,75%,max,share wins,share top 4,share playoff
0,Argentina,D,1.22,1.05006,1,1,1,1,8,93%,98%,100%
1,England,G,5.97,6.47521,1,2,3,5,24,4%,75%,94%
2,Belgium,G,5.2,6.06197,1,2,2,3,24,2%,79%,96%
3,Iran,B,8.99,3.9119,1,8,8,8,24,1%,11%,99%
4,Poland,H,22.84,3.42516,4,24,24,24,24,0%,1%,12%
5,Portugal,B,22.76,4.12707,4,24,24,24,24,0%,3%,10%
6,Russia,A,15.36,4.31937,4,16,16,16,24,0%,2%,90%
7,Saudi Arabia,A,23.2,2.41209,16,24,24,24,24,0%,0%,10%
8,Senegal,H,24.0,0.0,24,24,24,24,24,0%,0%,0%
9,Serbia,E,22.48,4.20793,8,24,24,24,24,0%,0%,13%


In [65]:
sim = Simulation([ConstantPar() for _ in range(5)], n=100)
sim.group

Unnamed: 0,country,group,mean,std,min,25%,50%,75%,max,share wins,share top 4,share playoff
0,Russia,A,9.084,7.48284,1,2,8,16,24,14%,47%,91%
1,Uruguay,A,16.716,8.15895,1,8,16,24,24,4%,14%,52%
2,Egypt,A,19.086,6.35479,1,16,24,24,24,0%,4%,44%
3,Saudi Arabia,A,22.446,4.35258,1,24,24,24,24,0%,1%,13%
4,Spain,B,14.126,8.63431,1,8,16,24,24,7%,24%,66%
5,Portugal,B,16.932,7.81803,1,8,16,24,24,3%,14%,53%
6,Iran,B,18.732,6.2246,1,16,24,24,24,0%,5%,49%
7,Morocco,B,20.746,5.30261,2,16,24,24,24,0%,2%,32%
8,Denmark,C,14.486,7.87997,1,8,16,24,24,4%,18%,69%
9,France,C,15.256,8.21739,1,8,16,24,24,4%,19%,62%


In [66]:
sim.style

Unnamed: 0,country,group,mean,std,min,25%,50%,75%,max,share wins,share top 4,share playoff
0,Argentina,D,8.68,6.92514,1,2,8,16,24,20%,41%,95%
1,England,G,6.71,6.35663,1,2,3,8,24,19%,61%,97%
2,Russia,A,9.084,7.48284,1,2,8,16,24,14%,47%,91%
3,Germany,F,10.528,7.10818,1,3,8,16,24,10%,29%,90%
4,Spain,B,14.126,8.63431,1,8,16,24,24,7%,24%,66%
5,France,C,15.256,8.21739,1,8,16,24,24,4%,19%,62%
6,Uruguay,A,16.716,8.15895,1,8,16,24,24,4%,14%,52%
7,Denmark,C,14.486,7.87997,1,8,16,24,24,4%,18%,69%
8,Belgium,G,10.782,6.92346,1,4,8,16,24,4%,30%,90%
9,Croatia,D,13.186,7.32931,1,8,16,16,24,3%,13%,77%
