In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import lightgbm as lgbm

from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import joblib

import re 

import datetime 

plt.style.use("fivethirtyeight")
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def get_intermediate(df):
    """Stash ``df`` in the module-level ``_df`` and hand it straight back.

    Intended for use inside a ``.pipe(...)`` chain: the following step of the
    chain can then refer to the intermediate frame through the global ``_df``.
    """
    global _df
    _df = df
    return _df

In [3]:
!ls ../../datasets/

elo_rating.csv				 fifa_world_cup_2022
elo_rating_preprocessed.csv		 international_football_results
fifa_ranking_before_wc.csv		 predictions
fifa_ranking_before_wc_preprocessed.csv


## Modelagem

Considerações:
- Usamos dados de 2018 em diante para validação
- Apesar da pandemia, houve jogos, inclusive de qualificação da FIFA.
- Só podemos usar features que conseguimos estimar com boa confiança para as partidas da Copa de 2022. Isso vai guiar algumas decisões de ficar ou não com certas features. Para as métricas de times como defesa, ataque e goleiro, vamos manter o último dado disponível
- Removemos torneios com 100 partidas ou menos (mantemos apenas os que têm mais de 100 partidas)

## Usando `fifa_world_cup_2022`

In [4]:
def get_2022_players():
    """Return the unique team names that appear in 2022 FIFA World Cup matches.

    Reads the preprocessed results CSV, keeps the rows whose tournament is
    'FIFA World Cup' and whose year is 2022, and collects every name found in
    either the home or the away column.
    """
    results = pd.read_csv("../../datasets/international_football_results/results_preprocessed.csv")
    is_world_cup_2022 = (results.tournament == 'FIFA World Cup') & (results.year == 2022)
    team_names = results[is_world_cup_2022][['home_team', 'away_team']].to_numpy().flatten()
    # np.unique removes duplicates and returns the names sorted.
    return np.unique(team_names)

players2022 = get_2022_players()
print("# Players: ", len(players2022))
players2022

# Players:  32


array(['Argentina', 'Australia', 'Belgium', 'Brazil', 'Cameroon',
       'Canada', 'Costa Rica', 'Croatia', 'Denmark', 'Ecuador', 'England',
       'France', 'Germany', 'Ghana', 'Iran', 'Japan', 'Mexico', 'Morocco',
       'Netherlands', 'Poland', 'Portugal', 'Qatar', 'Saudi Arabia',
       'Senegal', 'Serbia', 'South Korea', 'Spain', 'Switzerland',
       'Tunisia', 'United States', 'Uruguay', 'Wales'], dtype=object)

In [5]:
def merge_names(df):
    """Rename three alternative team spellings in ``home_team``/``away_team``.

    The frame is modified in place and also returned, so the function can be
    used in a ``.pipe(...)`` chain. Names not listed in the alias table are
    left unchanged.
    """
    aliases = {
        'IR Iran': 'Iran',
        'USA': 'United States',
        'Korea Republic': 'South Korea'
    }

    def canonical(name):
        return aliases.get(name, name)

    df['home_team'] = df['home_team'].apply(canonical)
    df['away_team'] = df['away_team'].apply(canonical)
    return df
    

df_fifa_world_cup_2022 = (
    pd.read_csv('../../datasets/fifa_world_cup_2022/fifa_world_cup_2022.csv')
    .pipe(merge_names)
    # Keep matches in which at least one side is a 2022 World Cup team.
    # A callable passed to .loc receives the frame produced by the previous
    # step, so no module-level scratch variable is needed for the filter.
    .loc[
        lambda matches:
        matches.home_team.isin(players2022) | matches.away_team.isin(players2022)
    ]
)


df_fifa_world_cup_2022.head()

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,1993-08-08,Bolivia,Uruguay,South America,South America,59,22,0,0,3,...,No,Win,,,,,,,,
1,1993-08-08,Brazil,Mexico,South America,North America,8,14,0,0,1,...,No,Draw,,,,,,,,
2,1993-08-08,Ecuador,Venezuela,South America,South America,35,94,0,0,5,...,No,Win,,,,,,,,
4,1993-08-08,Paraguay,Argentina,South America,South America,67,5,0,0,1,...,No,Lose,,,,,,,,
9,1993-08-11,Sweden,Switzerland,Europe,Europe,4,3,0,0,1,...,No,Lose,,,,,,,,


#### Filtra apenas times da copa

In [6]:
all_players = set(df_fifa_world_cup_2022.home_team.to_list() + df_fifa_world_cup_2022.away_team.to_list())

assert set(players2022).issubset(all_players), [t for t in players2022 if t not in all_players]

In [7]:
df_fifa_world_cup_2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9321 entries, 0 to 23920
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   date                           9321 non-null   object 
 1   home_team                      9321 non-null   object 
 2   away_team                      9321 non-null   object 
 3   home_team_continent            9321 non-null   object 
 4   away_team_continent            9321 non-null   object 
 5   home_team_fifa_rank            9321 non-null   int64  
 6   away_team_fifa_rank            9321 non-null   int64  
 7   home_team_total_fifa_points    9321 non-null   int64  
 8   away_team_total_fifa_points    9321 non-null   int64  
 9   home_team_score                9321 non-null   int64  
 10  away_team_score                9321 non-null   int64  
 11  tournament                     9321 non-null   object 
 12  city                           9321 non-null   

## Preprocess

In [20]:
# Only tournaments with more than this many matches (in the filtered frame)
# are kept for modelling.
MIN_TOURNAMENT_MATCHES = 100

elegible_tournaments = (
    df_fifa_world_cup_2022
    .groupby('tournament')
    .agg({
        'home_team': 'count',
        'date': ['min', 'max'],
    })
    .sort_values(by=[('home_team', 'count')], ascending=False)
    .reset_index()
    # Callable .loc filters the aggregated frame directly instead of going
    # through a global scratch variable.
    .loc[lambda stats: stats[('home_team', 'count')] > MIN_TOURNAMENT_MATCHES]
)

elegible_tournaments

Unnamed: 0_level_0,tournament,home_team,date,date
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max
0,Friendly,3587,1993-08-08,2022-06-14
1,FIFA World Cup qualification,2349,1993-08-08,2022-06-14
2,UEFA Euro qualification,708,1994-09-04,2020-11-12
3,FIFA World Cup,384,1994-06-17,2018-07-15
4,African Cup of Nations qualification,250,1994-09-04,2022-06-13
5,African Cup of Nations,246,1994-03-26,2022-02-06
6,Copa América,225,1995-07-05,2021-07-10
7,UEFA Euro,209,1996-06-08,2021-07-11
8,Gold Cup,203,1996-01-10,2021-08-01
9,AFC Asian Cup,155,1996-12-04,2019-02-01


In [60]:
features_cols = [
    'home_team',
    'away_team',
    'home_team_fifa_rank',
    'away_team_fifa_rank',
    'home_team_total_fifa_points',
    'away_team_total_fifa_points',
#     'home_team_score',
#     'away_team_score',
    'tournament',
    'neutral_location',
    'home_team_goalkeeper_score',
    'away_team_goalkeeper_score',
    'home_team_mean_defense_score',
    'home_team_mean_offense_score',
    'home_team_mean_midfield_score',
    'away_team_mean_defense_score',
    'away_team_mean_offense_score',
    'away_team_mean_midfield_score'
]

target_col = ['home_team_result']

def make_label(x):
    """Encode a home-team result string as an integer label.

    'Win' -> 1, 'Draw' -> 0, 'Lose' -> -1. Any other value yields ``None``.
    """
    if x == 'Draw':
        return 0
    if x == 'Lose':
        return -1
    if x == 'Win':
        return 1
    return None
    
def add_features(df):
    """Add absolute and relative FIFA-points differences (home vs. away).

    Writes two new columns into ``df`` itself and returns the same frame:
    ``home2away_fifa_point_diff`` (home - away) and
    ``home2away_fifa_point_pdiff`` (home / away - 1).
    """
    home_points = df['home_team_total_fifa_points']
    away_points = df['away_team_total_fifa_points']

    df['home2away_fifa_point_diff'] = home_points - away_points
    # Rows with zero away points end up as inf or NaN here.
    df['home2away_fifa_point_pdiff'] = home_points / away_points - 1

    return df
    
def preprocess_fifa_world_cup_2022(
    df,
    elegible_tournaments=elegible_tournaments,
):
    """Filter, index and label the match table for modelling.

    Steps:
      1. keep only rows whose ``tournament`` is listed in
         ``elegible_tournaments.tournament``;
      2. parse ``date``, derive ``year``, cast ``neutral_location`` to bool;
      3. index the frame by ``date`` and add the FIFA-points difference
         features via ``add_features``;
      4. encode ``home_team_result`` with ``make_label`` and move it to the
         last column.

    The input frame is not modified; a new frame is returned.
    """
    df = (
        # Boolean filtering returns a new frame, so the caller's data is safe.
        df[df.tournament.isin(elegible_tournaments.tournament)]
        .assign(date=lambda d: pd.to_datetime(d.date))
        # Lambdas receive the frame built so far; no global scratch variable
        # is required to reference the parsed dates.
        .assign(year=lambda d: d.date.dt.year)
        .assign(neutral_location=lambda d: d.neutral_location.astype('bool'))
        .set_index('date')
        .pipe(add_features)
    )
    df['home_team_result'] = df['home_team_result'].apply(make_label)

    # Put the target in the last position.
    other_cols = [c for c in df.columns if c != 'home_team_result']
    df = df[other_cols + ['home_team_result']]

    return df

In [76]:
X = preprocess_fifa_world_cup_2022(df_fifa_world_cup_2022)

X.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8799 entries, 1993-08-08 to 2022-06-14
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   home_team                      8799 non-null   object 
 1   away_team                      8799 non-null   object 
 2   home_team_continent            8799 non-null   object 
 3   away_team_continent            8799 non-null   object 
 4   home_team_fifa_rank            8799 non-null   int64  
 5   away_team_fifa_rank            8799 non-null   int64  
 6   home_team_total_fifa_points    8799 non-null   int64  
 7   away_team_total_fifa_points    8799 non-null   int64  
 8   home_team_score                8799 non-null   int64  
 9   away_team_score                8799 non-null   int64  
 10  tournament                     8799 non-null   object 
 11  city                           8799 non-null   object 
 12  country                       

In [128]:
def get_rolling_feature(X, cols, aggregation, window=7, min_periods=1):
    """Apply a rolling aggregation to selected columns of ``X``.

    Parameters
    ----------
    X : DataFrame whose row order defines the rolling order.
    cols : iterable of column names to aggregate.
    aggregation : aggregation accepted by ``Rolling.agg`` (e.g. ``'sum'``).
    window : number of rows in the window (default 7, the previous fixed value).
    min_periods : minimum rows needed to emit a value (default 1).

    Returns
    -------
    DataFrame with one aggregated column per entry of ``cols``.
    """
    spec = {col: aggregation for col in cols}
    return X.rolling(window, min_periods=min_periods).agg(spec)

def normalize_df(df, side, team):
    """Extract the ``side`` ('home' or 'away') view of ``team``'s matches.

    Selects the rows where ``{side}_team`` equals ``team``, keeps the columns
    whose name contains ``{side}_`` and strips that prefix from their names.
    For the away side an ``away_team_result`` column (the sign-flipped home
    result) is first added to ``df`` itself, and the two ``home2away_*``
    difference columns are left out.
    """
    excluded = []
    if side == 'away':
        # Note: this writes into the frame that was passed in.
        df['away_team_result'] = np.sign(-1 * df['home_team_result'])
        excluded = ['home2away_fifa_point_diff', 'home2away_fifa_point_pdiff']

    marker = f'{side}_'
    side_cols = [name for name in df.columns if marker in name and name not in excluded]
    new_names = {name: re.sub(f'{side}_', '', name) for name in side_cols}

    team_rows = df[df[f'{side}_team'] == team]
    return team_rows[side_cols].rename(columns=new_names)
    

def normalize_matches(df, team):
    """
    Build one row per match played by ``team``, whether it was home or away.

    The home and away views are stacked, sorted by index and extended with
    ``team_results_mm7``: the rolling sum of ``team_result`` produced by
    ``get_rolling_feature``.
    """
    # Work on a copy because normalize_df writes a column for the away side.
    matches = df.copy()
    per_side = [normalize_df(matches, side, team) for side in ('home', 'away')]
    df_norm = pd.concat(per_side, axis=0).sort_index()

    # NOTE(review): the window appears to include the current match's own
    # result; confirm before using this column as a model input.
    rolling_results = get_rolling_feature(df_norm, ['team_result'], 'sum')
    df_norm['team_results_mm7'] = rolling_results

    return df_norm

def get_features_per_team(df, teams):
    """Stack the per-team normalized match tables of every team in ``teams``.

    Each team's table comes from ``normalize_matches``. The tables are
    collected first and concatenated once, instead of re-concatenating the
    growing result on every iteration.

    Returns an empty DataFrame when ``teams`` is empty.
    """
    frames = [normalize_matches(df, team) for team in teams]
    if not frames:
        return pd.DataFrame()

    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        # Every team produced an empty table: hand back the last one so the
        # column layout is still available to the caller.
        return frames[-1]
    return pd.concat(non_empty, axis=0)

In [129]:
df_norm_features = get_features_per_team(X, players2022)
df_norm_features.head()

Unnamed: 0_level_0,team,team_continent,team_fifa_rank,team_total_fifa_points,team_score,team_goalkeeper_score,team_mean_defense_score,team_mean_offense_score,team_mean_midfield_score,team_result,team_results_mm7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993-08-08,Argentina,South America,5,0,3,,,,,1,1.0
1993-08-15,Argentina,South America,5,0,1,,,,,-1,0.0
1993-08-22,Argentina,South America,5,0,2,,,,,1,1.0
1993-08-29,Argentina,South America,5,0,0,,,,,0,1.0
1993-09-05,Argentina,South America,5,0,0,,,,,-1,0.0


In [130]:
df_norm_features[df_norm_features.team == 'Brazil']

Unnamed: 0_level_0,team,team_continent,team_fifa_rank,team_total_fifa_points,team_score,team_goalkeeper_score,team_mean_defense_score,team_mean_offense_score,team_mean_midfield_score,team_result,team_results_mm7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993-08-08,Brazil,South America,8,0,1,,,,,0,0.0
1993-08-15,Brazil,South America,8,0,1,,,,,0,0.0
1993-08-22,Brazil,South America,8,0,2,,,,,1,1.0
1993-08-29,Brazil,South America,8,0,6,,,,,1,2.0
1993-09-05,Brazil,South America,8,0,4,,,,,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...
2022-02-01,Brazil,South America,2,1826,4,89.0,84.8,86.3,85.5,1,4.0
2022-03-24,Brazil,South America,2,1823,4,89.0,84.8,86.3,85.5,1,4.0
2022-03-29,Brazil,South America,2,1823,4,89.0,84.8,86.3,85.5,1,5.0
2022-06-02,Brazil,South America,1,1832,5,89.0,84.8,86.3,85.5,1,5.0


In [131]:
X[X.home_team == 'Brazil'].shape

(230, 27)

In [94]:
(
    X[X.home_team == 'Brazil']
#     .groupby('home_team', sort=False)
    ['home_team_result']
    .rolling(7)
    .sum()
    .rename('victories')
#     .shape
)

date
1993-08-08    NaN
1993-08-22    NaN
1993-08-29    NaN
1993-09-05    NaN
1993-09-19    NaN
             ... 
2021-09-09    4.0
2021-10-14    4.0
2021-11-11    4.0
2022-02-01    5.0
2022-03-24    5.0
Name: victories, Length: 230, dtype: float64

In [40]:
X.groupby('home_team', sort=False)['home_team_result'].rolling(7).sum().shape

(8799,)

In [26]:
X.groupby('home_team', sort=False)['home_team_result'].rolling(7).sum()

home_team       date      
Bolivia         1993-08-08   NaN
                1993-08-15   NaN
                1994-05-04   NaN
                1994-05-11   NaN
                1994-06-11   NaN
                              ..
Gibraltar       2021-03-30   NaN
Curaçao         2017-07-16   NaN
Chinese Taipei  2019-10-15   NaN
Cayman Islands  2021-03-29   NaN
Aruba           2021-06-05   NaN
Name: home_team_result, Length: 8799, dtype: float64

In [41]:
X.shape

(8799, 27)

## Brief EDA

In [None]:
X['2000':].home_team_total_fifa_points.hist()

In [None]:
X[X.home2away_fifa_point_pdiff.isnull()][:'2011'][['home_team','away_team','home_team_total_fifa_points', 'away_team_total_fifa_points']]

In [None]:
(
    X
    .assign(is_zero_home = X.home_team_total_fifa_points == 0)
    .assign(is_zero_away = X.away_team_total_fifa_points == 0)
    [X.home2away_fifa_point_pdiff.isnull()].groupby(['year', 'is_zero_home', 'is_zero_away'])
    ['home_team']
    .count()
)

In [None]:
X['home_team_result'].hist()

In [None]:
def plot_timeline(X, cutoff_time='2018-01-01'):
    """Plot the home-team result per year for the train and test periods.

    ``X`` must be indexed by date and contain ``year`` and
    ``home_team_result``. The test slice starts at ``cutoff_time``; the train
    slice ends 365 days *after* ``cutoff_time``, so the two slices overlap
    (presumably so the two lines meet on the chart — confirm).

    Returns the ``(train, test)`` slices that were plotted.
    """
    train_end = (pd.to_datetime(cutoff_time) + datetime.timedelta(days=365)).strftime('%Y-%m-%d')
    train = X[:train_end]
    test = X[cutoff_time:]
    f, ax = plt.subplots()
    sns.lineplot(data=train, x='year', y='home_team_result', label='Train', ax=ax)
    sns.lineplot(data=test, x='year', y='home_team_result', label='Test', ax=ax)
    # The y axis shows the result label, not a match count, so the axis label
    # now agrees with the title.
    ax.set_ylabel("Avg home team result")
    ax.set_title("Avg Home Team Result per year")

    return train, test
    
x1, x2 = plot_timeline(X)

In [None]:
X.groupby('year')['home_team_result'].count().plot()
plt.title("Matches per year")

In [None]:
X[X.home_team_score < 10]['home_team_score'].hist(bins=50)

In [None]:
X.year.unique()

In [None]:
# jogos na pandemia
X[X.year == 2020].tournament.unique()

In [None]:
f, ax = plt.subplots() 

sns.histplot(data=X['home_team_score'],label='home',ax=ax)
sns.histplot(data=X['away_team_score'],label='away',ax=ax);

# ax.legend(['home', 'away'])
ax.legend()

In [None]:
X['home_team_score'].mean(), X['away_team_score'].mean()

## Split Data



In [None]:
def split_dataset(
    df,
    cutoff_time='2018-05-14',
    min_date=None,
    features_cols=features_cols,
    target=target_col,
):
    """Split a date-indexed frame into train and test sets by time.

    Rows up to the day before ``cutoff_time`` form the training set; rows from
    ``cutoff_time`` onward form the test set. When ``min_date`` is given, rows
    before it are discarded first.

    Returns ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` are
    frames restricted to ``features_cols`` and the ``y_*`` are flat arrays
    taken from the ``target`` column(s).
    """
    data = df.copy()
    if min_date:
        data = data[min_date:]

    last_train_day = (pd.to_datetime(cutoff_time) - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    train_part, test_part = data[:last_train_day], data[cutoff_time:]

    def to_xy(part):
        return part[features_cols], part[target].to_numpy().ravel()

    X_train, y_train = to_xy(train_part)
    X_test, y_test = to_xy(test_part)

    # Guard against any date appearing on both sides of the split.
    assert set(X_train.index).isdisjoint(X_test.index)

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = split_dataset(X, min_date='2005-01-01')

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
X_test.shape[0] / X.shape[0]

In [None]:
X_train.head()

### LGBM

In [None]:
cat_features = ['home_team', 'away_team', 'tournament', 'neutral_location']

def make_categories(df, cat_features=cat_features):
    """Return a copy of ``df`` with ``cat_features`` cast to ``category``.

    The input frame is left untouched (the previous version rewrote the
    caller's columns in place), so re-running a cell that calls this function
    does not depend on earlier mutations.
    """
    return df.astype({col: 'category' for col in cat_features})

X_train_lgbm = make_categories(X_train)
X_test_lgbm = make_categories(X_test)

In [None]:
model_lgbm = lgbm.LGBMClassifier()
model_lgbm.fit(X_train_lgbm, y_train, categorical_feature=cat_features)

In [None]:
y_train_pred = model_lgbm.predict(X_train_lgbm)
y_test_pred = model_lgbm.predict(X_test_lgbm)

In [None]:
print(classification_report(y_true=y_train, y_pred=y_train_pred))

In [None]:
print(classification_report(y_true=y_test, y_pred=y_test_pred))

In [None]:
lgbm.plot_importance(model_lgbm)

In [None]:
plt.style.use('classic')
plot_confusion_matrix(model_lgbm, X_test_lgbm, y_test)
plt.style.use("fivethirtyeight")

In [None]:
import time

# Minute-resolution UTC timestamp used to version the saved model file.
# datetime.utcfromtimestamp() is deprecated in recent Python versions; asking
# for the current time in UTC directly yields the same formatted string.
ts = datetime.datetime.now(datetime.timezone.utc).strftime('%Y_%m_%d_%H_%M')
ts

In [None]:
# save model

joblib.dump(model_lgbm, f'lgbm_model_{ts}.pkl')
# load model
# gbm_pickle = joblib.load('lgbm_model.pkl')

In [None]:
!ls

## Prepare prediction dataset

Para preparar o dataset de predição, precisamos coletar as informações mais recentes de cada time para as próximas partidas reais, o que independe se estavam na coluna 'home' ou 'away'.

In [None]:
# Columns of X treated as team metrics: any column whose name contains
# 'score', 'points' or 'rank'. The leading `(home|away)*?` group is lazy and
# optional, so it does not restrict which names match.
# The second pattern drops `home_team_score` / `away_team_score`.
feature_metrics = [
    c for c in X.columns if re.search(r'(home|away)*?(score|points|rank)', c) 
    and not re.search(r'(home|away)_team_score', c)
]
feature_metrics

In [None]:
def get_recent_feature_one_player(df, player, team_side):
    """Return the latest metrics row of ``player`` on one side of the fixture.

    Parameters
    ----------
    df : date-indexed match frame.
    player : team name to look up.
    team_side : ``'home'`` or ``'away'``; selects ``{team_side}_team`` and the
        entries of the module-level ``feature_metrics`` containing that word.

    Returns
    -------
    A frame with at most one row (empty if the team never appears on that
    side), with the ``{team_side}_`` prefix stripped from the column names.
    Missing values are forward-filled from the team's earlier rows.
    """
    features = [f'{team_side}_team'] + [c for c in feature_metrics if team_side in c]
    renamed = {c: re.sub(f'{team_side}_', '', c) for c in features}
    return (
        df
        [(df[f'{team_side}_team'] == player)]
        [features]
        # Sort first so the forward fill always propagates older values into
        # newer rows, even if the caller passes an unsorted frame.
        .sort_index()
        # .ffill() replaces the deprecated fillna(method='ffill').
        .ffill()
        .tail(1)
        .rename(columns=renamed)
    )

def get_recent_features(df, players2022=players2022):
    """Collect the most recent metrics row for every team in ``players2022``.

    For each team the latest home-side row and the latest away-side row are
    fetched with ``get_recent_feature_one_player``; whichever is more recent
    is kept. The result has one row per team, indexed by the date of that
    row.

    Raises
    ------
    ValueError
        If ``players2022`` is empty, or if a team has no row on either side.
    """
    df = df.copy().sort_index()

    latest_rows = []
    for i, player in enumerate(players2022):
        # get most recent data for player home or away
        df_home = get_recent_feature_one_player(df, player, 'home')
        df_away = get_recent_feature_one_player(df, player, 'away')
        latest = pd.concat([df_home, df_away], axis=0).sort_index().tail(1)

        if latest.empty:
            raise ValueError(f"i={i}: Data for team '{player}' is empty.")
        latest_rows.append(latest)

    if not latest_rows:
        raise ValueError("No teams given: 'players2022' is empty.")

    # Concatenate once instead of growing the frame inside the loop.
    return (
        pd.concat(latest_rows, axis=0)
        .sort_index()
        .drop_duplicates(subset=['team'], keep='last')
    )

In [None]:
df_recent = get_recent_features(X)
df_recent.head()

In [None]:
df_recent[df_recent.duplicated(subset=['team'])]

In [None]:
df_recent.isnull().sum()

In [None]:
df_recent[df_recent.team_goalkeeper_score.isnull()]

In [None]:
get_recent_feature_one_player(X, 'Qatar', 'home')

In [None]:
team = 'Qatar'
# X['2005':].query(f"home_team == '{team}' or away_team == '{team}'")
X['2005':].query(f"home_team == '{team}'")[[c for c in feature_metrics if 'home' in c]]

In [None]:
set(players2022).issubset( set(df_recent.team))

In [None]:
df_recent.index

In [None]:
def get_quatar_matches():
    """Load the not-yet-played 2022 FIFA World Cup fixtures.

    Reads the raw results CSV and keeps the rows that are FIFA World Cup
    matches in 2022 with a missing ``home_score``. The score columns are
    dropped, ``neutral`` is renamed to ``neutral_location`` and the frame is
    indexed by ``date``. Prints the number of fixtures loaded.
    """
    df = (
        pd.read_csv("../../datasets/international_football_results/results.csv")
        # Lambdas receive the frame built so far, so no global scratch
        # variable is needed to reference the parsed date.
        .assign(date=lambda d: pd.to_datetime(d.date))
        .assign(year=lambda d: d.date.dt.year)
        .loc[
            lambda d:
            (d.home_score.isnull())  # scores are NaN for the fixtures still to be played
            & (d.tournament == 'FIFA World Cup')
            & (d.year == 2022)
        ]
        .drop(columns=['home_score', 'away_score'])
        .rename(columns={'neutral': 'neutral_location'})
        .set_index('date')
    )
    print(f"Loaded {df.shape[0]} Matches for 2022 Cup.")
    return df
    

quatar_matches = get_quatar_matches()
quatar_matches

In [None]:
df_recent.head()

In [None]:
df_recent.columns

In [None]:
def merge_features(df_prediction, df_recent, players):
    """
    Attach each side's most recent features to the fixtures to predict.

    ``df_recent`` is merged twice with ``pd.merge_asof`` on the date index:
    once with its columns prefixed ``home_`` (matched on ``home_team``) and
    once prefixed ``away_`` (matched on ``away_team``). ``players`` is
    accepted but not used.
    """
    recent_sorted = df_recent.sort_index()
    df_final = df_prediction.sort_index().copy()

    for side in ('home', 'away'):
        side_features = recent_sorted.rename(
            columns={c: f'{side}_{c}' for c in recent_sorted.columns}
        )
        df_final = pd.merge_asof(
            left=df_final,
            right=side_features,
            left_index=True,
            right_index=True,
            left_by=f'{side}_team',
            right_by=f'{side}_team',
        )

    # Assert we got all games
    assert df_prediction.shape[0] == df_final.shape[0]

    return df_final

In [None]:
df_prediction = merge_features(quatar_matches, df_recent, players2022)
df_prediction

In [None]:
df_prediction.to_csv('../../datasets/predictions/quatar2022_features.csv')

## Simular a Copa 2022

Agora, vamos simular os resultados da copa 2022 usando o modelo. Para isso, vamos criar uma classe que receberá como entrada o dataframe com as features mais recentes dos times da copa, um modelo treinado, e a informação das chaves dos grupos. 

Antes de prever os gols dos times com técnicas mais complexas, vamos assumir que, quando for previsto empate, vence quem tem mais rating da FIFA. Depois, vamos montar simulações usando a previsão dos gols com distribuição de Poisson.

In [None]:

# min_date='2005-01-01'

In [None]:
X_train_lgbm.info()

In [None]:
np.concatenate((y_train, y_test))

In [None]:
X_train_full = make_categories( pd.concat([X_train_lgbm, X_test_lgbm]))
y_train_full = np.concatenate((y_train, y_test))

prediction_model = lgbm.LGBMClassifier()
prediction_model.fit(
    X_train_full, 
    y_train_full, 
    categorical_feature=cat_features
)

In [None]:
# save prediction model
# datetime.utcfromtimestamp() is deprecated in recent Python versions; request
# the current UTC time directly (same 'YYYY_MM_DD_HH_MM' string).
ts = datetime.datetime.now(datetime.timezone.utc).strftime('%Y_%m_%d_%H_%M')
joblib.dump(prediction_model, f'predictions/lgbm_model_prediction_{ts}.pkl')

In [None]:
class GroupMatchesSimulator():
    """
    Simulates the 2022 World Cup matches with a trained model.

    Holds the fixtures frame, the model, the feature columns fed to the model
    and the name of the column that receives the predictions.
    """
    def __init__(self, df, model, features, target_name='home_team_result'):
        self.df, self.model = df, model
        self.features, self.target_name = features, target_name

    def run_simulation(self):
        """Predict every fixture and store the result in ``self.df``.

        The prediction is written into the ``target_name`` column of the
        frame that was passed to the constructor (that frame is modified).
        """
        # lgbm needs 'objects' string to be 'category'
        model_input = make_categories(self.df[self.features])
        self.df[self.target_name] = self.model.predict(model_input)

In [None]:
simulator = GroupMatchesSimulator(df_prediction, prediction_model, features=X_train_full.columns)

In [None]:
simulator.run_simulation()

In [None]:
simulator.df

## Simular o número de gols na Copa 2022

In [None]:
from empiricaldist import Pmf
from scipy.stats import poisson

def make_poisson_pmf(lam, qs):
    """Build a normalized Pmf of a Poisson(``lam``) evaluated at ``qs``."""
    probabilities = poisson.pmf(qs, lam)
    result = Pmf(probabilities, qs)
    # Renormalize because only the quantities in ``qs`` are represented.
    result.normalize()
    return result

In [None]:
import numpy as np

lam = 1.4
goals = np.arange(10)
pmf_goals = make_poisson_pmf(lam, goals)

pmf_goals.bar(label=r'Poisson distribution with $\lambda=1.4$')

In [None]:
X['home_team_score'].head().to_list()

In [None]:
def estimate_goals(X, teams):
    """
    Build a Poisson goal distribution for each team.

    For every team, lambda is the mean of the goals it scored in ``X`` (home
    goals when it was the home team plus away goals when it was the away
    team). Returns ``{team: (lambda, pmf over 0..9 goals)}`` and prints each
    team's lambda.
    """
    dists = {}
    for team in teams:
        scored_home = X.loc[X.home_team == team, 'home_team_score'].to_list()
        scored_away = X.loc[X.away_team == team, 'away_team_score'].to_list()
        lam = np.array(scored_home + scored_away).mean()
        print(f"Team {team} lambda={lam}")

        dists[team] = (lam, make_poisson_pmf(lam, np.arange(10)))

    return dists

def plot_poisson_dist(dists, team):
    """Draw the goal distribution of ``team`` as a bar chart.

    ``dists`` maps a team name to ``(lambda, pmf)`` as returned by
    ``estimate_goals``.
    """
    lam, pmf_goals = dists[team]
    pmf_goals.bar()
    # Raw f-string: '\l' is not a valid escape sequence in a normal string and
    # triggers a warning on recent Python versions. The resulting text is the
    # same.
    plt.title(rf"Poisson distribution for team '{team}' and $\lambda={round(lam,2)}$")
    
def predict_goals(dists, team):
    """Return the most likely number of goals for ``team``.

    ``dists`` maps a team name to ``(lambda, pmf)``. The result is the
    quantity (index label) with the highest probability, so it stays correct
    even if the distribution's quantities do not start at 0 — ``np.argmax``
    would return the position instead.
    """
    _, pmf_goals = dists[team]
    return pmf_goals.idxmax()

In [None]:
possion_dists = estimate_goals(X['2005-01-01':], players2022)

In [None]:
plot_poisson_dist(possion_dists, 'Iran')

In [None]:
plot_poisson_dist(possion_dists, 'England')

In [None]:
# likely goals 
print(predict_goals(possion_dists, 'Brazil'))
print(predict_goals(possion_dists, 'England'))
print(predict_goals(possion_dists, 'Senegal'))

### Adicionar número de gols nas previsões

In [None]:
def print_placar_final():
    pass

In [None]:
df_prediction[['home_team', 'away_team', 'home_team_result']]

In [None]:
def add_goals_to_predictions(df, poisson_dists):
    """Add the most likely goal counts of both teams to the predictions.

    Parameters
    ----------
    df : frame with ``home_team``, ``away_team`` and ``home_team_result``.
    poisson_dists : ``{team: (lambda, pmf)}`` as returned by ``estimate_goals``.

    Returns
    -------
    A new frame with those three columns plus:
    ``home_score`` / ``away_score`` (from ``predict_goals``),
    ``home_results_by_goals`` (sign of the goal difference) and
    ``are_both_preds_equal`` (whether that sign agrees with
    ``home_team_result``).
    """
    # Use the arguments rather than notebook globals, and copy the selection
    # so the new columns are not written into a slice of the caller's frame.
    df_goals = df[['home_team', 'away_team', 'home_team_result']].copy()

    df_goals['home_score'] = df_goals['home_team'].apply(lambda team: predict_goals(poisson_dists, team))
    df_goals['away_score'] = df_goals['away_team'].apply(lambda team: predict_goals(poisson_dists, team))
    df_goals['home_results_by_goals'] = np.sign((df_goals['home_score'] - df_goals['away_score']))
    df_goals['are_both_preds_equal'] = df_goals['home_results_by_goals'] == df_goals['home_team_result']

    return df_goals

In [None]:
df_prediction_with_goals = add_goals_to_predictions(df_prediction, possion_dists)

df_prediction_with_goals

In [None]:
df_prediction_with_goals[~df_prediction_with_goals.are_both_preds_equal]

## Preparar submissão

In [None]:
!head -n 10 ../../../sigmageek/submission.csv

In [None]:
!head -n 10 ../../../sigmageek/results.csv

In [None]:
with open('../../../sigmageek/results.csv', 'r') as f:
    lines = f.readlines()

# Each line is split into runs of word characters; the first and third runs
# are taken as the two teams of the match (presumably the second run is a
# separator or score — confirm against the file).
games_order = []
for line in lines:
    # Raw string avoids the invalid '\w' escape; tokenize once per line.
    tokens = re.findall(r'[\w]+', line)
    if not tokens:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        continue
    games_order.append((tokens[0], tokens[2]))

games_order[:10]

In [None]:
games_order

In [None]:
def merge_names2(df):
    """Rename three alternative team spellings in the ``Team`` column.

    The frame is modified in place and also returned. Names that are not in
    the alias table are kept unchanged.
    """
    aliases = {
        'IR Iran': 'Iran',
        'USA': 'United States',
        'Korea Republic': 'South Korea'
    }

    def canonical(name):
        return aliases.get(name, name)

    df['Team'] = df['Team'].apply(canonical)
    return df

# Lookup table: team name (after alias normalization) -> value of 'code'.
mapper_siglas = (
    pd.read_csv('../../datasets/fifa_ranking_before_wc_preprocessed.csv')
    .pipe(merge_names2)
    .set_index('Team')
    ['code']
    .to_dict()
)

def make_final_score(row, treat_special_case=True):
    """Turn one prediction row into a final score string ``'<home>x<away>'``.

    ``row`` must provide ``home_code``, ``away_code``, ``home_score``,
    ``away_score``, ``home_team_result`` (classifier outcome: 1, 0, -1),
    ``home_results_by_goals`` (outcome implied by the goal counts) and
    ``are_both_preds_equal``.

    Rules:
      * both predictions agree -> the predicted goals are returned as is;
      * the classifier says draw -> both teams get the smaller goal count;
      * the classifier names a winner but the goals say draw -> at random,
        either the winner gets one more goal or the loser gets one less.
        The loser's count is never taken below zero: when the loser has no
        goals, the winner always gets the extra goal.

    When ``treat_special_case`` is true and the row is ('QAT', 'NED'), the
    sides are swapped first (scores swapped, both outcomes negated) and a
    message is printed. ``row`` is modified in that case.

    The random choice uses NumPy's global random state; seed it beforehand
    for reproducible output.

    Raises
    ------
    ValueError
        If the classifier and the goals name different winners.
    """
    # Special case: fixture (QAT, NED) on 2022-11-29 should be (NED, QAT).
    if treat_special_case and row.home_code == 'QAT' and row.away_code == 'NED':
        print("Treating special case.")
        row.home_score, row.away_score = row.away_score, row.home_score
        # are_both_preds_equal is unaffected; both outcomes flip sign.
        row.home_team_result = - row.home_team_result
        row.home_results_by_goals = - row.home_results_by_goals

    home_goals = int(row.home_score)
    away_goals = int(row.away_score)

    # Both predictions agree: report the predicted goals.
    if row.are_both_preds_equal:
        return str(home_goals) + 'x' + str(away_goals)

    # From here on the classifier's outcome is kept and the goals adjusted.
    if row.home_team_result == 0:
        smallest = min(home_goals, away_goals)
        return str(smallest) + 'x' + str(smallest)

    if row.home_results_by_goals != 0:
        # goals said some winner different from lgbm
        raise ValueError("Caso não tratato.")

    # Classifier names a winner, goals say draw.
    home_wins = row.home_team_result == 1
    loser_goals = away_goals if home_wins else home_goals
    # Always draw the random number so the random stream is consumed the same
    # way regardless of the scores.
    give_to_winner = np.random.random() <= 0.5
    if give_to_winner or loser_goals == 0:
        # Winner gets one more goal (also the fallback when the loser has no
        # goal to lose, which would otherwise yield a negative score).
        if home_wins:
            home_goals += 1
        else:
            away_goals += 1
    else:
        # Loser gets one less goal.
        if home_wins:
            away_goals -= 1
        else:
            home_goals -= 1
    return str(home_goals) + 'x' + str(away_goals)
            

def prepare_submission(df, treat_special_case=True):
    """Order the predictions as in ``games_order`` and add the final score.

    Works on a copy of ``df``. Adds ``home_code`` / ``away_code`` (looked up
    in ``mapper_siglas``), indexes the rows by the ``(home_code, away_code)``
    key — with ('QAT', 'NED') keyed as ('NED', 'QAT') — reorders them to
    follow ``games_order`` and finally computes ``final_score`` with
    ``make_final_score``.
    """
    def match_key(row):
        pair = (row.home_code, row.away_code)
        return ('NED', 'QAT') if pair == ('QAT', 'NED') else pair

    df_submission = df.copy()
    for side in ('home', 'away'):
        df_submission[f'{side}_code'] = df_submission[f'{side}_team'].apply(lambda name: mapper_siglas[name])

    df_submission['key'] = df_submission.apply(match_key, axis=1)
    df_submission = df_submission.reset_index().set_index('key').reindex(games_order)

    df_submission['final_score'] = df_submission.apply(
        lambda row: make_final_score(row, treat_special_case),
        axis=1,
    )

    return df_submission

In [None]:
df_submission = prepare_submission(df_prediction_with_goals)

df_submission

In [None]:
df_submission.drop(columns=['home_code', 'away_code'])

In [None]:
df_submission[['final_score']].to_csv(f"predictions/submission.csv", index=False, header=False)

In [None]:
df_submission.shape

In [None]:
!pwd

In [None]:
!ls predictions

In [None]:
!head -n 20 predictions/submission.csv

### Final Results Group Matches

Vamos gerar os mesmos resultados, mas sem tratar um caso especial que só foi feito para a competição da sigmageek.

In [None]:
df_groupmatches_results = prepare_submission(df_prediction_with_goals, treat_special_case=False)

df_groupmatches_results['home_final_score'] = df_groupmatches_results['final_score'].apply(lambda x: x.split('x')[0])
df_groupmatches_results['away_final_score'] = df_groupmatches_results['final_score'].apply(lambda x: x.split('x')[1])

df_groupmatches_results

In [None]:
df_groupmatches_results[['date', 'home_team', 'away_team', 'home_team_result', 'home_score','away_score', 'home_results_by_goals', 'are_both_preds_equal', 'final_score']]

In [None]:
df_groupmatches_results.to_csv(f"predictions/submission_full_columns.csv")

In [None]:
def get_groups(df):
    """Derive the group composition from the match keys in ``df.index``.

    Every group starts with one hard-coded team. Each index entry is a
    ``(team, team)`` pair; any team paired with a group's starting team is
    added to that group.

    Returns ``(groups, team2group)``: group letter -> list of team codes, and
    team code -> group letter.
    """
    seeds = {
        'A': 'QAT',
        'B': 'ENG',
        'C': 'ARG',
        'D': 'FRA',
        'E': 'ESP',
        'F': 'BEL',
        'G': 'BRA',
        'H': 'POR'
    }
    groups = {letter: [seed] for letter, seed in seeds.items()}
    team2group = {seed: letter for letter, seed in seeds.items()}
    matches = list(df.index)

    for letter, seed in seeds.items():
        members = groups[letter]
        for match in matches:
            first, second = match[0], match[1]
            if first == seed:
                opponent = second
            elif second == seed:
                opponent = first
            else:
                continue
            if opponent not in members:
                members.append(opponent)
                team2group[opponent] = letter

    return groups, team2group

            
group2teams, teams2groups = get_groups(df_groupmatches_results)
group2teams, teams2groups

In [None]:
def final_groupmatches_results(results, players2022, teams2groups, filter_winners=False):
    """Compute group-stage points and the in-group ranking of every team.

    Parameters
    ----------
    results : frame with ``home_team``, ``away_team``, ``home_code`` and
        ``home_team_result`` (positive = home win, 0 = draw, negative = away win).
    players2022 : iterable of team names.
    teams2groups : mapping from team code to group.
    filter_winners : when true, only rows with ``ranking <= 2`` are returned.

    Returns
    -------
    Frame with columns ``team``, ``points``, ``group`` and ``ranking``,
    sorted by group and descending points.
    """
    
    players_results = pd.DataFrame(data={'team': [], 'points': [], 'group': []})
    for player in players2022:
        # Team code taken from the team's first home row; this raises
        # IndexError if the team has no row as home team.
        acr = results[results.home_team == player].head(1)['home_code'].values[0] 
        # 3 points per win and 1 per draw, counted from the home side...
        points_home = (
            (results[results.home_team == player].home_team_result > 0) * 3 
            + (results[results.home_team == player].home_team_result == 0) * 1
        ).sum()
        # ...and from the away side (a negative home result is an away win).
        points_away = (
            (results[results.away_team == player].home_team_result < 0) * 3 
            + (results[results.away_team == player].home_team_result == 0) * 1
        ).sum()
        points = points_home + points_away
        df = pd.DataFrame(data={'team': [player], 'points': [points], 'group': teams2groups[acr]})
        players_results = (
            pd.concat([players_results, df], axis=0)
            .sort_values(by=['group', 'points'], ascending=[True, False])
        )
    # NOTE(review): rank() is called with its default tie handling and the
    # result is truncated to int, so teams tied on points can share a rank and
    # more than two teams of a group may satisfy `ranking <= 2` — confirm
    # whether a tie-break is needed.
    players_results['ranking'] = players_results.groupby("group")['points'].rank(ascending=False).astype('int')
    if filter_winners:
        players_results = players_results[players_results.ranking <= 2]
    return players_results

In [None]:
df_final_groupmatches_results = final_groupmatches_results(df_groupmatches_results, players2022, teams2groups)
df_final_groupmatches_results

In [None]:
final_groupmatches_results(df_groupmatches_results, players2022, teams2groups, filter_winners=True)

In [None]:
df_final_groupmatches_results.to_csv("predictions/final_groupmatches_results.csv")

In [None]:
!ls predictions/

In [None]:
def see_team_results(df_groupmatches_results, team):
    """Return the rows in which ``team`` plays, as home or as away team."""
    matches = df_groupmatches_results
    involves_team = matches.home_team.eq(team) | matches.away_team.eq(team)
    return matches[involves_team]

see_team_results(df_groupmatches_results, 'Brazil')

## Simular Eliminatória

In [None]:
# Teams that pass the group stage (rows with ranking <= 2 in their group).
teams_qualified = final_groupmatches_results(
    df_groupmatches_results,
    players2022,
    teams2groups,
    filter_winners=True
).team.to_list()

# NOTE(review): hard-coded manual removal; presumably resolves a points tie
# that let an extra team through the `ranking <= 2` filter — confirm. This
# raises ValueError if "Mexico" is not in the list.
teams_qualified.remove("Mexico")

print("number teams: ", len(teams_qualified))

teams_qualified

In [None]:
X_mata_mata = 