In [160]:
import pandas as pd
import numpy as np

In [161]:
df = pd.read_parquet("../data/matches.parquet")

In [162]:
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   round             2020 non-null   object        
 1   match_date        2016 non-null   datetime64[ns]
 2   home_team         2020 non-null   object        
 3   guest_team        2020 non-null   object        
 4   stadium           2020 non-null   object        
 5   date_added        2020 non-null   datetime64[ns]
 6   score_home_team   2016 non-null   Int64         
 7   score_guest_team  2016 non-null   Int64         
 8   winning_team      2016 non-null   object        
dtypes: Int64(2), datetime64[ns](2), object(5)
memory usage: 146.1+ KB
      round          match_date  home_team     guest_team         stadium  \
0  Rodada 1 2020-08-08 19:00:00  Fortaleza   Athletico-PR   Castelão (CE)   
1  Rodada 1 2020-08-08 19:30:00   Coritiba  Internacional   Couto Pereira  

In [163]:
df = df.dropna()

Historical performance: Rolling average of points per game for each team over last N matches (e.g., last 5-10 games)

In [164]:
import pandas as pd

# 1. Create a long-format performance DataFrame upfront
def create_performance_df(match_df):
    # Home team performances
    home = match_df[['match_date', 'home_team', 'score_home_team', 'score_guest_team', 'winning_team']].copy()
    home['team'] = home['home_team']
    home['opponent'] = home['score_guest_team']
    home['is_home'] = True
    home['points'] = home['winning_team'].map({'home': 3, 'draw': 1, 'guest': 0})
    
    # Away team performances
    away = match_df[['match_date', 'guest_team', 'score_home_team', 'score_guest_team', 'winning_team']].copy()
    away['team'] = away['guest_team']
    away['opponent'] = away['score_home_team']
    away['is_home'] = False
    away['points'] = away['winning_team'].map({'home': 0, 'draw': 1, 'guest': 3})
    
    # Combine and sort
    perf_df = pd.concat([home, away], ignore_index=True)
    perf_df = perf_df.sort_values(['team', 'match_date'])
    
    # Calculate match sequence number for each team
    perf_df['team_match_seq'] = perf_df.groupby('team').cumcount() + 1
    
    return perf_df[['match_date', 'team', 'opponent', 'is_home', 'points', 'team_match_seq']]

# 2. Create feature calculation functions
def calculate_features(perf_df, window=5):
    # Ensure chronological order
    perf_df = perf_df.sort_values(['team', 'match_date'])
    
    # Rolling features
    perf_df['rolling_points_avg'] = perf_df.groupby('team')['points'].transform(
        lambda x: x.rolling(window, min_periods=1).mean().shift(1))
    
    perf_df['rolling_points_std'] = perf_df.groupby('team')['points'].transform(
        lambda x: x.rolling(window, min_periods=1).std().shift(1))
    
    # Can add more features here (form, streaks, etc.)
    return perf_df

# 3. Create match-level features by joining performance data
def create_match_features(match_df, perf_df):
    # Home team features
    match_df = match_df.merge(
        perf_df.add_prefix('home_'),
        left_on=['match_date', 'home_team'],
        right_on=['home_match_date', 'home_team'],
        how='left'
    )
    
    # Away team features
    match_df = match_df.merge(
        perf_df.add_prefix('away_'),
        left_on=['match_date', 'guest_team'],
        right_on=['away_match_date', 'away_team'],
        how='left'
    )
    
    return match_df

# Main workflow
# Assuming df is your original DataFrame
perf_df = create_performance_df(df)
featured_perf_df = calculate_features(perf_df)
final_df = create_match_features(df, featured_perf_df)

Last 5 games win/draw/loss ratio for both teams

In [165]:
import pandas as pd
from collections import defaultdict

# Assuming final_df is your dataframe
# First sort by match date to ensure chronological order
final_df = final_df.sort_values('match_date')

# Create dictionaries to store team histories
team_history = defaultdict(list)

# Function to calculate last 5 matches performance
def get_last_5_performance(team, date):
    team_matches = team_history[team]
    last_5 = [m for m in team_matches if m['date'] < date][-5:]
    
    if not last_5:
        return {'win_ratio': None, 'draw_ratio': None, 'loss_ratio': None}
    
    wins = 0
    draws = 0
    losses = 0
    
    for match in last_5:
        if match['result'] == 'win':
            wins += 1
        elif match['result'] == 'draw':
            draws += 1
        else:
            losses += 1
    
    total = wins + draws + losses
    return {
        'win_ratio': wins / total,
        'draw_ratio': draws / total,
        'loss_ratio': losses / total
    }

# First pass to populate team histories
for _, row in final_df.iterrows():
    if pd.isna(row['match_date']) or pd.isna(row['winning_team']):
        continue
    
    home_team = row['home_team']
    guest_team = row['guest_team']
    match_date = row['match_date']
    
    # Determine results for each team
    if row['winning_team'] == 'home':
        home_result = 'win'
        guest_result = 'loss'
    elif row['winning_team'] == 'guest':
        home_result = 'loss'
        guest_result = 'win'
    else:  # draw
        home_result = 'draw'
        guest_result = 'draw'
    
    # Add to team histories
    team_history[home_team].append({
        'date': match_date,
        'result': home_result,
        'opponent': guest_team,
        'home_away': 'home'
    })
    
    team_history[guest_team].append({
        'date': match_date,
        'result': guest_result,
        'opponent': home_team,
        'home_away': 'away'
    })

# Second pass to calculate last 5 performance
home_win_ratios = []
home_draw_ratios = []
home_loss_ratios = []
guest_win_ratios = []
guest_draw_ratios = []
guest_loss_ratios = []

for _, row in final_df.iterrows():
    if pd.isna(row['match_date']):
        home_win_ratios.append(None)
        home_draw_ratios.append(None)
        home_loss_ratios.append(None)
        guest_win_ratios.append(None)
        guest_draw_ratios.append(None)
        guest_loss_ratios.append(None)
        continue
    
    home_perf = get_last_5_performance(row['home_team'], row['match_date'])
    guest_perf = get_last_5_performance(row['guest_team'], row['match_date'])
    
    home_win_ratios.append(home_perf['win_ratio'])
    home_draw_ratios.append(home_perf['draw_ratio'])
    home_loss_ratios.append(home_perf['loss_ratio'])
    
    guest_win_ratios.append(guest_perf['win_ratio'])
    guest_draw_ratios.append(guest_perf['draw_ratio'])
    guest_loss_ratios.append(guest_perf['loss_ratio'])

# Add new columns to dataframe
final_df['home_win_ratio_last5'] = home_win_ratios
final_df['home_draw_ratio_last5'] = home_draw_ratios
final_df['home_loss_ratio_last5'] = home_loss_ratios
final_df['guest_win_ratio_last5'] = guest_win_ratios
final_df['guest_draw_ratio_last5'] = guest_draw_ratios
final_df['guest_loss_ratio_last5'] = guest_loss_ratios

In [166]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2016 entries, 0 to 2015
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   round                    2016 non-null   object        
 1   match_date               2016 non-null   datetime64[ns]
 2   home_team                2016 non-null   object        
 3   guest_team               2016 non-null   object        
 4   stadium                  2016 non-null   object        
 5   date_added               2016 non-null   datetime64[ns]
 6   score_home_team          2016 non-null   Int64         
 7   score_guest_team         2016 non-null   Int64         
 8   winning_team             2016 non-null   object        
 9   home_match_date          2016 non-null   datetime64[ns]
 10  home_opponent            2016 non-null   Int64         
 11  home_is_home             2016 non-null   bool          
 12  home_points              2016 non-null 

In [167]:
# filter flamengo in home or guest team
final_df[(final_df['home_team'] == 'Flamengo') | (final_df['guest_team'] == 'Flamengo')][['home_team','guest_team', 'winning_team', 'home_win_ratio_last5', 'home_draw_ratio_last5', 
          'home_loss_ratio_last5', 'guest_win_ratio_last5', 'guest_draw_ratio_last5', 
          'guest_loss_ratio_last5']].head(10)

Unnamed: 0,home_team,guest_team,winning_team,home_win_ratio_last5,home_draw_ratio_last5,home_loss_ratio_last5,guest_win_ratio_last5,guest_draw_ratio_last5,guest_loss_ratio_last5
3,Flamengo,Atlético-MG,guest,,,,,,
14,Atlético-GO,Flamengo,home,,,,0.0,0.0,1.0
21,Coritiba,Flamengo,guest,0.0,0.0,1.0,0.0,0.0,1.0
30,Flamengo,Grêmio,draw,0.333333,0.0,0.666667,0.333333,0.666667,0.0
43,Flamengo,Botafogo,draw,0.25,0.25,0.5,0.333333,0.666667,0.0
55,Santos,Flamengo,guest,0.4,0.2,0.4,0.2,0.4,0.4
65,Bahia,Flamengo,guest,0.4,0.4,0.2,0.4,0.4,0.2
70,Flamengo,Fortaleza,home,0.6,0.4,0.0,0.4,0.4,0.2
84,Fluminense,Flamengo,guest,0.4,0.2,0.4,0.6,0.4,0.0
94,Ceará,Flamengo,home,0.6,0.0,0.4,0.8,0.2,0.0


In [168]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# Load data (assuming `final_df` from previous steps)
df = final_df.copy()

# Encode target variable (winning_team: home/draw/guest)
le = LabelEncoder()
df['target'] = le.fit_transform(df['winning_team'])  # 0=home, 1=draw, 2=guest

# Select features (modify as needed)
features = [
    'home_rolling_points_avg', 'away_rolling_points_avg',

    'home_win_ratio_last5', 'home_draw_ratio_last5', 'home_loss_ratio_last5', 'guest_win_ratio_last5', 'guest_draw_ratio_last5', 'guest_loss_ratio_last5'
]

X = df[features]
y = df['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train model
model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42
)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Decode labels back to original values
class_names = le.inverse_transform([0, 1, 2])  # ['home', 'draw', 'guest']

print(classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    digits=3
))

# Get feature importances
importance = model.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': importance
}).sort_values('Importance', ascending=False)

print(feature_importance)

              precision    recall  f1-score   support

        draw      0.429     0.159     0.232       113
       guest      0.241     0.070     0.109       100
        home      0.495     0.864     0.630       191

    accuracy                          0.470       404
   macro avg      0.388     0.364     0.324       404
weighted avg      0.414     0.470     0.390       404

                   Feature  Importance
2     home_win_ratio_last5    0.205115
7   guest_loss_ratio_last5    0.129558
0  home_rolling_points_avg    0.127898
4    home_loss_ratio_last5    0.125052
6   guest_draw_ratio_last5    0.117788
1  away_rolling_points_avg    0.108549
5    guest_win_ratio_last5    0.098824
3    home_draw_ratio_last5    0.087215
