In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet("../data/matches.parquet")

In [3]:
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2150 entries, 0 to 2149
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   round             2150 non-null   object        
 1   match_date        2144 non-null   datetime64[ns]
 2   home_team         2150 non-null   object        
 3   guest_team        2150 non-null   object        
 4   stadium           2150 non-null   object        
 5   date_added        2150 non-null   datetime64[ns]
 6   score_home_team   2144 non-null   Int64         
 7   score_guest_team  2144 non-null   Int64         
 8   winning_team      2144 non-null   string        
dtypes: Int64(2), datetime64[ns](2), object(4), string(1)
memory usage: 155.5+ KB
      round          match_date     home_team  guest_team  \
0  Rodada 1 2023-04-15 16:00:00     Palmeiras      Cuiabá   
1  Rodada 1 2023-04-15 16:00:00    América-MG  Fluminense   
2  Rodada 1 2023-04-15 18:30:00    

In [4]:
df = df.dropna()

Historical performance: Rolling average of points per game for each team over last N matches (e.g., last 5-10 games)

In [5]:
import pandas as pd

# 1. Create a long-format performance DataFrame upfront
def create_performance_df(match_df):
    # Home team performances
    home = match_df[['match_date', 'home_team', 'score_home_team', 'score_guest_team', 'winning_team']].copy()
    home['team'] = home['home_team']
    home['opponent'] = home['score_guest_team']
    home['is_home'] = True
    home['points'] = home['winning_team'].map({'home': 3, 'draw': 1, 'guest': 0})
    
    # Away team performances
    away = match_df[['match_date', 'guest_team', 'score_home_team', 'score_guest_team', 'winning_team']].copy()
    away['team'] = away['guest_team']
    away['opponent'] = away['score_home_team']
    away['is_home'] = False
    away['points'] = away['winning_team'].map({'home': 0, 'draw': 1, 'guest': 3})
    
    # Combine and sort
    perf_df = pd.concat([home, away], ignore_index=True)
    perf_df = perf_df.sort_values(['team', 'match_date'])
    
    # Calculate match sequence number for each team
    perf_df['team_match_seq'] = perf_df.groupby('team').cumcount() + 1
    
    return perf_df[['match_date', 'team', 'opponent', 'is_home', 'points', 'team_match_seq']]

# 2. Create feature calculation functions
def calculate_features(perf_df, window=5):
    # Ensure chronological order
    perf_df = perf_df.sort_values(['team', 'match_date'])
    
    # Rolling features
    perf_df['rolling_points_avg'] = perf_df.groupby('team')['points'].transform(
        lambda x: x.rolling(window, min_periods=1).mean().shift(1))
    
    perf_df['rolling_points_std'] = perf_df.groupby('team')['points'].transform(
        lambda x: x.rolling(window, min_periods=1).std().shift(1))
    
    # Can add more features here (form, streaks, etc.)
    return perf_df

# 3. Create match-level features by joining performance data
def create_match_features(match_df, perf_df):
    # Home team features
    match_df = match_df.merge(
        perf_df.add_prefix('home_'),
        left_on=['match_date', 'home_team'],
        right_on=['home_match_date', 'home_team'],
        how='left'
    )
    
    # Away team features
    match_df = match_df.merge(
        perf_df.add_prefix('away_'),
        left_on=['match_date', 'guest_team'],
        right_on=['away_match_date', 'away_team'],
        how='left'
    )
    
    return match_df

# Main workflow
# Assuming df is your original DataFrame
perf_df = create_performance_df(df)
featured_perf_df = calculate_features(perf_df)
final_df = create_match_features(df, featured_perf_df)

Last 5 games win/draw/loss ratio for both teams

In [6]:
import pandas as pd
from collections import defaultdict

# Assuming final_df is your dataframe
# First sort by match date to ensure chronological order
final_df = final_df.sort_values('match_date')

# Create dictionaries to store team histories
team_history = defaultdict(list)

# Function to calculate last 5 matches performance
def get_last_5_performance(team, date):
    team_matches = team_history[team]
    last_5 = [m for m in team_matches if m['date'] < date][-5:]
    
    if not last_5:
        return {'win_ratio': None, 'draw_ratio': None, 'loss_ratio': None}
    
    wins = 0
    draws = 0
    losses = 0
    
    for match in last_5:
        if match['result'] == 'win':
            wins += 1
        elif match['result'] == 'draw':
            draws += 1
        else:
            losses += 1
    
    total = wins + draws + losses
    return {
        'win_ratio': wins / total,
        'draw_ratio': draws / total,
        'loss_ratio': losses / total
    }

# First pass to populate team histories
for _, row in final_df.iterrows():
    if pd.isna(row['match_date']) or pd.isna(row['winning_team']):
        continue
    
    home_team = row['home_team']
    guest_team = row['guest_team']
    match_date = row['match_date']
    
    # Determine results for each team
    if row['winning_team'] == 'home':
        home_result = 'win'
        guest_result = 'loss'
    elif row['winning_team'] == 'guest':
        home_result = 'loss'
        guest_result = 'win'
    else:  # draw
        home_result = 'draw'
        guest_result = 'draw'
    
    # Add to team histories
    team_history[home_team].append({
        'date': match_date,
        'result': home_result,
        'opponent': guest_team,
        'home_away': 'home'
    })
    
    team_history[guest_team].append({
        'date': match_date,
        'result': guest_result,
        'opponent': home_team,
        'home_away': 'away'
    })

# Second pass to calculate last 5 performance
home_win_ratios = []
home_draw_ratios = []
home_loss_ratios = []
guest_win_ratios = []
guest_draw_ratios = []
guest_loss_ratios = []

for _, row in final_df.iterrows():
    if pd.isna(row['match_date']):
        home_win_ratios.append(None)
        home_draw_ratios.append(None)
        home_loss_ratios.append(None)
        guest_win_ratios.append(None)
        guest_draw_ratios.append(None)
        guest_loss_ratios.append(None)
        continue
    
    home_perf = get_last_5_performance(row['home_team'], row['match_date'])
    guest_perf = get_last_5_performance(row['guest_team'], row['match_date'])
    
    home_win_ratios.append(home_perf['win_ratio'])
    home_draw_ratios.append(home_perf['draw_ratio'])
    home_loss_ratios.append(home_perf['loss_ratio'])
    
    guest_win_ratios.append(guest_perf['win_ratio'])
    guest_draw_ratios.append(guest_perf['draw_ratio'])
    guest_loss_ratios.append(guest_perf['loss_ratio'])

# Add new columns to dataframe
final_df['home_win_ratio_last5'] = home_win_ratios
final_df['home_draw_ratio_last5'] = home_draw_ratios
final_df['home_loss_ratio_last5'] = home_loss_ratios
final_df['guest_win_ratio_last5'] = guest_win_ratios
final_df['guest_draw_ratio_last5'] = guest_draw_ratios
final_df['guest_loss_ratio_last5'] = guest_loss_ratios

Goals scored/conceded in last N matches

In [7]:
# Assuming your dataframe is named 'df'
# First, sort by match_date to ensure chronological order
final_df = final_df.sort_values('match_date').reset_index(drop=True)

# Define the number of previous matches to consider
N = 5  # You can adjust this based on your needs

# Create a long format dataframe with all team performances
home_games = final_df[['match_date', 'home_team', 'score_home_team', 'score_guest_team']].copy()
home_games['is_home'] = True
home_games = home_games.rename(columns={
    'home_team': 'team',
    'score_home_team': 'goals_scored',
    'score_guest_team': 'goals_conceded'
})

away_games = final_df[['match_date', 'guest_team', 'score_guest_team', 'score_home_team']].copy()
away_games['is_home'] = False
away_games = away_games.rename(columns={
    'guest_team': 'team',
    'score_guest_team': 'goals_scored',
    'score_home_team': 'goals_conceded'
})

all_games = pd.concat([home_games, away_games]).sort_values(['team', 'match_date'])

# Calculate rolling averages
for team in all_games['team'].unique():
    team_mask = all_games['team'] == team
    all_games.loc[team_mask, 'goals_scored_avg'] = (
        all_games.loc[team_mask, 'goals_scored']
        .rolling(N, min_periods=1)
        .mean()
        .shift(1)  # Use previous matches only
    )
    all_games.loc[team_mask, 'goals_conceded_avg'] = (
        all_games.loc[team_mask, 'goals_conceded']
        .rolling(N, min_periods=1)
        .mean()
        .shift(1)
    )

# Merge back to original dataframe
# For home teams
final_df = final_df.merge(
    all_games[all_games['is_home']][['match_date', 'team', 'goals_scored_avg', 'goals_conceded_avg']],
    left_on=['match_date', 'home_team'],
    right_on=['match_date', 'team'],
    how='left'
).rename(columns={
    'goals_scored_avg': f'home_team_goals_scored_last_{N}',
    'goals_conceded_avg': f'home_team_goals_conceded_last_{N}'
}).drop(columns=['team'])

# For away teams
final_df = final_df.merge(
    all_games[~all_games['is_home']][['match_date', 'team', 'goals_scored_avg', 'goals_conceded_avg']],
    left_on=['match_date', 'guest_team'],
    right_on=['match_date', 'team'],
    how='left'
).rename(columns={
    'goals_scored_avg': f'guest_team_goals_scored_last_{N}',
    'goals_conceded_avg': f'guest_team_goals_conceded_last_{N}'
}).drop(columns=['team'])

Clean sheet frequency (games without conceding)

In [8]:
def calculate_cs_frequency(df, window=5):
    """
    Calculate clean sheet frequency for last N matches regardless of home/away status
    Returns two Series: (home_team_cs, guest_team_cs)
    """
    # Initialize dictionaries to track team histories
    team_history = {}
    home_cs = pd.Series(np.nan, index=df.index)
    guest_cs = pd.Series(np.nan, index=df.index)
    
    # First collect all matches for each team
    all_matches = []
    
    for idx, row in df.iterrows():
        # For home team perspective
        all_matches.append({
            'team': row['home_team'],
            'date': row['match_date'],
            'goals_conceded': row['score_guest_team'],
            'idx': idx,
            'type': 'home'
        })
        # For away team perspective
        all_matches.append({
            'team': row['guest_team'],
            'date': row['match_date'],
            'goals_conceded': row['score_home_team'],
            'idx': idx,
            'type': 'guest'
        })
    
    # Sort all matches by date
    all_matches_sorted = sorted(all_matches, key=lambda x: x['date'])
    
    # Process each team separately
    for team in df['home_team'].unique():
        team_matches = [m for m in all_matches_sorted if m['team'] == team]
        
        for i, match in enumerate(team_matches):
            prev_matches = team_matches[max(0, i-window):i]
            
            if prev_matches:
                clean_sheets = sum(1 for m in prev_matches if m['goals_conceded'] == 0)
                freq = clean_sheets / len(prev_matches)
            else:
                freq = 0
            
            # Assign to appropriate series
            if match['type'] == 'home':
                home_cs[match['idx']] = freq
            else:
                guest_cs[match['idx']] = freq
    
    return home_cs, guest_cs

# Calculate clean sheet frequencies
final_df['home_team_cs_freq_last5'], final_df['guest_team_cs_freq_last5'] = calculate_cs_frequency(final_df, window=5)

KeyboardInterrupt: 

Historical H2H record between the two teams (win %)

In [None]:
def calculate_correct_h2h_win_percentage(df, window=5):
    """
    Correctly calculates head-to-head win percentages for both teams
    Returns: home_team_h2h_win_pct, guest_team_h2h_win_pct, h2h_draw_pct
    """
    # Initialize results
    home_win_pct = pd.Series(0.0, index=df.index)
    guest_win_pct = pd.Series(0.0, index=df.index)
    draw_pct = pd.Series(0.0, index=df.index)
    
    # Dictionary to store all matches between team pairs
    h2h_history = {}
    
    # Sort dataframe by match date to process in chronological order
    df_sorted = df.sort_values('match_date').reset_index(drop=True)
    
    for idx, row in df_sorted.iterrows():
        home = row['home_team']
        guest = row['guest_team']
        
        # Create consistent team pair key (sorted alphabetically)
        team_pair = tuple(sorted((home, guest)))
        
        if team_pair not in h2h_history:
            h2h_history[team_pair] = []
        
        # Get previous matches between these teams (excluding current match)
        prev_matches = h2h_history[team_pair][-window:]
        
        # Initialize counters
        home_wins = 0
        guest_wins = 0
        draws = 0
        
        # Analyze previous matches
        for match in prev_matches:
            if match['winner'] == 'home':
                if match['home_team'] == home:
                    home_wins += 1
                else:
                    guest_wins += 1
            elif match['winner'] == 'guest':
                if match['guest_team'] == home:
                    home_wins += 1
                else:
                    guest_wins += 1
            else:
                draws += 1
        
        # Calculate percentages
        total_matches = len(prev_matches)
        if total_matches > 0:
            home_win_pct[idx] = home_wins / total_matches
            guest_win_pct[idx] = guest_wins / total_matches
            draw_pct[idx] = draws / total_matches
        
        # Store current match in history
        h2h_history[team_pair].append({
            'home_team': home,
            'guest_team': guest,
            'winner': row['winning_team']
        })
    
    # Reindex to match original dataframe order
    home_win_pct = home_win_pct.reindex(df.index)
    guest_win_pct = guest_win_pct.reindex(df.index)
    draw_pct = draw_pct.reindex(df.index)
    
    return home_win_pct, guest_win_pct, draw_pct

# Calculate the correct H2H percentages
final_df['home_team_h2h_win_pct'], final_df['guest_team_h2h_win_pct'], final_df['h2h_draw_pct'] = \
    calculate_correct_h2h_win_percentage(final_df, window=5)

Goal difference trend in H2H matches

In [None]:
def calculate_h2h_goal_diff_trend(df, window=5):
    """
    Calculate goal difference trend in last N H2H matches between teams
    Returns two Series: (home_team_h2h_gd_trend, guest_team_h2h_gd_trend)
    """
    # Initialize results
    home_gd_trend = pd.Series(0.0, index=df.index)
    guest_gd_trend = pd.Series(0.0, index=df.index)
    
    # Dictionary to store all matches between team pairs
    h2h_history = {}
    
    # Sort dataframe by match date to process in chronological order
    df_sorted = df.sort_values('match_date').reset_index(drop=True)
    
    for idx, row in df_sorted.iterrows():
        home = row['home_team']
        guest = row['guest_team']
        
        # Create consistent team pair key (sorted alphabetically)
        team_pair = tuple(sorted((home, guest)))
        
        if team_pair not in h2h_history:
            h2h_history[team_pair] = []
        
        # Get previous matches between these teams (excluding current match)
        prev_matches = h2h_history[team_pair][-window:]
        
        # Initialize goal difference accumulators
        home_gd = 0
        guest_gd = 0
        
        # Analyze previous matches
        for match in prev_matches:
            if match['home_team'] == home:
                # Current home team was home in this historical match
                home_gd += match['home_score'] - match['guest_score']
                guest_gd += match['guest_score'] - match['home_score']
            else:
                # Current home team was away in this historical match
                home_gd += match['guest_score'] - match['home_score']
                guest_gd += match['home_score'] - match['guest_score']
        
        # Calculate average goal difference
        if prev_matches:
            home_gd_trend[idx] = home_gd / len(prev_matches)
            guest_gd_trend[idx] = guest_gd / len(prev_matches)
        
        # Store current match in history
        h2h_history[team_pair].append({
            'home_team': home,
            'guest_team': guest,
            'home_score': row['score_home_team'],
            'guest_score': row['score_guest_team']
        })
    
    # Reindex to match original dataframe order
    home_gd_trend = home_gd_trend.reindex(df.index)
    guest_gd_trend = guest_gd_trend.reindex(df.index)
    
    return home_gd_trend, guest_gd_trend

# Calculate H2H goal difference trends
final_df['home_team_h2h_gd_trend'], final_df['guest_team_h2h_gd_trend'] = \
    calculate_h2h_goal_diff_trend(final_df, window=5)

In [None]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   round                             2016 non-null   object        
 1   match_date                        2016 non-null   datetime64[ns]
 2   home_team                         2016 non-null   object        
 3   guest_team                        2016 non-null   object        
 4   stadium                           2016 non-null   object        
 5   date_added                        2016 non-null   datetime64[ns]
 6   score_home_team                   2016 non-null   Int64         
 7   score_guest_team                  2016 non-null   Int64         
 8   winning_team                      2016 non-null   object        
 9   home_match_date                   2016 non-null   datetime64[ns]
 10  home_opponent                     2016 non-null 

In [None]:
final_df['home_adv_pts_diff'] = final_df['home_rolling_points_avg'] - final_df['away_rolling_points_avg']
final_df['home_goal_dominance_last5'] = final_df['home_team_goals_scored_last_5'] - final_df['home_team_goals_conceded_last_5']
final_df['away_goal_dominance_last5'] = final_df['guest_team_goals_scored_last_5'] - final_df['guest_team_goals_conceded_last_5']
final_df['home_win_power'] = final_df['home_win_ratio_last5'] * final_df['home_rolling_points_avg']
final_df['away_win_power'] = final_df['guest_win_ratio_last5'] * final_df['away_rolling_points_avg']
final_df['is_stalemate'] = (final_df['home_rolling_points_avg'] - final_df['away_rolling_points_avg']).abs() < 0.5
# Avoid division by zero by adding a small epsilon (e.g., 0.1)
epsilon = 0.1

final_df['home_defense_strength'] = (
    final_df['home_team_cs_freq_last5'] / 
    (final_df['home_team_goals_conceded_last_5'] + epsilon)
)

final_df['away_defense_strength'] = (
    final_df['guest_team_cs_freq_last5'] / 
    (final_df['guest_team_goals_conceded_last_5'] + epsilon)
)

final_df['home_away_strength_diff'] = final_df['home_rolling_points_avg'] - final_df['away_rolling_points_avg']

final_df['home_attack_defense_ratio'] = final_df['home_team_goals_scored_last_5'] / (final_df['home_team_goals_conceded_last_5'] + 0.1)
final_df['away_attack_defense_ratio'] = final_df['guest_team_goals_scored_last_5'] / (final_df['guest_team_goals_conceded_last_5'] + 0.1)

final_df['home_win_streak'] = final_df['home_win_ratio_last5'] * final_df['home_team_match_seq']
final_df['away_loss_streak'] = final_df['guest_loss_ratio_last5'] * final_df['away_team_match_seq']

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Load data (assuming `final_df` from previous steps)
df = final_df.copy()

# Encode target variable (winning_team: home/draw/guest)
le = LabelEncoder()
df['target'] = le.fit_transform(df['winning_team'])  # 0=home, 1=draw, 2=guest

# Select features (modify as needed)
features = [
    'home_rolling_points_avg', 'away_rolling_points_avg',

    'home_win_ratio_last5', 'home_draw_ratio_last5', 'home_loss_ratio_last5', 'guest_win_ratio_last5', 'guest_draw_ratio_last5', 'guest_loss_ratio_last5',

    'home_team_goals_scored_last_5', 'home_team_goals_conceded_last_5', 'guest_team_goals_scored_last_5', 'guest_team_goals_conceded_last_5',

    'home_team_cs_freq_last5', 'guest_team_cs_freq_last5',

    'home_team_h2h_win_pct', 'guest_team_h2h_win_pct', 'h2h_draw_pct',

    'home_team_h2h_gd_trend', 'guest_team_h2h_gd_trend',

    'home_adv_pts_diff', 'home_goal_dominance_last5', 'away_goal_dominance_last5',

    'home_win_power', 'away_win_power', 'is_stalemate', 'home_defense_strength', 'away_defense_strength',

    'home_away_strength_diff', 'home_attack_defense_ratio', 'away_attack_defense_ratio', 'home_win_streak', 'away_loss_streak'
]

df = df.dropna(subset=features)

X = df[features]
y = df['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority')
X_res, y_res = smote.fit_resample(X_train, y_train)

model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=300,  # Increased
    max_depth=5,       # Deeper trees
    learning_rate=0.03,
    subsample=0.8,     # Prevent overfitting
    colsample_bytree=0.8,
    reg_alpha=0.1,     # L1 regularization
    reg_lambda=0.1,    # L2 regularization
    random_state=42
)

from catboost import CatBoostClassifier
model = CatBoostClassifier(
    iterations=1000,              # More trees
    depth=6,                     # Slightly deeper
    learning_rate=0.02,           # Slower learning
    l2_leaf_reg=5,               # Stronger regularization
    class_weights=[1, 3, 1],     # Focus on draws
    eval_metric='TotalF1',       # Optimize for F1-score
    early_stopping_rounds=50,    # Prevent overfitting
    verbose=0
)

# Pass sample_weight during training
model.fit(X_res, y_res)

# Predictions
y_pred = model.predict(X_test)

# Decode labels back to original values
class_names = le.inverse_transform([0, 1, 2])  # ['home', 'draw', 'guest']

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)

print(classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    digits=3
))

# Get feature importances
importance = model.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': importance
}).sort_values('Importance', ascending=False)

print(feature_importance)

[[  1  89  24]
 [  2  77  11]
 [  4 145  46]]
              precision    recall  f1-score   support

        draw      0.143     0.009     0.017       114
       guest      0.248     0.856     0.384        90
        home      0.568     0.236     0.333       195

    accuracy                          0.311       399
   macro avg      0.319     0.367     0.245       399
weighted avg      0.374     0.311     0.254       399

                             Feature  Importance
6             guest_draw_ratio_last5    6.108260
3              home_draw_ratio_last5    6.050317
31                  away_loss_streak    6.016248
12           home_team_cs_freq_last5    5.201857
30                   home_win_streak    4.842217
15            guest_team_h2h_win_pct    4.423915
10    guest_team_goals_scored_last_5    4.339331
8      home_team_goals_scored_last_5    3.782670
16                      h2h_draw_pct    3.657375
14             home_team_h2h_win_pct    3.385343
28         home_attack_defense_rat

Neural Network

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Step 1: SMOTE to boost minority classes (if any)
# Step 2: RandomUnderSampler to reduce majority class (draws)
pipeline = Pipeline([
    ('oversample', SMOTE(sampling_strategy='minority')),  # Boost home/away wins
    ('undersample', RandomUnderSampler(sampling_strategy={1: 150}))  # Reduce draws
])

X_res, y_res = pipeline.fit_resample(X_train, y_train)
X_res, y_res = torch.FloatTensor(X_res), torch.LongTensor(y_res)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train = torch.FloatTensor(X_train)
y_train = torch.LongTensor(y_train)
X_test = torch.FloatTensor(X_test)
y_test = torch.LongTensor(y_test)

# Create DataLoader for batching
train_data = TensorDataset(X_res, y_res)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

class SoccerPredictor(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.drop1 = nn.Dropout(0.5)
        
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.drop2 = nn.Dropout(0.5)
        
        self.fc3 = nn.Linear(32, 3)  # 3 output classes
        
    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.drop1(x)
        
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.drop2(x)
        
        x = self.fc3(x)  # No softmax (handled in loss)
        return x

model = SoccerPredictor(input_size=len(features))

# Class weights (penalize misclassifying draws)
class_counts = np.bincount(y_train)
class_weights = 1. / class_counts
class_weights = torch.FloatTensor(class_weights / class_weights.sum())
# Adjust class weights (penalize missing draws more)
class_weights = torch.FloatTensor([1, 3, 1])  # [home, draw, away]

criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss with class weights (weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

epochs = 100

for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # Validation
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)
        _, preds = torch.max(test_outputs, 1)
        accuracy = (preds == y_test).float().mean()
    
    print(f"Epoch {epoch+1}/{epochs} | Loss: {loss.item():.4f} | Test Acc: {accuracy:.4f}")

from sklearn.metrics import classification_report, confusion_matrix

model.eval()
with torch.no_grad():
    y_pred = model(X_test).argmax(dim=1)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['home', 'draw', 'away']))

Epoch 1/100 | Loss: 0.6919 | Test Acc: 0.2531
Epoch 2/100 | Loss: 1.4095 | Test Acc: 0.3759
Epoch 3/100 | Loss: 1.3032 | Test Acc: 0.4436
Epoch 4/100 | Loss: 1.1454 | Test Acc: 0.4712
Epoch 5/100 | Loss: 0.5684 | Test Acc: 0.4887
Epoch 6/100 | Loss: 0.5620 | Test Acc: 0.4912
Epoch 7/100 | Loss: 1.0399 | Test Acc: 0.4912
Epoch 8/100 | Loss: 0.5779 | Test Acc: 0.4862
Epoch 9/100 | Loss: 0.5354 | Test Acc: 0.4887
Epoch 10/100 | Loss: 0.5540 | Test Acc: 0.4887
Epoch 11/100 | Loss: 1.2557 | Test Acc: 0.4887
Epoch 12/100 | Loss: 1.1698 | Test Acc: 0.4887
Epoch 13/100 | Loss: 1.0950 | Test Acc: 0.4937
Epoch 14/100 | Loss: 0.6117 | Test Acc: 0.4862
Epoch 15/100 | Loss: 1.2947 | Test Acc: 0.4887
Epoch 16/100 | Loss: 0.5155 | Test Acc: 0.4937
Epoch 17/100 | Loss: 0.4174 | Test Acc: 0.4862
Epoch 18/100 | Loss: 0.4722 | Test Acc: 0.4937
Epoch 19/100 | Loss: 1.2593 | Test Acc: 0.4987
Epoch 20/100 | Loss: 0.8057 | Test Acc: 0.4987
Epoch 21/100 | Loss: 0.4959 | Test Acc: 0.4887
Epoch 22/100 | Loss: 1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
