In [1]:
import soccerdata as sd

In [2]:
fbref = sd.FBref(leagues="BRA-Brazil", seasons="2025")

In [3]:
matches_full = fbref.read_schedule()
matches_full = matches_full.reset_index()
matches = matches_full[['week', 'home_team', 'home_xg', 'away_xg', 'away_team']]
matches.head()

Unnamed: 0,week,home_team,home_xg,away_xg,away_team
0,1,Cruzeiro,1.2,1.7,Mirassol
1,1,Flamengo,1.8,0.4,Internacional
2,1,Fortaleza,0.9,0.9,Fluminense
3,1,Grêmio,0.8,2.9,Atlético Mineiro
4,1,Juventude,0.7,0.6,Vitória


In [4]:
team_stats_by_week_full = fbref.read_team_match_stats()
team_stats_by_week_full = team_stats_by_week_full.reset_index()
team_stats_by_week = team_stats_by_week_full[['team', 'round', 'xG', 'xGA']]
team_stats_by_week = team_stats_by_week.dropna()
mask = team_stats_by_week['round'].str.contains('Matchweek')
team_stats_by_week = team_stats_by_week[mask].copy()
# Extract the number that immediately follows "Matchweek "
team_stats_by_week['round'] = team_stats_by_week['round'].str.extract(r'Matchweek (\d+)',expand=False).astype(int)
team_stats_by_week.head()

Unnamed: 0,team,round,xG,xGA
0,Atlético Mineiro,1,2.9,0.8
2,Atlético Mineiro,2,1.4,0.3
4,Atlético Mineiro,3,1.8,1.3
5,Atlético Mineiro,4,0.8,1.7
6,Atlético Mineiro,5,0.8,0.3


In [5]:
import pandas as pd
# --- 1. Prepare Rolling/Expanding Stats in team_stats_by_week ---

# The rolling window size is 3, but the calculation must be *cumulative/expanding*
# for the early weeks (1, 2, 3).

# Step 1a: Sort the data to ensure the calculation is strictly chronological
team_stats_by_week_sorted = team_stats_by_week.sort_values(by=['team', 'round'])

# Step 1b: Calculate the *cumulative* sum and count of xG and xGA for each team,
# up to the *current* round (including the current round).
team_stats_by_week_sorted['xG_cumsum'] = team_stats_by_week_sorted.groupby('team')['xG'].cumsum()
team_stats_by_week_sorted['xGA_cumsum'] = team_stats_by_week_sorted.groupby('team')['xGA'].cumsum()
team_stats_by_week_sorted['game_count'] = team_stats_by_week_sorted.groupby('team')['round'].cumcount() + 1

# Step 1c: Calculate the Expanding Average (Avg of ALL previous games)
# Note: This is NOT the final 3-game average yet, but it handles the early weeks' averaging.
team_stats_by_week_sorted['xG_expanding_avg'] = (
    team_stats_by_week_sorted['xG_cumsum'] / team_stats_by_week_sorted['game_count']
)
team_stats_by_week_sorted['xGA_expanding_avg'] = (
    team_stats_by_week_sorted['xGA_cumsum'] / team_stats_by_week_sorted['game_count']
)


# Step 1d: Calculate the 3-Game Rolling Sum (for rounds >= 3)
# The rolling calculation only works when the window is full (min_periods=3 is the default).
team_stats_by_week_sorted['xG_3_game_sum'] = team_stats_by_week_sorted.groupby('team')['xG'].rolling(window=3, min_periods=1).sum().reset_index(level=0, drop=True)
team_stats_by_week_sorted['xGA_3_game_sum'] = team_stats_by_week_sorted.groupby('team')['xGA'].rolling(window=3, min_periods=1).sum().reset_index(level=0, drop=True)
team_stats_by_week_sorted['game_count_3_window'] = team_stats_by_week_sorted['game_count'].apply(lambda x: min(x, 3))


# Step 1e: Calculate the Final Trailing Average (handles both expanding and 3-game rolling)
# This average is the sum of the last 3 games (or fewer for early weeks) divided by the count of games in the window.
team_stats_by_week_sorted['xG_trailing'] = (
    team_stats_by_week_sorted['xG_3_game_sum'] / team_stats_by_week_sorted['game_count_3_window']
)
team_stats_by_week_sorted['xGA_trailing'] = (
    team_stats_by_week_sorted['xGA_3_game_sum'] / team_stats_by_week_sorted['game_count_3_window']
)

# Step 1f: Shift the data to get the average *BEFORE* the current round (week - 1)
# This is the crucial step to prevent data leakage (look-ahead bias).
team_stats_by_week_sorted['xG_trailing_before'] = team_stats_by_week_sorted.groupby('team')['xG_trailing'].shift(1)
team_stats_by_week_sorted['xGA_trailing_before'] = team_stats_by_week_sorted.groupby('team')['xGA_trailing'].shift(1)


# Step 1g: Handle the special case for Round 1
# Since the 'shift(1)' makes Round 1 NaN, we set the specified default of 1.0.
team_stats_by_week_sorted['xG_trailing_before'] = team_stats_by_week_sorted['xG_trailing_before'].fillna(1.0)
team_stats_by_week_sorted['xGA_trailing_before'] = team_stats_by_week_sorted['xGA_trailing_before'].fillna(1.0)


# Step 1h: Select only the required columns for merging
trailing_stats = team_stats_by_week_sorted[['team', 'round', 'xG_trailing_before', 'xGA_trailing_before']].copy()
# Rename 'round' to 'week' to match the 'matches' DataFrame for merging
trailing_stats.rename(columns={'round': 'week'}, inplace=True)

# --- 2. Merge Trailing Stats into the Matches DataFrame ---

# Step 2a: Merge HOME team stats
matches = pd.merge(
    matches,
    trailing_stats,
    left_on=['home_team', 'week'],
    right_on=['team', 'week'],
    how='left'
)
# Rename the new columns and drop the redundant 'team' column
matches.rename(columns={
    'xG_trailing_before': 'home_xg_trailing',
    'xGA_trailing_before': 'home_xga_trailing'
}, inplace=True)
matches.drop(columns='team', inplace=True)

# Step 2b: Merge AWAY team stats
matches = pd.merge(
    matches,
    trailing_stats,
    left_on=['away_team', 'week'],
    right_on=['team', 'week'],
    how='left'
)
# Rename the new columns and drop the redundant 'team' column
matches.rename(columns={
    'xG_trailing_before': 'away_xg_trailing',
    'xGA_trailing_before': 'away_xga_trailing'
}, inplace=True)
matches.drop(columns='team', inplace=True)
matches = matches.loc[:, ~matches.columns.duplicated()]
matches = matches.dropna()
# --- Final Check ---
matches

Unnamed: 0,week,home_team,home_xg,away_xg,away_team,home_xg_trailing,home_xga_trailing,away_xg_trailing,away_xga_trailing
0,1,Cruzeiro,1.2,1.7,Mirassol,1.000000,1.000000,1.000000,1.000000
1,1,Flamengo,1.8,0.4,Internacional,1.000000,1.000000,1.000000,1.000000
2,1,Fortaleza,0.9,0.9,Fluminense,1.000000,1.000000,1.000000,1.000000
3,1,Grêmio,0.8,2.9,Atlético Mineiro,1.000000,1.000000,1.000000,1.000000
4,1,Juventude,0.7,0.6,Vitória,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...
260,27,Ceará,1.2,1.5,Santos,1.100000,1.333333,1.700000,0.566667
261,27,Cruzeiro,1.7,2.3,Sport Recife,0.966667,0.966667,0.833333,1.200000
262,27,Juventude,1.7,1.8,Fortaleza,0.433333,1.200000,1.000000,1.466667
263,27,São Paulo,2.0,1.2,Palmeiras,0.933333,1.033333,1.600000,0.800000


In [6]:
matches['xg_difference'] = matches['home_xg'] - matches['away_xg']
matches

Unnamed: 0,week,home_team,home_xg,away_xg,away_team,home_xg_trailing,home_xga_trailing,away_xg_trailing,away_xga_trailing,xg_difference
0,1,Cruzeiro,1.2,1.7,Mirassol,1.000000,1.000000,1.000000,1.000000,-0.5
1,1,Flamengo,1.8,0.4,Internacional,1.000000,1.000000,1.000000,1.000000,1.4
2,1,Fortaleza,0.9,0.9,Fluminense,1.000000,1.000000,1.000000,1.000000,0.0
3,1,Grêmio,0.8,2.9,Atlético Mineiro,1.000000,1.000000,1.000000,1.000000,-2.1
4,1,Juventude,0.7,0.6,Vitória,1.000000,1.000000,1.000000,1.000000,0.1
...,...,...,...,...,...,...,...,...,...,...
260,27,Ceará,1.2,1.5,Santos,1.100000,1.333333,1.700000,0.566667,-0.3
261,27,Cruzeiro,1.7,2.3,Sport Recife,0.966667,0.966667,0.833333,1.200000,-0.6
262,27,Juventude,1.7,1.8,Fortaleza,0.433333,1.200000,1.000000,1.466667,-0.1
263,27,São Paulo,2.0,1.2,Palmeiras,0.933333,1.033333,1.600000,0.800000,0.8


In [7]:
# mask_to_keep = ~matches['week'].isin([1])
# matches = matches[mask_to_keep]
# matches.head()

**Now tha the dataframe is ready, we can start the creation of the model**
**Reminder: I need to test with the team name and without the team name**

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 1. Load your DataFrame
df = matches  

# 2. Define new features and target
features = [
    'home_team', 'away_team',
    'home_xg_trailing', 'home_xga_trailing',
    'away_xg_trailing', 'away_xga_trailing'
]
target = 'xg_difference'

X = df[features]
y = df[target]

# 3. Categorical and numerical features
categorical_features = ['home_team', 'away_team']
numerical_features = list(set(features) - set(categorical_features))

# 4. Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')

# 5. Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror',
    random_state=42,
    tree_method='gpu_hist',
    predictor='gpu_predictor'))
])

# 6. Detailed hyperparameter grid
param_grid = {
    'regressor__n_estimators': [100, 200, 300, 400, 500, 600, 800, 1000],
    'regressor__max_depth': [3, 5, 7, 9, 11, 13, 15, 17],
    'regressor__learning_rate': [
        0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3
    ],
    'regressor__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'regressor__gamma': [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1.0],
    'regressor__reg_alpha': [0, 0.001, 0.01, 0.05, 0.1, 1, 10, 50, 100],
    'regressor__reg_lambda': [0, 0.001, 0.01, 0.05, 0.1, 1, 10, 50, 100],
    'regressor__min_child_weight': [1, 3, 5, 7, 10],
    'regressor__scale_pos_weight': [1, 5, 10, 20, 50],
    # Optional: add tree method for further experimentation
    'regressor__tree_method': ['auto', 'exact', 'hist', 'approx']
}


# 7. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50 #42
)

# 8. Randomized Search CV
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=500,
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=50 #31
)

# 9. Fit the search
search.fit(X_train, y_train)

# 10. Results
print("\n✅ Best parameters found:", search.best_params_)
print("✅ Best CV score (neg MSE):", search.best_score_)

# 11. Evaluate on test set
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

print("📉 Test Set MSE:", mean_squared_error(y_test, y_pred))
print("📉 Test Set RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)
print("📈 Test Set R²:", r2_score(y_test, y_pred))

# 12. Save model
joblib.dump(best_model, 'xg_diff.pkl')
print("💾 Model saved as 'xg_diff.pkl'")


Fitting 5 folds for each of 500 candidates, totalling 2500 fits



✅ Best parameters found: {'regressor__tree_method': 'hist', 'regressor__subsample': 0.6, 'regressor__scale_pos_weight': 1, 'regressor__reg_lambda': 0.001, 'regressor__reg_alpha': 0.1, 'regressor__n_estimators': 500, 'regressor__min_child_weight': 1, 'regressor__max_depth': 15, 'regressor__learning_rate': 0.001, 'regressor__gamma': 0.5, 'regressor__colsample_bytree': 0.7}
✅ Best CV score (neg MSE): -1.0406352485145716
📉 Test Set MSE: 0.8845318697086675
📉 Test Set RMSE: 0.9404955447574792
📈 Test Set R²: 0.07515558111054765
💾 Model saved as 'xg_diff.pkl'
