In [1]:
# =============================================
# Feature Engineering for Sports Performance Model
# Based on XGBFIR Interaction Analysis (Depth 1-2)
# =============================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv('/home/kamil/PycharmProjects/predict_football_results/data/preprocessed/preprocessed_1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2975 entries, 0 to 2974
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   match_api_id               2975 non-null   int64  
 1   season                     2975 non-null   object 
 2   stage                      2975 non-null   int64  
 3   date                       2975 non-null   object 
 4   away_team                  2975 non-null   int64  
 5   home_team                  2975 non-null   int64  
 6   result_match               2975 non-null   int64  
 7   points_home                2975 non-null   int64  
 8   points_away                2975 non-null   int64  
 9   home_last_team_goal        2975 non-null   float64
 10  home_last_team_shoton      2975 non-null   float64
 11  home_last_team_possession  2975 non-null   float64
 12  away_last_team_goal        2975 non-null   float64
 13  away_last_team_shoton      2975 non-null   float

In [3]:
# =============================================
# 1. Core Interaction Features from XGBFIR Analysis
# =============================================

# Top 5 two-way interactions (Depth 1)
df['points_diff_accel_home'] = df['points_difference'] * df['team_acceleration_home']
df['accel_home_strength_away'] = df['team_acceleration_home'] / (df['team_strength_away'] + 1e-6)
df['away_possession_strength_away'] = df['away_last_team_possession'] * np.log1p(df['team_strength_away'])
df['aggression_strength_away'] = (df['team_aggression_away'] + df['team_strength_away']) ** 2

# Top 3 three-way interactions (Depth 2)
df['possession_efficiency'] = (df['away_last_team_possession'] *
                              df['goal_conversion_rate_away'] /
                              (df['team_acceleration_away'] + 1))

# =============================================
# 2. Split-Based Feature Engineering
# (From Split Value Histograms)
# =============================================

# Points difference segmentation
bins = [-np.inf, -20, -12, 8, 22, np.inf]
labels = ['Deficit Critical', 'Deficit Manageable', 'Neutral',
          'Lead Developing', 'Lead Consolidation']
df['points_diff_segment'] = pd.cut(df['points_difference'],
                                  bins=bins, labels=labels)

oe = OrdinalEncoder(categories=[labels])
df['points_diff_segment_encoded'] = oe.fit_transform(df[['points_diff_segment']])
df.drop(columns=['points_diff_segment'], inplace=True)

# Acceleration differential encoding
df['accel_diff_strength'] = np.where(
    (df['team_acceleration_home'] > 72) &
    (df['team_strength_away'] < 65),
    'HighAccel_LowStrength',
    'Other'
)

df = pd.get_dummies(df, columns=['accel_diff_strength'], prefix='accel_diff_strength', drop_first=True)

# =============================================
# 3. Temporal Aggregation Features
# =============================================

# Possession/strength ratio trends
df['possession_strength_ratio'] = (df['away_last_team_possession'] / (df['team_strength_away'] + 1e-6))

df['psr_trend_3game'] = df['possession_strength_ratio'].diff(periods=3).interpolate(method='linear', limit_direction='both')

# Save raw engineered features
df.to_csv('../../data/engineered/raw_engineered_features.csv', index=False)

In [4]:
from src.helper import get_split_data

X_trn, y_trn, X_val, y_val, X_tst, y_tst = get_split_data.split_data_for_training(6)

# =============================================
# 4. Polynomial Interaction Features
# =============================================

# Configure polynomial features for top interactions
poly_features = [
    'points_difference',
    'team_acceleration_home',
    'team_strength_away',
    'away_last_team_possession'
]

poly_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2,
                              interaction_only=True,
                              include_bias=False))
])

# Apply to selected columns
preprocessor = ColumnTransformer(
    transformers=[
        ('poly', poly_transformer, poly_features)
    ],
    remainder='passthrough'
)

# Create preprocessing-only pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Fit and transform data
X_train_processed = preprocessing_pipeline.fit_transform(X_trn)
X_val_processed = preprocessing_pipeline.transform(X_val)
X_test_processed = preprocessing_pipeline.transform(X_tst)

# Get readable feature names
feature_names = preprocessing_pipeline.named_steps['preprocessor'].get_feature_names_out()

# Create processed DataFrames
trn_df = pd.DataFrame(X_train_processed, columns=feature_names)
val_df = pd.DataFrame(X_val_processed, columns=feature_names)
tst_df = pd.DataFrame(X_test_processed, columns=feature_names)

# Add target variables
trn_df['result_match'] = y_trn.values
val_df['result_match'] = y_val.values
tst_df['result_match'] = y_tst.values

# Save processed datasets
trn_df.to_csv('../../data/engineered/processed_trn_data.csv', index=False)
val_df.to_csv('../../data/engineered/processed_val_data.csv', index=False)
tst_df.to_csv('../../data/engineered/processed_test_data.csv', index=False)

In [5]:
trn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2391 entries, 0 to 2390
Data columns (total 38 columns):
 #   Column                                                  Non-Null Count  Dtype 
---  ------                                                  --------------  ----- 
 0   poly__points_difference                                 2391 non-null   object
 1   poly__team_acceleration_home                            2391 non-null   object
 2   poly__team_strength_away                                2391 non-null   object
 3   poly__away_last_team_possession                         2391 non-null   object
 4   poly__points_difference team_acceleration_home          2391 non-null   object
 5   poly__points_difference team_strength_away              2391 non-null   object
 6   poly__points_difference away_last_team_possession       2391 non-null   object
 7   poly__team_acceleration_home team_strength_away         2391 non-null   object
 8   poly__team_acceleration_home away_last_team_poss

In [6]:
trn_df.head()

Unnamed: 0,poly__points_difference,poly__team_acceleration_home,poly__team_strength_away,poly__away_last_team_possession,poly__points_difference team_acceleration_home,poly__points_difference team_strength_away,poly__points_difference away_last_team_possession,poly__team_acceleration_home team_strength_away,poly__team_acceleration_home away_last_team_possession,poly__team_strength_away away_last_team_possession,...,remainder__points_diff_accel_home,remainder__accel_home_strength_away,remainder__away_possession_strength_away,remainder__aggression_strength_away,remainder__possession_efficiency,remainder__points_diff_segment_encoded,remainder__accel_diff_strength_Other,remainder__possession_strength_ratio,remainder__psr_trend_3game,result_match
0,0.0,80.909091,67.515152,55.0,0.0,0.0,0.0,5462.589532,4450.0,3713.333333,...,0.0,1.198384,232.48802,19591.51607,0.175058,2.0,True,0.814632,-0.010732,0
1,0.0,74.681818,71.980303,50.0,0.0,0.0,0.0,5375.619904,3734.090909,3599.015152,...,0.0,1.037531,214.509479,17489.260994,0.242057,2.0,True,0.694634,-0.148314,0
2,0.0,81.568182,58.454545,51.0,0.0,0.0,0.0,4768.030992,4159.977273,2981.181818,...,0.0,1.395412,208.345816,11865.150744,0.11659,2.0,False,0.872473,-0.046209,0
3,0.0,73.772727,78.898485,43.0,0.0,0.0,0.0,5820.556405,3172.227273,3392.634848,...,0.0,0.935033,188.372546,25179.919421,0.119068,2.0,True,0.545004,-0.269628,0
4,0.0,72.75303,60.716667,52.0,0.0,0.0,0.0,4417.32149,3783.157576,3157.266667,...,0.0,1.198238,214.372809,15741.884444,0.20501,2.0,False,0.856437,0.161803,0


In [7]:
# Apply to selected columns
preprocessor = ColumnTransformer(
    transformers=[
        ('poly', poly_transformer, poly_features)
    ],
    remainder='passthrough',
    force_int_remainder_cols=False
)

# =============================================
# 5. Final Pipeline Integration
# =============================================

# Example usage with XGBoost
from xgboost import XGBClassifier

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.9
    ))
])

# Sample train-test split
X = df.drop('result_match', axis=1)  # Replace with your target
y = df['result_match']

model_pipeline.fit(X_trn, y_trn)