In [2]:
# =============================================
# Feature Engineering for Sports Performance Model
# Based on XGBFIR Interaction Analysis (Depth 1-2)
# =============================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [4]:
df = pd.read_csv('/home/kamil/PycharmProjects/predict_football_results/data/preprocessed/preprocessed_1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2975 entries, 0 to 2974
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   match_api_id               2975 non-null   int64  
 1   season                     2975 non-null   object 
 2   stage                      2975 non-null   int64  
 3   date                       2975 non-null   object 
 4   away_team                  2975 non-null   int64  
 5   home_team                  2975 non-null   int64  
 6   result_match               2975 non-null   int64  
 7   points_home                2975 non-null   int64  
 8   points_away                2975 non-null   int64  
 9   home_last_team_goal        2975 non-null   float64
 10  home_last_team_shoton      2975 non-null   float64
 11  home_last_team_possession  2975 non-null   float64
 12  away_last_team_goal        2975 non-null   float64
 13  away_last_team_shoton      2975 non-null   float

In [11]:
# =============================================
# 1. Core Interaction Features from XGBFIR Analysis
# =============================================

# Top 5 two-way interactions (Depth 1)
df['points_diff_accel_home'] = df['points_difference'] * df['team_acceleration_home']
df['accel_home_strength_away'] = df['team_acceleration_home'] / (df['team_strength_away'] + 1e-6)
df['away_possession_strength_away'] = df['away_last_team_possession'] * np.log1p(df['team_strength_away'])
df['aggression_strength_away'] = (df['team_aggression_away'] + df['team_strength_away']) ** 2

# Top 3 three-way interactions (Depth 2)
df['momentum_multiplier'] = (df['points_difference'] *
                            df['team_acceleration_home'] *
                            df['team_strength_home']) ** 0.33

df['possession_efficiency'] = (df['away_last_team_possession'] *
                              df['goal_conversion_rate_away'] /
                              (df['team_acceleration_away'] + 1))

# =============================================
# 2. Split-Based Feature Engineering
# (From Split Value Histograms)
# =============================================

# Points difference segmentation
bins = [-np.inf, -20, -12, 8, 22, np.inf]
labels = ['Deficit Critical', 'Deficit Manageable', 'Neutral',
          'Lead Developing', 'Lead Consolidation']
df['points_diff_segment'] = pd.cut(df['points_difference'],
                                  bins=bins, labels=labels)

# Acceleration differential encoding
df['accel_diff_strength'] = np.where(
    (df['team_acceleration_home'] > 72) &
    (df['team_strength_away'] < 65),
    'HighAccel_LowStrength',
    'Other'
)

# =============================================
# 3. Temporal Aggregation Features
# =============================================

# Possession/strength ratio trends
df['possession_strength_ratio'] = (df['away_last_team_possession'] /
                                   (df['team_strength_away'] + 1e-6))

df['psr_trend_3game'] = df['possession_strength_ratio'].diff(periods=3)

# Save raw engineered features
df.to_csv('../../data/engineered/raw_engineered_features.csv', index=False)

In [6]:
from src.helper import get_split_data

X_trn, y_trn, X_val, y_val, X_tst, y_tst = get_split_data.split_data_for_training(6)

# =============================================
# 4. Polynomial Interaction Features
# =============================================

# Configure polynomial features for top interactions
poly_features = [
    'points_difference',
    'team_acceleration_home',
    'team_strength_away',
    'away_last_team_possession'
]

poly_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2,
                              interaction_only=True,
                              include_bias=False))
])

# Apply to selected columns
preprocessor = ColumnTransformer(
    transformers=[
        ('poly', poly_transformer, poly_features)
    ],
    remainder='passthrough'
)

# Create preprocessing-only pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Fit and transform data
X_train_processed = preprocessing_pipeline.fit_transform(X_trn)
X_val_processed = preprocessing_pipeline.transform(X_val)
X_test_processed = preprocessing_pipeline.transform(X_tst)

# Get readable feature names
feature_names = preprocessing_pipeline.named_steps['preprocessor'].get_feature_names_out()

# Create processed DataFrames
trn_df = pd.DataFrame(X_train_processed, columns=feature_names)
val_df = pd.DataFrame(X_val_processed, columns=feature_names)
tst_df = pd.DataFrame(X_test_processed, columns=feature_names)

# Add target variables
trn_df['result_match'] = y_trn.values
val_df['result_match'] = y_val.values
tst_df['result_match'] = y_tst.values

# Save processed datasets
trn_df.to_csv('../../data/engineered/processed_trn_data.csv', index=False)
val_df.to_csv('../../data/engineered/processed_val_data.csv', index=False)
tst_df.to_csv('../../data/engineered/processed_test_data.csv', index=False)

Unnamed: 0,match_api_id,season,stage,date,away_team,home_team,result_match,points_home,points_away,home_last_team_goal,...,accel_home_strength_away,away_possession_strength_away,aggression_strength_away,momentum_multiplier,possession_efficiency,points_diff_segment,accel_diff_strength,rolling_goal_stability,possession_strength_ratio,psr_trend_3game
0,489063,2008/2009,3,2008-08-30,10261,9825,1,3,4,1.00,...,1.198295,232.512340,15979.258264,,0.383154,Neutral,Other,0.000000,0.814266,
1,489068,2008/2009,3,2008-08-30,10194,8549,1,3,3,1.00,...,1.167313,200.586057,18968.801653,0.000000,0.976700,Neutral,Other,0.000000,0.667959,
2,489069,2008/2009,3,2008-08-30,8659,8559,0,3,0,3.00,...,1.213050,268.913767,15876.000000,23.887317,0.291215,Neutral,Other,0.000000,1.141509,
3,489070,2008/2009,3,2008-08-30,8528,8667,0,4,0,1.00,...,1.040583,222.876745,18570.256198,25.773330,0.081901,Neutral,Other,0.000000,0.725428,-0.088838
4,489066,2008/2009,3,2008-08-31,8650,10252,0,3,6,2.00,...,1.001124,202.641038,23622.758494,,0.094792,Neutral,Other,0.564887,0.568752,-0.099206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2970,1987599,2015/2016,38,2016-05-15,9850,8668,1,44,34,1.00,...,1.415766,274.115638,10878.490000,36.310171,0.473436,Lead Developing,HighAccel_LowStrength,0.088166,1.323683,0.497739
2971,1987598,2015/2016,38,2016-05-15,8197,8455,0,49,80,2.00,...,1.160527,279.852406,16432.909174,,0.528884,Deficit Critical,HighAccel_LowStrength,0.117533,1.044205,0.407410
2972,1987597,2015/2016,38,2016-05-15,10252,9825,1,68,17,2.00,...,0.932117,284.624583,22861.440000,61.565692,0.762325,Lead Consolidation,Other,0.195354,0.825444,0.155495
2973,1987601,2015/2016,38,2016-05-15,8586,10261,1,34,70,1.75,...,1.040020,201.657446,20345.132231,,0.101230,Deficit Critical,Other,0.265268,0.652695,-0.670988


In [None]:
# Apply to selected columns
preprocessor = ColumnTransformer(
    transformers=[
        ('poly', poly_transformer, poly_features)
    ],
    remainder='passthrough'
)

# =============================================
# 5. Final Pipeline Integration
# =============================================

# Example usage with XGBoost
from xgboost import XGBClassifier

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.9
    ))
])

# Sample train-test split
X = df.drop('target_variable', axis=1)  # Replace with your target
y = df['target_variable']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model_pipeline.fit(X_train, y_train)