In [1]:
# =============================================
# Feature Engineering for Sports Performance Model
# Based on XGBFIR Interaction Analysis (Depth 1-2)
# =============================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the preprocessed match data. The path is relative to the notebook's
# working directory, the same way the output paths under ../../data/engineered
# are, so the notebook does not depend on one developer's home directory.
# NOTE(review): assumes ../../data is the project's data directory — confirm.
df = pd.read_csv('../../data/preprocessed/preprocessed_1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2975 entries, 0 to 2974
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   match_api_id                    2975 non-null   int64  
 1   season                          2975 non-null   object 
 2   stage                           2975 non-null   int64  
 3   date                            2975 non-null   object 
 4   away_team                       2975 non-null   int64  
 5   home_team                       2975 non-null   int64  
 6   result_match                    2975 non-null   int64  
 7   points_home                     2975 non-null   int64  
 8   points_away                     2975 non-null   int64  
 9   home_last_team_goal             2975 non-null   float64
 10  home_last_team_shoton           2975 non-null   float64
 11  home_last_team_possession       2975 non-null   float64
 12  away_last_team_goal             29

In [3]:
# =============================================
# 1. Core Interaction Features from XGBFIR Analysis
# =============================================

# --- Depth 1 ---
# Points difference scaled by each side's acceleration and strength
# (a product of three columns).
df['points_diff_accel_home'] = (
    df['points_difference']
    .mul(df['team_acceleration_home'])
    .mul(df['team_strength_home'])
)
df['points_diff_accel_away'] = (
    df['points_difference']
    .mul(df['team_acceleration_away'])
    .mul(df['team_strength_away'])
)

# Acceleration-to-strength ratios; the 1e-6 offset keeps the denominator
# away from exactly zero. Note that 'accel_home_strength_away' is computed
# from the away side's acceleration and strength.
df['accel_home_strength_home'] = df['team_acceleration_home'].div(df['team_strength_home'].add(1e-6))
df['accel_home_strength_away'] = df['team_acceleration_away'].div(df['team_strength_away'].add(1e-6))

# Last-match possession weighted by log(1 + strength).
df['home_possession_strength_home'] = df['home_last_team_possession'].mul(np.log1p(df['team_strength_home']))
df['away_possession_strength_away'] = df['away_last_team_possession'].mul(np.log1p(df['team_strength_away']))

# Plain sum of aggression, strength and acceleration per side.
df['aggression_physician_home'] = (
    df['team_aggression_home']
    .add(df['team_strength_home'])
    .add(df['team_acceleration_home'])
)
df['aggression_physician_away'] = (
    df['team_aggression_away']
    .add(df['team_strength_away'])
    .add(df['team_acceleration_away'])
)

# --- Depth 2 ---
# Possession times goal conversion rate, damped by (acceleration + 1).
df['possession_efficiency_away'] = (
    df['away_last_team_possession']
    .mul(df['goal_conversion_rate_away'])
    .div(df['team_acceleration_away'].add(1))
)
df['possession_efficiency_home'] = (
    df['home_last_team_possession']
    .mul(df['goal_conversion_rate_home'])
    .div(df['team_acceleration_home'].add(1))
)

# Products that reuse the points_diff_accel_* columns created above.
df['points_diff_accel_home_x_goals_ratio_away'] = df['points_diff_accel_home'].mul(df['rolling_avg_goals_ratio_away'])
df['points_diff_accel_home_x_accel_home'] = df['points_diff_accel_home'].mul(df['team_acceleration_home'])
df['points_diff_accel_away_x_points_diff_accel_home'] = df['points_diff_accel_away'].mul(df['points_diff_accel_home'])
df['away_shoton_x_points_diff_accel_home'] = df['away_last_team_shoton'].mul(df['points_diff_accel_home'])
df['points_diff_accel_away_x_goals_ratio_away'] = df['points_diff_accel_away'].mul(df['rolling_avg_goals_ratio_away'])

# Remaining pairwise combinations of raw columns.
df['rolling_goals_home_x_stability_home'] = df['rolling_avg_goals_home'].mul(df['rolling_goal_stability_home'])
df['accel_home_x_strength_away'] = df['team_acceleration_home'].div(df['team_strength_away'].add(1e-6))
df['away_possession_x_strength_away'] = df['away_last_team_possession'].mul(df['team_strength_away'])
df['aggression_away_x_strength_away'] = df['team_aggression_away'].mul(df['team_strength_away'])

# =============================================
# 2. Create Three-Way Interactions
# =============================================

# Each feature below is the element-wise product of three columns; the
# points_diff_accel_* inputs are the engineered columns built earlier in
# this notebook.

# away points-diff-accel x home points-diff-accel x away goals ratio
df['pdiff_accel_away_x_home_x_goals_ratio'] = (
    df['points_diff_accel_away']
    .mul(df['points_diff_accel_home'])
    .mul(df['rolling_avg_goals_ratio_away'])
)

# home points-diff-accel x away rolling goals x away goals ratio
df['pdiff_accel_home_x_goals_away_x_ratio'] = (
    df['points_diff_accel_home']
    .mul(df['rolling_avg_goals_away'])
    .mul(df['rolling_avg_goals_ratio_away'])
)

# acceleration difference x away shots on target x home points-diff-accel
df['accel_diff_x_away_shoton_x_pdiff_accel_home'] = (
    df['acceleration_difference']
    .mul(df['away_last_team_shoton'])
    .mul(df['points_diff_accel_home'])
)

In [4]:
# =============================================
# 3. Split-Based Feature Engineering
# (From Split Value Histograms)
# =============================================

# Bucket the points difference into five ordered segments. pd.cut uses
# right-closed intervals by default, so a value equal to an edge lands in
# the lower of the two neighbouring segments.
bins = [-np.inf, -20, -12, 8, 22, np.inf]
labels = [
    'Deficit Critical',
    'Deficit Manageable',
    'Neutral',
    'Lead Developing',
    'Lead Consolidation',
]
df['points_diff_segment'] = pd.cut(x=df['points_difference'], bins=bins, labels=labels)

# Encode the segment labels as ordinals following the order of `labels`,
# then remove the intermediate label column so only the encoded one remains.
oe = OrdinalEncoder(categories=[labels])
df['points_diff_segment_encoded'] = oe.fit_transform(df[['points_diff_segment']])
del df['points_diff_segment']

# Based on Split Value Histograms
def categorize_points_diff_accel_home(val):
    """Map a points_diff_accel_home value to an ordinal bucket 0-3.

    Upper bounds are inclusive: <= -24000 -> 0, <= 4800 -> 1, <= 31000 -> 2.
    Anything that satisfies none of these comparisons (larger values, and
    values such as NaN that compare False against every bound) -> 3.
    """
    for bucket, upper_bound in enumerate((-24000, 4800, 31000)):
        if val <= upper_bound:
            return bucket
    return 3

def categorize_team_acceleration_home(val):
    """Map a team_acceleration_home value to an ordinal bucket 0-3.

    Upper bounds are inclusive: <= 70.7 -> 0, <= 77.3 -> 1, <= 80.3 -> 2.
    Anything that satisfies none of these comparisons (larger values, and
    values such as NaN that compare False against every bound) -> 3.
    """
    for bucket, upper_bound in enumerate((70.7, 77.3, 80.3)):
        if val <= upper_bound:
            return bucket
    return 3

# Bucket both columns element-wise with the threshold functions defined in
# this cell.
df['points_diff_accel_home_cat'] = df['points_diff_accel_home'].map(categorize_points_diff_accel_home)
df['team_acceleration_home_cat'] = df['team_acceleration_home'].map(categorize_team_acceleration_home)

# Persist the frame with all engineered columns (row index is not written).
df.to_csv('../../data/engineered/raw_engineered_features.csv', index=False)

In [5]:
# Row-normalised cross-tabulation: for every encoded points-difference
# segment, the share of rows per result_match value (each row sums to 1).
distribution_relation = pd.crosstab(
    index=df['points_diff_segment_encoded'],
    columns=df['result_match'],
    normalize='index',
)

# Show the table as plain text
print(distribution_relation)

result_match                        0         1
points_diff_segment_encoded                    
0.0                          0.760369  0.239631
1.0                          0.731343  0.268657
2.0                          0.567267  0.432733
3.0                          0.368794  0.631206
4.0                          0.167598  0.832402


In [6]:
from src.helper import get_split_data

# Fetch the train / validation / test splits from the project helper.
# NOTE(review): the meaning of the argument 6 is defined in src.helper — confirm.
X_trn, y_trn, X_val, y_val, X_tst, y_tst = get_split_data.split_data_for_training(6)

# =============================================
# 4. Polynomial Interaction Features
# =============================================

# Columns that feed the pairwise-product expansion
poly_features = [
    'points_difference', 'team_acceleration_home',
    'team_strength_away', 'away_last_team_possession',
]

# Median-impute first, then emit the input columns plus every pairwise
# product (interaction_only: no squared terms; include_bias=False: no
# constant column).
poly_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
])

# Expand only `poly_features`; every other column is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[('poly', poly_transformer, poly_features)],
    remainder='passthrough',
)

# Single-step pipeline holding just the preprocessing
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing on the training split only, then apply the fitted
# transform to the validation and test splits.
X_train_processed = preprocessing_pipeline.fit_transform(X_trn)
X_val_processed = preprocessing_pipeline.transform(X_val)
X_test_processed = preprocessing_pipeline.transform(X_tst)

# Output column names as reported by the fitted ColumnTransformer step
feature_names = preprocessing_pipeline['preprocessor'].get_feature_names_out()

# Wrap each transformed array in a DataFrame and append the target as the
# last column.
trn_df = pd.DataFrame(X_train_processed, columns=feature_names).assign(result_match=y_trn.values)
val_df = pd.DataFrame(X_val_processed, columns=feature_names).assign(result_match=y_val.values)
tst_df = pd.DataFrame(X_test_processed, columns=feature_names).assign(result_match=y_tst.values)

# Write the three processed splits to disk (row index is not written)
trn_df.to_csv('../../data/engineered/processed_trn_data.csv', index=False)
val_df.to_csv('../../data/engineered/processed_val_data.csv', index=False)
tst_df.to_csv('../../data/engineered/processed_test_data.csv', index=False)

In [7]:
# Column names, non-null counts and dtypes of the processed training frame
trn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2391 entries, 0 to 2390
Data columns (total 59 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   poly__points_difference                                     2391 non-null   float64
 1   poly__team_acceleration_home                                2391 non-null   float64
 2   poly__team_strength_away                                    2391 non-null   float64
 3   poly__away_last_team_possession                             2391 non-null   float64
 4   poly__points_difference team_acceleration_home              2391 non-null   float64
 5   poly__points_difference team_strength_away                  2391 non-null   float64
 6   poly__points_difference away_last_team_possession           2391 non-null   float64
 7   poly__team_acceleration_home team_strength_away             2391 non-null   float64
 8 

In [8]:
# Preview the first rows of the processed training frame
trn_df.head()

Unnamed: 0,poly__points_difference,poly__team_acceleration_home,poly__team_strength_away,poly__away_last_team_possession,poly__points_difference team_acceleration_home,poly__points_difference team_strength_away,poly__points_difference away_last_team_possession,poly__team_acceleration_home team_strength_away,poly__team_acceleration_home away_last_team_possession,poly__team_strength_away away_last_team_possession,...,remainder__accel_home_x_strength_away,remainder__away_possession_x_strength_away,remainder__aggression_away_x_strength_away,remainder__pdiff_accel_away_x_home_x_goals_ratio,remainder__pdiff_accel_home_x_goals_away_x_ratio,remainder__accel_diff_x_away_shoton_x_pdiff_accel_home,remainder__points_diff_segment_encoded,remainder__points_diff_accel_home_cat,remainder__team_acceleration_home_cat,result_match
0,0.0,80.909091,67.515152,55.0,0.0,0.0,0.0,5462.589532,4450.0,3713.333333,...,1.198384,3713.333333,4891.779614,0.0,0.0,0.0,2.0,1.0,3.0,0
1,0.0,74.681818,71.980303,50.0,0.0,0.0,0.0,5375.619904,3734.090909,3599.015152,...,1.037531,3599.015152,4338.012929,0.0,0.0,0.0,2.0,1.0,1.0,0
2,0.0,81.568182,58.454545,51.0,0.0,0.0,0.0,4768.030992,4159.977273,2981.181818,...,1.395412,2981.181818,2950.360331,0.0,0.0,-0.0,2.0,1.0,3.0,0
3,0.0,73.772727,78.898485,43.0,0.0,0.0,0.0,5820.556405,3172.227273,3392.634848,...,0.935033,3392.634848,6294.784116,0.0,0.0,-0.0,2.0,1.0,1.0,0
4,0.0,72.75303,60.716667,52.0,0.0,0.0,0.0,4417.32149,3783.157576,3157.266667,...,1.198238,3157.266667,3931.404167,0.0,0.0,-0.0,2.0,1.0,1.0,0


In [9]:
# Rebuild the column transformer for the modelling pipeline: polynomial
# expansion on `poly_features`, all other columns passed through.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    force_int_remainder_cols=False,
    transformers=[('poly', poly_transformer, poly_features)],
)

# =============================================
# 5. Final Pipeline Integration
# =============================================

# Example usage with XGBoost
from xgboost import XGBClassifier

# Preprocessing followed by a gradient-boosted classifier
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        learning_rate=0.05,
        max_depth=6,
        n_estimators=500,
        colsample_bytree=0.9,
        subsample=0.8,
    )),
])

# Feature matrix / target taken from the full engineered frame.
# NOTE: the fit below uses the X_trn / y_trn split, not these two variables.
X = df.drop(columns='result_match')
y = df['result_match']

# Train on the training split
model_pipeline.fit(X_trn, y_trn)