In [147]:
import pandas as pd

In [148]:
df = pd.read_parquet("../data/matches_feature_engineered.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 51 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   round                               2020 non-null   object        
 1   match_date                          2016 non-null   datetime64[ns]
 2   home_team                           2020 non-null   object        
 3   guest_team                          2020 non-null   object        
 4   stadium                             2020 non-null   object        
 5   date_added                          2020 non-null   datetime64[ns]
 6   score_home_team                     2016 non-null   Int64         
 7   score_guest_team                    2016 non-null   Int64         
 8   winning_team                        2016 non-null   object        
 9   is_weekend                          2020 non-null   bool          
 10  match_period            

In [149]:
df = df.dropna(subset=['winning_team'])

In [150]:
df['winning_team'].unique()

array(['guest', 'home', 'draw'], dtype=object)

In [151]:
df['target'] = df['winning_team'].map({'home': 0, 'guest': 1, 'draw': 2})

In [152]:
exclude_columns = [
    'round', 'match_date', 'home_team', 'guest_team', 'stadium', 'year',
    'date_added', 'score_home_team', 'score_guest_team', 'winning_team', 'target', 'day_of_week', 'match_period', 'is_weekend'
]

features = [
    col for col in df.columns if col not in exclude_columns
]
X = df[features]
y = df['target']  # Encoded target

In [153]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2016 entries, 0 to 2015
Data columns (total 38 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   home_team_wins_last_5               2016 non-null   int64  
 1   home_team_draws_last_5              2016 non-null   int64  
 2   home_team_loses_last_5              2016 non-null   int64  
 3   home_team_goals_scored_last_5       2016 non-null   int64  
 4   home_team_goals_conceded_last_5     2016 non-null   int64  
 5   home_team_goal_difference_last_5    2016 non-null   int64  
 6   guest_team_wins_last_5              2016 non-null   int64  
 7   guest_team_draws_last_5             2016 non-null   int64  
 8   guest_team_loses_last_5             2016 non-null   int64  
 9   guest_team_goals_scored_last_5      2016 non-null   int64  
 10  guest_team_goals_conceded_last_5    2016 non-null   int64  
 11  guest_team_goal_difference_last_5   2016 non-nul

In [154]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [155]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))  # Handle missing values
])

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categories
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode
# ])

# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        # ('cat', categorical_transformer, categorical_cols)
    ])

# Full pipeline with XGBoost
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        objective='multi:softmax',  # For multiclass
        num_class=3,  # home, guest, draw
        random_state=42,
        eval_metric='mlogloss'
    ))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate
print("Test Accuracy:", pipeline.score(X_test, y_test))

Test Accuracy: 0.41089108910891087


In [156]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = pipeline.predict(X_test)

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['home', 'guest', 'draw']))

Confusion Matrix:
 [[123  23  44]
 [ 58  19  25]
 [ 60  28  24]]
Classification Report:
               precision    recall  f1-score   support

        home       0.51      0.65      0.57       190
       guest       0.27      0.19      0.22       102
        draw       0.26      0.21      0.23       112

    accuracy                           0.41       404
   macro avg       0.35      0.35      0.34       404
weighted avg       0.38      0.41      0.39       404



In [157]:
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [158]:
# Get feature names from OneHotEncoder
preprocessor.fit(X_train)

all_features = numerical_cols
if categorical_cols:
    onehot_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)
    onehot_columns = onehot_columns.tolist()
    all_features += onehot_columns

# Feature importance (after pipeline training)
importances = pipeline.named_steps['classifier'].feature_importances_
pd.DataFrame({'Feature': all_features, 'Importance': importances}).sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
20,home_team_position_difference,0.040711
35,wins_difference,0.031485
24,home_team_draws_so_far,0.031391
12,home_team_current_position,0.030558
27,guest_team_draws_so_far,0.030421
25,home_team_losses_so_far,0.03004
37,losses_difference,0.029989
23,home_team_wins_so_far,0.029224
3,home_team_goals_scored_last_5,0.028423
22,home_team_goal_conceded_difference,0.028093
