In [32]:
import pandas as pd

In [33]:
df = pd.read_parquet("../data/matches_feature_engineered.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 46 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   round                              2020 non-null   object        
 1   match_date                         2016 non-null   datetime64[ns]
 2   home_team                          2020 non-null   object        
 3   guest_team                         2020 non-null   object        
 4   stadium                            2020 non-null   object        
 5   date_added                         2020 non-null   datetime64[ns]
 6   score_home_team                    2016 non-null   Int64         
 7   score_guest_team                   2016 non-null   Int64         
 8   winning_team                       2016 non-null   object        
 9   is_weekend                         2020 non-null   bool          
 10  match_period                       2

In [34]:
df = df.dropna(subset=['winning_team'])

In [35]:
df['winning_team'].unique()

array(['guest', 'home', 'draw'], dtype=object)

In [36]:
df['target'] = df['winning_team'].map({'home': 0, 'guest': 1, 'draw': 2})

In [37]:
exclude_columns = [
    'round', 'match_date', 'home_team', 'guest_team', 'stadium',
    'date_added', 'score_home_team', 'score_guest_team', 'winning_team', 'target'
]

features = [
    col for col in df.columns if col not in exclude_columns
]
X = df[features]
y = df['target']  # Encoded target

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [39]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))  # Handle missing values
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categories
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode
])

# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Full pipeline with XGBoost
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        objective='multi:softmax',  # For multiclass
        num_class=3,  # home, guest, draw
        random_state=42,
        eval_metric='mlogloss'
    ))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate
print("Test Accuracy:", pipeline.score(X_test, y_test))

Test Accuracy: 0.41089108910891087


In [40]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = pipeline.predict(X_test)

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['home', 'guest', 'draw']))

Confusion Matrix:
 [[124  28  38]
 [ 57  19  26]
 [ 63  26  23]]
Classification Report:
               precision    recall  f1-score   support

        home       0.51      0.65      0.57       190
       guest       0.26      0.19      0.22       102
        draw       0.26      0.21      0.23       112

    accuracy                           0.41       404
   macro avg       0.34      0.35      0.34       404
weighted avg       0.38      0.41      0.39       404



In [41]:
# Get feature names from OneHotEncoder
preprocessor.fit(X_train)
onehot_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)
all_features = numerical_cols + list(onehot_columns)

# Feature importance (after pipeline training)
importances = pipeline.named_steps['classifier'].feature_importances_
pd.DataFrame({'Feature': all_features, 'Importance': importances}).sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
21,home_team_position_difference,0.032187
43,day_of_week_Thursday,0.029882
44,day_of_week_Tuesday,0.027296
13,home_team_current_position,0.025127
27,guest_team_losses_so_far,0.025112
30,home_team_losses_pct_so_far,0.024873
11,guest_team_goal_difference_last_5,0.024624
10,guest_team_goals_conceded_last_5,0.024312
7,guest_team_draws_last_5,0.024125
18,guest_team_goals_scored,0.023945
