In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import f1_score

from xgboost import XGBClassifier

import category_encoders as ce

import warnings
warnings.filterwarnings('ignore')

In [2]:
def evaluate_model(X_train, X_val, y_train, y_val):
    encoder = ce.OneHotEncoder(cols=X_train.select_dtypes(include=['object', 'category']).columns, use_cat_names=True, drop_invariant=True, return_df=True)

    X_train_encoded = encoder.fit_transform(X_train, y_train)
    X_val_encoded = encoder.transform(X_val)
    
    model = XGBClassifier(random_state=42, enable_categorical=True, early_stopping_rounds=100)
    model.fit(X_train_encoded, y_train, eval_set=[(X_train_encoded, y_train), (X_val_encoded, y_val)], verbose=0)
    y_pred = model.predict(X_val_encoded)
    return f1_score(y_val, y_pred, average='weighted')

In [3]:
X = pd.read_csv('../../data/binned/df.csv')
y = pd.read_csv('../../data/binned/y.csv')

for col in X.filter(like='_binned').columns:
    X[col] = X[col].astype('category')

X.filter(like='_binned').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   avg_home_rating_attack_binned        3040 non-null   category
 1   avg_away_rating_attack_binned        3040 non-null   category
 2   avg_away_rating_defence_binned       3040 non-null   category
 3   avg_home_rating_defence_binned       3040 non-null   category
 4   points_home_binned                   3040 non-null   category
 5   home_weighted_wins_binned            3040 non-null   category
 6   away_weighted_wins_binned            3040 non-null   category
 7   ewm_home_team_goals_binned           3024 non-null   category
 8   ewm_away_team_goals_conceded_binned  3022 non-null   category
dtypes: category(9)
memory usage: 28.6 KB


In [4]:
df = X.copy()

In [5]:
epsilon = 0.0001

# df['ewm_shoton_diff'] = df['ewm_shoton_home'] - df['ewm_shoton_away']
df['ewm_shoton_ratio'] = df['ewm_shoton_home'] / (df['ewm_shoton_away'] + epsilon)

In [6]:
df.drop(['ewm_shoton_home', 'ewm_shoton_away'], inplace=True, axis=1)

In [7]:
X_train, X_val, y_train, y_val = train_test_split(df, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
evaluate_model(X_train, X_val, y_train, y_val)

0.4816862661207249

In [6]:
df_ = df.copy()

In [7]:
df_['ewm_shoton_diff'] = df_['ewm_shoton_home'] - df_['ewm_shoton_away']
df_['ewm_shoton_ratio'] = df_['ewm_shoton_home'] / df_['ewm_shoton_away']

In [14]:
import statsmodels.api as sm

df_ = df[['ewm_shoton_ratio', 'ewm_shoton_home', 'ewm_shoton_away']].copy()# Predictor variables # Target variable

y_df = pd.DataFrame(y, columns=['result_match'])
combined_df = pd.concat([df_, y_df], axis=1)

combined_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with missing values in either features or target
combined_df = combined_df.dropna()

# Split the combined DataFrame back into features and target
df_clean = combined_df.drop('result_match', axis=1)
y_clean = combined_df['result_match']

X_train, X_val, y_train, y_val = train_test_split(df_clean, y_clean, test_size=0.2, random_state=42, stratify=y_clean)

X = sm.add_constant(X_train)  # Adding a constant
model = sm.OLS(y_train, X_train).fit()

In [16]:
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:           result_match   R-squared (uncentered):                   0.682
Model:                            OLS   Adj. R-squared (uncentered):              0.682
Method:                 Least Squares   F-statistic:                              1716.
Date:                Sat, 27 Jan 2024   Prob (F-statistic):                        0.00
Time:                        18:07:02   Log-Likelihood:                         -2908.5
No. Observations:                2399   AIC:                                      5823.
Df Residuals:                    2396   BIC:                                      5840.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                       coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------

In [None]:
df_clean

In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2404 entries, 2730 to 304
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ewm_shoton_ratio  2404 non-null   float64
 1   ewm_shoton_home   2404 non-null   float64
 2   ewm_shoton_away   2404 non-null   float64
dtypes: float64(3)
memory usage: 75.1 KB


In [11]:
X_train.dropna()

Unnamed: 0,ewm_shoton_ratio,ewm_shoton_home,ewm_shoton_away
2730,0.896533,4.939,5.509
1476,1.355184,5.189,3.829
1428,1.969080,7.642,3.881
482,1.502589,8.416,5.601
1781,1.448695,7.384,5.097
...,...,...,...
313,1.123301,6.778,6.034
2892,1.568541,6.751,4.304
1834,1.258340,7.355,5.845
2832,0.842154,5.490,6.519
