In [1]:
import os
import sys

root_path = os.path.abspath(os.path.join('../..')) # <- adjust such that root_path always points at the root project dir (i.e. if current file is two folders deep, use '../..'). 
if root_path not in sys.path:
    sys.path.append(root_path)

import database_server.db_utilities as dbu 
from cleaning.data_cleaning import DataCleaning
from models.trad_ml.trad_ml import FeatureGen

import pandas as pd

In [2]:
# define feature params_dict
feature_params_dict = {
    'ma_alpha': 0.35,
    'ma_min_periods': 6,
    'ma_restart_each_season': True,

    'min_non_na_share': 0.9,

    'merge_type': 'wide', # how should feature rows of two teams be combined? -> one of ['wide', 'diff_or_ratio']

    'apply_ohe': False, # True -> one-hot encode selected features, False -> drop all categorical features
    'ohe_name': None, # load fitted ohe from file <- must not be None when generating prediction features!

    'tt_split_cutoff_date': None,
    'tt_split_test_season': '2022-2023',

    'apply_scaling': True,
    'scaler_name': None, # load fitted scaler from file <- must not be None when generating prediction features!

    'apply_pca': True,
    'pca_name': None, # load fitted pca from file (provide filename without .pkl suffix) <- must not be None when generating prediction features!
    'pca_n_components': 0.95, # only relevant if not loading fitted pca

    'targets': ['gf', 'ga'], # one of [['gf', 'ga'], ['xg', 'xga']]
    'target_as_diff': True # if True, target is provided as difference between home and away team goals or xg, i.e. only a single target column is returned
}

In [3]:
# test feature generation for training
fg = FeatureGen(feature_params_dict)

X_train, X_test, y_train, y_test, nfc = fg.generate_features(incl_non_feature_cols=True, training=True)

Loaded db data set with shape (21708, 159).
df shape after ma computation: (21708, 162)
df shape after dropping categoricals: (21708, 155)
------------------------------------------------------------
merged_data shape (before dropping duplicates): (10854, 310)
merged_data.shape before dropping aways: (10854, 310)
merged_data.shape after dropping aways: (10854, 302)
merged_data shape (after dropping duplicates): (10854, 300)
n cols ending with '_home': 146
n cols ending with '_away': 146
n cols with neiter '_home' nor '_away': 8
['gf', 'ga', 'season_str', 'league_id', 'team_id', 'opponent_id', 'match_id', 'schedule_date']
------------------------------------------------------------
df shape after merge: (10854, 300)
n rows with any na after merge: 3676
df shape after dropping na rows over na threshold: (10854, 300)
features pre scaling: (10854, 298)
X_train, y_train shape after final na row drop: ((5693, 298), (5693, 2))
X_test, y_test shape after final na row drop: ((1485, 298), (1485,

In [14]:
X_train = X_train[[c for c in X_train.columns if not c in nfc]]
X_test = X_test[[c for c in X_test.columns if not c in nfc]]

In [15]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# Create an instance of the random forest regressor
rf_model_prob = RandomForestClassifier()

# Fit the model to the training data
rf_model_prob.fit(X_train, y_train)
y_pred_prob = rf_model_prob.predict_proba(X_test)

# Concatenate the two lists along the columns axis
combined_prob = np.concatenate(y_pred_prob, axis=1)

# Create a DataFrame from the combined probabilities
df_pred_prob = pd.DataFrame(combined_prob, columns=['Column{}'.format(i+1) for i in range(combined_prob.shape[1])])



# Devide date into two dataframes
goal_prob_home = df_pred_prob.iloc[:, :10]
goal_prob_away = df_pred_prob.iloc[:, 10:]

# Change column names
new_columns = ['prob_{}_goal_home'.format(i+1) for i in range(goal_prob_home.shape[1])]
goal_prob_home.columns = new_columns

new_columns = ['prob_{}_goal_away'.format(i+1) for i in range(goal_prob_away.shape[1])]
goal_prob_away.columns = new_columns

#COMPUTE THE PROBABILITES OF HOMEWIN, AWAYWIN AND DRAW AND ADD TO Y_TEST DF
# Get the number of goals columns for home and away teams
num_goals_home = goal_prob_home.shape[1]
num_goals_away = goal_prob_away.shape[1]

# Create a new data frame to store the results
winning_prob_df = pd.DataFrame(index=goal_prob_home.index)

# Create three arrays for the propabilitis
home_winning_prob = pd.Series(0.0, index=goal_prob_home.index)

away_winning_prob = pd.Series(0.0, index=goal_prob_home.index)

draw_prob = pd.Series(0.0, index=goal_prob_home.index)

# Iterate over all possible goal combinations
for home_goals in range(num_goals_home):
    for away_goals in range(num_goals_away):
        # Home team wins if home goals > away goals
        if home_goals > away_goals:
            home_winning_prob += goal_prob_home.iloc[:, home_goals] * goal_prob_away.iloc[:, away_goals]
        # Away team wins if home goals < away goals
        elif home_goals < away_goals:
            away_winning_prob += goal_prob_home.iloc[:, home_goals] * goal_prob_away.iloc[:, away_goals]
        # Match ends in a draw if home goals = away goals
        else:
            draw_prob += goal_prob_home.iloc[:, home_goals] * goal_prob_away.iloc[:, away_goals]


# Add the probabilities to the y_test
y_test['home_winning_prob'] = home_winning_prob
y_test['draw_prob'] = draw_prob
y_test['away_winning_prob'] = away_winning_prob



In [20]:
y_test['result_home_perspective'] = y_test.apply(lambda x: 'W' if x['gf']>x['ga'] else 'L' if x['gf']<x['ga'] else 'D', axis=1)

y_test = y_test.rename(columns={'draw_prob': 'D',
                        'away_winning_prob': 'L',
                        'home_winning_prob': 'W'})

#Find the column with the maximum probability (H, betD, or betA)
y_test['Prediction'] = y_test[['D', 'L', 'W']].idxmax(axis=1)

#Compare the Prediction column with the FTR column
y_test['Correct'] = y_test['Prediction'] == y_test['result_home_perspective']

lol = y_test[["Prediction","result_home_perspective" ,"Correct"]]

#Calculate the share of true answers
share_true_rf_dif = y_test['Correct'].mean()

#Print the share of true answers
print(share_true_rf_dif)

0.5057239057239057


In [24]:
#get feature importance and stor them into a data frame.
feature_importances_rf = rf_model_prob.feature_importances_
feature_names_rf = X_train.columns
df_feature_importances_rf = pd.DataFrame({'Feature': feature_names_rf, 'Importance': feature_importances_rf})

#sort the DataFrame by importance values in descending order
df_feature_importances_rf = df_feature_importances_rf.sort_values(by='Importance', ascending=False)
print(df_feature_importances_rf)

   Feature  Importance
1        1    0.019370
0        0    0.009205
5        5    0.008886
51      51    0.008477
10      10    0.008469
..     ...         ...
22      22    0.007570
26      26    0.007541
96      96    0.007539
15      15    0.007495
33      33    0.007487

[123 rows x 2 columns]


Unnamed: 0,gf,ga,home_winning_prob,draw_prob,away_winning_prob,result
0,1,0,0.3056,0.2299,0.4645,W
1,1,0,0.4727,0.2178,0.3095,W
2,1,2,0.6032,0.1933,0.2035,L
3,3,1,0.5293,0.2418,0.2289,W
4,2,1,0.6511,0.1792,0.1697,W
...,...,...,...,...,...,...
1480,0,1,0.4773,0.2597,0.2630,L
1481,3,3,0.2611,0.2600,0.4789,D
1482,2,1,0.4935,0.2333,0.2732,W
1483,1,1,0.5189,0.2415,0.2396,D


In [16]:
y_test

Unnamed: 0,gf,ga,home_winning_prob,draw_prob,away_winning_prob
0,1,0,0.3056,0.2299,0.4645
1,1,0,0.4727,0.2178,0.3095
2,1,2,0.6032,0.1933,0.2035
3,3,1,0.5293,0.2418,0.2289
4,2,1,0.6511,0.1792,0.1697
...,...,...,...,...,...
1480,0,1,0.4773,0.2597,0.2630
1481,3,3,0.2611,0.2600,0.4789
1482,2,1,0.4935,0.2333,0.2732
1483,1,1,0.5189,0.2415,0.2396
