In [23]:
%load_ext autoreload
%autoreload 2

# Control figure size: default (width, height) tuple intended for plots.
# NOTE(review): `figsize` is not referenced by any cell visible in this part of the notebook — confirm it is still needed.
figsize=(14, 4)

import os

import numpy as np
import pandas as pd

from util import util
# Folder holding the input data: one level above the working directory, then 'data'
data_folder = os.path.join('..', 'data')
# Name handed to util.load_data together with data_folder.
# NOTE(review): no file extension is given — presumably util.load_data resolves it; confirm.
file_name = "Data"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Feature Engineering

Up until this point we have only looked at two different ways of modelling a team's strength. Now we want to improve our predictions by adding several more features that could affect the outcome of a match: goals scored and conceded, points, strength of the previous schedule, and more.

In [2]:
# Read the raw match data and pass it straight through the project's cleaning step
data = util.get_cleaned_data(util.load_data(data_folder, file_name))

# Preview the first rows of the cleaned frame
data.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,Season
0,E0,2005-08-13,Aston Villa,Bolton,2.0,2.0,D,2.0,2.0,D,...,6.0,14.0,16.0,7.0,8.0,0.0,2.0,0.0,0.0,506
1,E0,2005-08-13,Everton,Man United,0.0,2.0,A,0.0,1.0,A,...,5.0,15.0,14.0,8.0,6.0,3.0,1.0,0.0,0.0,506
2,E0,2005-08-13,Fulham,Birmingham,0.0,0.0,D,0.0,0.0,D,...,4.0,12.0,13.0,6.0,6.0,1.0,2.0,0.0,0.0,506
3,E0,2005-08-13,Man City,West Brom,0.0,0.0,D,0.0,0.0,D,...,3.0,13.0,11.0,3.0,6.0,2.0,3.0,0.0,0.0,506
4,E0,2005-08-13,Middlesbrough,Liverpool,0.0,0.0,D,0.0,0.0,D,...,7.0,17.0,11.0,5.0,0.0,2.0,3.0,1.0,0.0,506


### Add ELO Ratings

In [3]:
# Share of home wins ('H'), draws ('D') and away wins ('A') over the whole data set.
# value_counts is evaluated once and reused, instead of being recomputed for each outcome.
outcome_shares = data['FTR'].value_counts(normalize=True)
home_factor, draw_factor, away_factor = outcome_shares['H'], outcome_shares['D'], outcome_shares['A']

# Build the ELO model. Only the draw share is passed on; home_factor and away_factor
# are not used in this cell but stay defined for any later cell that reads them.
ELO = util.ELO(data, init_rating=1500, draw_factor=draw_factor, k_factor=32, home_advantage=50)

# Each step returns the frame that replaces `data`.
# NOTE(review): judging by the column listing later in the notebook, these steps add
# 'Home ELO', 'Away ELO', 'ELO diff' and the three '*_prob_ELO' columns — confirm in util.
data = ELO.perform_simulations(data)
data = ELO.get_probabilities(data)

### Add columns describing results

In [4]:
# Attach discrete columns describing the match result.
# NOTE(review): the column and dtype listings later in this notebook show boolean 'H', 'D', 'A'
# columns right after the ELO columns, presumably created by this call — confirm in util.
data = util.add_discrete_result_columns(data)

## Add new features

To better describe the difference in strength between the two teams, we want to add the following features:

- **`Diff_goals_scored`**: The difference in goals scored for the last n matches between the two teams
- **`Diff_goals_conceded`**: The difference in goals conceded for the last n matches between the two teams
- **`Diff_goal_diff`**: The difference in goal difference for the last n matches between the two teams
- **`Diff_points`**: The difference in points for the last n matches between the two teams
- **`Diff_change_in_ELO`**: The difference in change in ELO for the last n matches between the two teams
- **`Diff_opposition_mean_ELO`**: The difference in the mean ELO of the opposition for the last n matches between the two teams
- **`Diff_shots_on_target_attempted`**: The difference in shots on target attempted for the last n matches between the two teams
- **`Diff_shots_on_target_allowed`**: The difference in shots on target allowed for the last n matches between the two teams
- **`Diff_shots_attempted`**: The difference in shots attempted for the last n matches between the two teams
- **`Diff_shots_allowed`**: The difference in shots allowed for the last n matches between the two teams
- **`Diff_corners_awarded`**: The difference in corners awarded for the last n matches between the two teams
- **`Diff_corners_conceded`**: The difference in corners conceded for the last n matches between the two teams
- **`Diff_yellow_cards`**: The difference in yellow cards for the last n matches between the two teams
- **`Diff_red_cards`**: The difference in red cards for the last n matches between the two teams




### Add goals features

In [5]:
# Rolling goal totals over the last five games (current game excluded), computed twice:
# first the goals each team scored, then the goals each team conceded.
goal_sum_passes = (
    # (regard_opponent, home total column, away total column, difference feature)
    (False, 'FTHG_Sum_5', 'FTAG_Sum_5', 'Diff_goals_scored'),
    (True, 'FTHG_Sum_5_opponent', 'FTAG_Sum_5_opponent', 'Diff_goals_conceded'),
)

for against_opponent, home_total, away_total, diff_feature in goal_sum_passes:
    data = util.add_sequential_column(
        data, 'FTHG', 'FTAG',
        n=5, operation='Sum', regard_opponent=against_opponent, include_current=False,
    )
    # Home team's total minus away team's total
    data[diff_feature] = data[home_total] - data[away_total]

# Goal difference (scored minus conceded) over the last five games, per side
home_goal_diff = data['FTHG_Sum_5'] - data['FTHG_Sum_5_opponent']
away_goal_diff = data['FTAG_Sum_5'] - data['FTAG_Sum_5_opponent']
data['Home Goal Difference last 5'] = home_goal_diff
data['Away Goal Difference last 5'] = away_goal_diff

# Difference in goal difference between the two teams
data['Diff_goal_diff'] = data['Home Goal Difference last 5'] - data['Away Goal Difference last 5']

### Add point feature

In [6]:
# Points collected by each side over its previous five games (current game excluded)
data = util.add_sequential_column(
    data,
    'Home',
    'Away',
    n=5,
    operation='Points',
    regard_opponent=False,
    include_current=False,
)

# Feature: home team's recent points minus the away team's
home_points, away_points = data['Home_Points_5'], data['Away_Points_5']
data['Diff_points'] = home_points - away_points


### Add ELO features

In [7]:
# Rating momentum: total change in ELO over the last five games for each side.
# NOTE(review): this is the only add_sequential_column call visible in this notebook that uses
# include_current=True — presumably because the ELO columns hold pre-match ratings; confirm in util.
data = util.add_sequential_column(
    data, 'Home ELO', 'Away ELO',
    n=5, operation='Change', regard_opponent=False, include_current=True,
)
home_elo_change, away_elo_change = data['Home ELO_Change_5'], data['Away ELO_Change_5']
data['Diff_change_in_ELO'] = home_elo_change - away_elo_change

# Schedule strength: mean ELO of the opponents faced in the last five games (current game excluded)
data = util.add_sequential_column(
    data, 'Home ELO', 'Away ELO',
    n=5, operation='Mean', regard_opponent=True, include_current=False,
)
data['Diff_opposition_mean_ELO'] = (
    data['Home ELO_Mean_5_opponent'] - data['Away ELO_Mean_5_opponent']
)

### Add other features

In [8]:
# Every feature in this cell follows the same recipe: sum a per-match statistic over each
# team's previous games (current game excluded), then subtract the away total from the home
# total. The recipe is written once and driven by the table below instead of being
# copy-pasted per statistic.
#
# Table entries: (home stat column, away stat column, regard_opponent, difference feature)
#   regard_opponent=False -> the team's own statistic ("attempted" / "awarded")
#   regard_opponent=True  -> the statistic of the team's opponents ("allowed" / "conceded")
rolling_sum_features = [
    ('HST', 'AST', False, 'Diff_shots_on_target_attempted'),
    ('HST', 'AST', True, 'Diff_shots_on_target_allowed'),
    ('HS', 'AS', False, 'Diff_shots_attempted'),
    ('HS', 'AS', True, 'Diff_shots_allowed'),
    ('HC', 'AC', False, 'Diff_corners_awarded'),
    ('HC', 'AC', True, 'Diff_corners_conceded'),
    ('HY', 'AY', False, 'Diff_yellow_cards'),
    ('HR', 'AR', False, 'Diff_red_cards'),
]

# Number of previous games taken into account
window = 5

for home_col, away_col, regard_opponent, feature_name in rolling_sum_features:
    data = util.add_sequential_column(
        data, home_col, away_col,
        n=window, operation='Sum', regard_opponent=regard_opponent, include_current=False,
    )

    # Output columns are named '<column>_Sum_<n>', with an '_opponent' suffix when
    # regard_opponent=True (e.g. 'HST_Sum_5' / 'HST_Sum_5_opponent').
    # NOTE(review): pattern taken from the n=5 column names; confirm in util before changing `window`.
    suffix = '_opponent' if regard_opponent else ''
    home_total = f'{home_col}_Sum_{window}{suffix}'
    away_total = f'{away_col}_Sum_{window}{suffix}'

    # Difference between the two teams: home total minus away total
    data[feature_name] = data[home_total] - data[away_total]

### Column inspection

In [9]:
# List every column now present in `data` to check that the engineered features were added
data.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC',
       'AC', 'HY', 'AY', 'HR', 'AR', 'Season', 'Home ELO', 'Away ELO',
       'ELO diff', 'Home_prob_ELO', 'Draw_prob_ELO', 'Away_prob_ELO', 'D', 'A',
       'H', 'FTHG_Sum_5', 'FTAG_Sum_5', 'Diff_goals_scored',
       'FTHG_Sum_5_opponent', 'FTAG_Sum_5_opponent', 'Diff_goals_conceded',
       'Home Goal Difference last 5', 'Away Goal Difference last 5',
       'Diff_goal_diff', 'Home_Points_5', 'Away_Points_5', 'Diff_points',
       'Home ELO_Change_5', 'Away ELO_Change_5', 'Diff_change_in_ELO',
       'Home ELO_Mean_5_opponent', 'Away ELO_Mean_5_opponent',
       'Diff_opposition_mean_ELO', 'HST_Sum_5', 'AST_Sum_5',
       'Diff_shots_on_target_attempted', 'HST_Sum_5_opponent',
       'AST_Sum_5_opponent', 'Diff_shots_on_target_allowed', 'HS_Sum_5',
       'AS_Sum_5', 'Diff_shots_attempted', 'HS_Sum_5_opponent',
       'AS_Sum_5_opp

### Create new dataframe with only the features we want to use

In [10]:
# Columns carried over into the modelling frame; the order of this list is the column order of the result
feature_columns = [
    # match identification
    'Div', 'Date', 'HomeTeam', 'AwayTeam',
    # result indicators and full-time goals
    'H', 'D', 'A', 'FTHG', 'FTAG',
    # ELO ratings
    'Home ELO', 'Away ELO', 'ELO diff',
    'Referee',
    # ELO-based outcome probabilities
    'Home_prob_ELO', 'Draw_prob_ELO', 'Away_prob_ELO',
    # goal and points differences
    'Diff_goals_scored', 'Diff_goals_conceded', 'Diff_goal_diff', 'Diff_points',
    # ELO-derived differences
    'Diff_change_in_ELO', 'Diff_opposition_mean_ELO',
    # shot differences
    'Diff_shots_on_target_attempted', 'Diff_shots_on_target_allowed',
    'Diff_shots_attempted', 'Diff_shots_allowed',
    # corner and card differences
    'Diff_corners_awarded', 'Diff_corners_conceded',
    'Diff_yellow_cards', 'Diff_red_cards',
    'Season',
]

featureset_data_frame = data[feature_columns]

In [26]:
# Correlation between the result indicators and the ELO rating difference.
# In a top-to-bottom run 'ELO diff' is still stored as object dtype here (the numeric
# conversion happens in a later cell), so the selection is coerced to float explicitly.
# Without this, the result depends on cell execution order and on how the installed
# pandas version treats object columns in corr().
outcome_vs_elo = (
    featureset_data_frame[['H', 'D', 'A', 'ELO diff']]
    .apply(pd.to_numeric, errors='coerce')  # unparseable values become NaN
    .astype(float)                          # boolean indicators become 0.0 / 1.0
)
outcome_vs_elo.corr()

Unnamed: 0,H,D,A,ELO diff
H,1.0,-0.525784,-0.575155,0.227712
D,-0.525784,1.0,-0.393436,-0.027842
A,-0.575155,-0.393436,1.0,-0.219336
ELO diff,0.227712,-0.027842,-0.219336,1.0


Check the different datatypes we have in our dataframe.

In [21]:
# Show the dtype of every column; in the output below many of the engineered features appear as object rather than a numeric dtype
print(featureset_data_frame.dtypes)

Div                                       object
Date                              datetime64[ns]
HomeTeam                                  object
AwayTeam                                  object
H                                           bool
D                                           bool
A                                           bool
FTHG                                     float64
FTAG                                     float64
Home ELO                                  object
Away ELO                                  object
ELO diff                                  object
Referee                                   object
Home_prob_ELO                             object
Draw_prob_ELO                             object
Away_prob_ELO                             object
Diff_goals_scored                         object
Diff_goals_conceded                       object
Diff_goal_diff                            object
Diff_points                               object
Diff_change_in_ELO  

Convert the columns that should be numeric into numeric dtypes.

In [26]:
# Columns that must end up with a numeric dtype (most of them arrive as object)
numeric_columns = [
    'FTHG', 'FTAG',
    'Home ELO', 'Away ELO', 'ELO diff',
    'Home_prob_ELO', 'Draw_prob_ELO', 'Away_prob_ELO',
    'Diff_goals_scored', 'Diff_goals_conceded', 'Diff_goal_diff', 'Diff_points',
    'Diff_change_in_ELO', 'Diff_opposition_mean_ELO',
    'Diff_shots_on_target_attempted', 'Diff_shots_on_target_allowed',
    'Diff_shots_attempted', 'Diff_shots_allowed',
    'Diff_corners_awarded', 'Diff_corners_conceded',
    'Diff_yellow_cards', 'Diff_red_cards',
    'Season',
]

# Re-parse every listed column with pd.to_numeric; values that cannot be parsed become NaN.
# assign() returns a new frame with these columns replaced in their original positions,
# so the selection taken from `data` is never written to.
featureset_data_frame = featureset_data_frame.assign(
    **{
        name: pd.to_numeric(featureset_data_frame[name], errors="coerce")
        for name in numeric_columns
    }
)

In [37]:
# Work on the numeric columns only
numeric_features = featureset_data_frame.select_dtypes(include=[np.number])

# Absolute pairwise correlations: the sign does not matter when looking for redundancy
correlation_matrix = numeric_features.corr().abs()

print("Correlation Matrix:")
print(correlation_matrix)

# Pairs correlated more strongly than this are treated as redundant
threshold = 0.8

# Keep only the entries strictly above the diagonal, so each pair is inspected once
# and a column is never compared with itself
upper_triangle_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper_triangle_corr = correlation_matrix.where(upper_triangle_mask)

# Flag a column when it exceeds the threshold against any column that precedes it
exceeds_threshold = upper_triangle_corr.gt(threshold).any(axis=0)
to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

# Remove the flagged columns
data_reduced = numeric_features.drop(columns=to_drop)

# Summary of the reduction
print(f"\nNumber of features removed: {len(to_drop)}")
print(f"Remaining features: {data_reduced.shape[1]}")

Correlation Matrix:
                                    FTHG      FTAG  Home ELO  Away ELO  \
FTHG                            1.000000  0.098209  0.295426  0.192573   
FTAG                            0.098209  1.000000  0.174613  0.264267   
Home ELO                        0.295426  0.174613  1.000000  0.002922   
Away ELO                        0.192573  0.264267  0.002922  1.000000   
ELO diff                        0.345591  0.310774  0.706254  0.705892   
Home_prob_ELO                   0.346599  0.302852  0.714881  0.683203   
Draw_prob_ELO                   0.178227  0.039572  0.477429  0.022333   
Away_prob_ELO                   0.326986  0.319486  0.642498  0.740190   
Diff_goals_scored               0.239676  0.200260  0.439573  0.443762   
Diff_goals_conceded             0.166765  0.160027  0.361904  0.333975   
Diff_goal_diff                  0.258772  0.228648  0.508280  0.494268   
Diff_points                     0.233779  0.207427  0.508287  0.499244   
Diff_change_in_ELO