In [43]:
%load_ext autoreload
%autoreload 2

# Control figure size
figsize=(14, 4)

import pandas as pd
from util import util
import numpy as np
import os
data_folder = os.path.join('..', 'data')
file_name = "Data"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Feature Engineering

Up until this point we have only looked at two different ways of modelling a team's strength. Now we want to improve our predictions by adding several more features that could affect the outcome of a match: goals scored and conceded, points, strength of previous schedule, and more.

In [121]:
# Load the file identified by data_folder / file_name and pass it straight
# through util.get_cleaned_data; the first rows are shown as the cell output.
data = util.get_cleaned_data(
    util.load_data(data_folder, file_name)
)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,Season
0,E0,2005-08-13,Aston Villa,Bolton,2.0,2.0,D,2.0,2.0,D,...,6.0,14.0,16.0,7.0,8.0,0.0,2.0,0.0,0.0,506
1,E0,2005-08-13,Everton,Man United,0.0,2.0,A,0.0,1.0,A,...,5.0,15.0,14.0,8.0,6.0,3.0,1.0,0.0,0.0,506
2,E0,2005-08-13,Fulham,Birmingham,0.0,0.0,D,0.0,0.0,D,...,4.0,12.0,13.0,6.0,6.0,1.0,2.0,0.0,0.0,506
3,E0,2005-08-13,Man City,West Brom,0.0,0.0,D,0.0,0.0,D,...,3.0,13.0,11.0,3.0,6.0,2.0,3.0,0.0,0.0,506
4,E0,2005-08-13,Middlesbrough,Liverpool,0.0,0.0,D,0.0,0.0,D,...,7.0,17.0,11.0,5.0,0.0,2.0,3.0,1.0,0.0,506


### Add ELO Ratings

In [122]:
# Relative frequency of each full-time result ('H', 'D', 'A') in the loaded matches.
# The distribution is computed once and indexed three times instead of calling
# value_counts() separately for every factor.
result_shares = data['FTR'].value_counts(normalize=True)
home_factor, draw_factor, away_factor = (result_shares[label] for label in ('H', 'D', 'A'))

# Only draw_factor is passed on below; home_factor and away_factor remain defined as before.
ELO = util.ELO(data, init_rating=1500, draw_factor=draw_factor, k_factor=32, home_advantage=50)
data = ELO.perform_simulations(data)
data = ELO.get_probabilities(data)

### Add columns describing results

In [123]:
# Delegates to util to add the result columns.
# NOTE(review): presumably these are the 'D', 'A' and 'H' columns that show up
# in data.columns further down - confirm against util.
data = util.add_discrete_result_columns(data)

## Add new features

To better describe the difference in strength between the two teams, we want to add the following features:

- **`Diff_goals_scored`**: The difference in goals scored for the last n matches between the two teams
- **`Diff_goals_conceded`**: The difference in goals conceded for the last n matches between the two teams
- **`Matchrating`**: The difference in goal difference for the last n matches between the two teams
- **`Diff_points`**: The difference in points for the last n matches between the two teams
- **`Diff_change_in_ELO`**: The difference in change in ELO for the last n matches between the two teams
- **`Diff_opposition_mean_ELO`**: The difference in the mean ELO of the opposition for the last n matches between the two teams
- **`Diff_shots_on_target_attempted`**: The difference in shots on target attempted for the last n matches between the two teams
- **`Diff_shots_on_target_allowed`**: The difference in shots on target allowed for the last n matches between the two teams
- **`Diff_shots_attempted`**: The difference in shots attempted for the last n matches between the two teams
- **`Diff_shots_allowed`**: The difference in shots allowed for the last n matches between the two teams
- **`Diff_corners_awarded`**: The difference in corners awarded for the last n matches between the two teams
- **`Diff_corners_conceded`**: The difference in corners conceded for the last n matches between the two teams
- **`Diff_yellow_cards`**: The difference in yellow cards for the last n matches between the two teams
- **`Diff_red_cards`**: The difference in red cards for the last n matches between the two teams




### Add goals features

In [124]:
# Settings shared by both rolling goal sums: five-match window, current match excluded
goal_window = dict(n=5, operation='Sum', include_current=False)

# Goals scored by each side over its last five games
data = util.add_sequential_column(data, 'FTHG', 'FTAG', regard_opponent=False, **goal_window)

# Home-minus-away difference in goals scored
data['Diff_goals_scored'] = data['FTHG_Sum_5'] - data['FTAG_Sum_5']

# Goals conceded by each side over its last five games
data = util.add_sequential_column(data, 'FTHG', 'FTAG', regard_opponent=True, **goal_window)

# Home-minus-away difference in goals conceded
data['Diff_goals_conceded'] = data['FTHG_Sum_5_opponent'] - data['FTAG_Sum_5_opponent']

# Goal difference (scored minus conceded) over the last five games, per side
home_goal_diff = data['FTHG_Sum_5'] - data['FTHG_Sum_5_opponent']
away_goal_diff = data['FTAG_Sum_5'] - data['FTAG_Sum_5_opponent']
data['Home Goal Difference last 5'] = home_goal_diff
data['Away Goal Difference last 5'] = away_goal_diff

# Difference in goal difference between the two teams
data['Matchrating'] = home_goal_diff - away_goal_diff

### Add point feature

In [125]:
# Points collected by each side over its last five games (current match excluded)
data = util.add_sequential_column(
    data,
    'Home',
    'Away',
    n=5,
    operation='Points',
    regard_opponent=False,
    include_current=False,
)

# Home-minus-away gap between those point totals
home_points, away_points = data['Home_Points_5'], data['Away_Points_5']
data['Diff_points'] = home_points - away_points


### Add ELO features

In [126]:
# Both ELO features read the same pair of rating columns
elo_columns = ('Home ELO', 'Away ELO')

# Total ELO change over the last five games for each side.
# NOTE(review): this is the only feature built with include_current=True -
# presumably the stored rating is the pre-match value; confirm against util.
data = util.add_sequential_column(
    data, *elo_columns, n=5, operation='Change', regard_opponent=False, include_current=True
)

# Home-minus-away difference in that ELO change
data['Diff_change_in_ELO'] = data['Home ELO_Change_5'] - data['Away ELO_Change_5']

# Mean ELO of the opponents each side faced in its last five games
data = util.add_sequential_column(
    data, *elo_columns, n=5, operation='Mean', regard_opponent=True, include_current=False
)

# Home-minus-away difference in mean opposition ELO
home_opposition_elo = data['Home ELO_Mean_5_opponent']
away_opposition_elo = data['Away ELO_Mean_5_opponent']
data['Diff_opposition_mean_ELO'] = home_opposition_elo - away_opposition_elo

### Add other features

In [127]:
# One row per feature: (home column, away column, regard_opponent, difference column).
# regard_opponent=False sums what the team itself produced in its last five games
# (attempted / awarded); regard_opponent=True sums what its opponents produced
# against it (allowed / conceded).
# The row order is significant: it fixes the order in which columns are appended to `data`.
match_stat_features = [
    ('HST', 'AST', False, 'Diff_shots_on_target_attempted'),
    ('HST', 'AST', True, 'Diff_shots_on_target_allowed'),
    ('HS', 'AS', False, 'Diff_shots_attempted'),
    ('HS', 'AS', True, 'Diff_shots_allowed'),
    ('HC', 'AC', False, 'Diff_corners_awarded'),
    ('HC', 'AC', True, 'Diff_corners_conceded'),
    ('HY', 'AY', False, 'Diff_yellow_cards'),
    ('HR', 'AR', False, 'Diff_red_cards'),
]

for home_col, away_col, regard_opponent, diff_name in match_stat_features:
    # Rolling five-game sum for both sides, current match excluded
    data = util.add_sequential_column(
        data,
        home_col,
        away_col,
        n=5,
        operation='Sum',
        regard_opponent=regard_opponent,
        include_current=False,
    )

    # Column suffix under which the sums for this configuration are read back
    suffix = '_Sum_5_opponent' if regard_opponent else '_Sum_5'

    # Home-minus-away difference between the two rolling sums
    data[diff_name] = data[home_col + suffix] - data[away_col + suffix]

### Column inspection

Check to see if any of the new columns have missing values

In [128]:
# Show every row holding at least one missing value; an empty frame means nothing is missing.
data.loc[data.isna().any(axis='columns')]

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,Diff_corners_awarded,HC_Sum_5_opponent,AC_Sum_5_opponent,Diff_corners_conceded,HY_Sum_5,AY_Sum_5,Diff_yellow_cards,HR_Sum_5,AR_Sum_5,Diff_red_cards


Identify which columns we need to remove. The model may only see columns that say nothing about the outcome of the current match, only information derived from previous matches.

In [129]:
# List every column currently in `data` so the ones to drop can be picked out.
print(data.columns)

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY',
       'AY', 'HR', 'AR', 'Season', 'Home ELO', 'Away ELO', 'ELO diff',
       'Home_prob_ELO', 'Draw_prob_ELO', 'Away_prob_ELO', 'D', 'A', 'H',
       'FTHG_Sum_5', 'FTAG_Sum_5', 'Diff_goals_scored', 'FTHG_Sum_5_opponent',
       'FTAG_Sum_5_opponent', 'Diff_goals_conceded',
       'Home Goal Difference last 5', 'Away Goal Difference last 5',
       'Matchrating', 'Home_Points_5', 'Away_Points_5', 'Diff_points',
       'Home ELO_Change_5', 'Away ELO_Change_5', 'Diff_change_in_ELO',
       'Home ELO_Mean_5_opponent', 'Away ELO_Mean_5_opponent',
       'Diff_opposition_mean_ELO', 'HST_Sum_5', 'AST_Sum_5',
       'Diff_shots_on_target_attempted', 'HST_Sum_5_opponent',
       'AST_Sum_5_opponent', 'Diff_shots_on_target_allowed', 'HS_Sum_5',
       'AS_Sum_5', 'Diff_shots_attempted', 'HS_Sum_5_opponent',
       'AS_Sum_5_opponent', 'Diff_

### Create new dataframe with only the features we want to use

In [130]:
# Build the feature set by dropping columns rather than selecting them.
# Dropped below: the result columns (FTR, D, A, H), half-time and in-match
# statistics of the current match, the raw ELO ratings, and the per-team
# rolling columns that were only needed to compute the Diff_* / Matchrating features.
# NOTE(review): FTHG and FTAG (full-time goals of the current match) are NOT
# dropped - presumably kept as prediction targets; confirm they are never fed
# to a model as input features.

columns_to_remove = [
    "FTR",
    "HTHG",
    "HTAG",
    "HTR",
    "HS",
    "AS",
    "HST",
    "AST",
    "HF",
    "AF",
    "HC",
    "AC",
    "HY",
    "AY",
    "HR",
    "AR",
    "Home ELO",
    "Away ELO",
    "D",
    "A",
    "H",
    "FTHG_Sum_5",
    "FTAG_Sum_5",
    "FTHG_Sum_5_opponent",
    "FTAG_Sum_5_opponent",
    "Home Goal Difference last 5",
    "Away Goal Difference last 5",
    "Home_Points_5",
    "Away_Points_5",
    "Home ELO_Change_5",
    "Away ELO_Change_5",
    "Home ELO_Mean_5_opponent",
    "Away ELO_Mean_5_opponent",
    "HST_Sum_5",
    "AST_Sum_5",
    "HST_Sum_5_opponent",
    "AST_Sum_5_opponent",
    "HS_Sum_5",
    "AS_Sum_5",
    "HS_Sum_5_opponent",
    "AS_Sum_5_opponent",
    "HC_Sum_5",
    "AC_Sum_5",
    "HC_Sum_5_opponent",
    "AC_Sum_5_opponent",
    "HY_Sum_5",
    "AY_Sum_5",
    "HR_Sum_5",
    "AR_Sum_5",
]
featureset_data_frame = data.drop(columns=columns_to_remove)

Check the different datatypes we have in our dataframe.

In [131]:
# Print the dtype of every remaining column; the captured output shows most
# feature columns stored as `object` at this point.
print(featureset_data_frame.dtypes)

Div                                       object
Date                              datetime64[ns]
HomeTeam                                  object
AwayTeam                                  object
FTHG                                     float64
FTAG                                     float64
Season                                    object
ELO diff                                  object
Home_prob_ELO                             object
Draw_prob_ELO                             object
Away_prob_ELO                             object
Diff_goals_scored                         object
Diff_goals_conceded                       object
Matchrating                               object
Diff_points                               object
Diff_change_in_ELO                        object
Diff_opposition_mean_ELO                  object
Diff_shots_on_target_attempted            object
Diff_shots_on_target_allowed              object
Diff_shots_attempted                      object
Diff_shots_allowed  

Convert the columns that should be numeric into numeric dtypes.

In [132]:
columns_to_convert = [
    "FTHG",
    "FTAG",
    "ELO diff",
    "Home_prob_ELO",
    "Draw_prob_ELO",
    "Away_prob_ELO",
    "Diff_goals_scored",
    "Diff_goals_conceded",
    "Matchrating",
    "Diff_points",
    "Diff_change_in_ELO",
    "Diff_opposition_mean_ELO",
    "Diff_shots_on_target_attempted",
    "Diff_shots_on_target_allowed",
    "Diff_shots_attempted",
    "Diff_shots_allowed",
    "Diff_corners_awarded",
    "Diff_corners_conceded",
    "Diff_yellow_cards",
    "Diff_red_cards",
]

# Missing values per column before converting. errors='coerce' replaces anything
# non-numeric with NaN without raising, and the notebook's missing-value check
# runs before this cell, so coerced values would otherwise go unnoticed.
missing_before = featureset_data_frame[columns_to_convert].isna().sum()

# Convert each column to numeric, forcing non-numeric values to NaN
for column in columns_to_convert:
    featureset_data_frame[column] = pd.to_numeric(featureset_data_frame[column], errors='coerce')

# Report columns that gained NaNs during the conversion; prints nothing when
# every value converted cleanly.
newly_missing = featureset_data_frame[columns_to_convert].isna().sum() - missing_before
newly_missing = newly_missing[newly_missing > 0]
if not newly_missing.empty:
    print("WARNING: non-numeric values were coerced to NaN in these columns:")
    print(newly_missing)

# Verify the conversion
print("Data types after conversion:")
print(featureset_data_frame[columns_to_convert].dtypes)

Data types after conversion:
FTHG                              float64
FTAG                              float64
ELO diff                          float64
Home_prob_ELO                     float64
Draw_prob_ELO                     float64
Away_prob_ELO                     float64
Diff_goals_scored                   int64
Diff_goals_conceded                 int64
Matchrating                         int64
Diff_points                         int64
Diff_change_in_ELO                float64
Diff_opposition_mean_ELO          float64
Diff_shots_on_target_attempted      int64
Diff_shots_on_target_allowed        int64
Diff_shots_attempted                int64
Diff_shots_allowed                  int64
Diff_corners_awarded                int64
Diff_corners_conceded               int64
Diff_yellow_cards                   int64
Diff_red_cards                      int64
dtype: object


Now we inspect the types again

In [133]:
# Print the dtypes of the whole frame again, now including the columns that were not converted.
print(featureset_data_frame.dtypes)

Div                                       object
Date                              datetime64[ns]
HomeTeam                                  object
AwayTeam                                  object
FTHG                                     float64
FTAG                                     float64
Season                                    object
ELO diff                                 float64
Home_prob_ELO                            float64
Draw_prob_ELO                            float64
Away_prob_ELO                            float64
Diff_goals_scored                          int64
Diff_goals_conceded                        int64
Matchrating                                int64
Diff_points                                int64
Diff_change_in_ELO                       float64
Diff_opposition_mean_ELO                 float64
Diff_shots_on_target_attempted             int64
Diff_shots_on_target_allowed               int64
Diff_shots_attempted                       int64
Diff_shots_allowed  

In [134]:
# Display the feature frame (the rendered output is a truncated view of its first and last rows).
featureset_data_frame

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,Season,ELO diff,Home_prob_ELO,Draw_prob_ELO,...,Diff_change_in_ELO,Diff_opposition_mean_ELO,Diff_shots_on_target_attempted,Diff_shots_on_target_allowed,Diff_shots_attempted,Diff_shots_allowed,Diff_corners_awarded,Diff_corners_conceded,Diff_yellow_cards,Diff_red_cards
0,E0,2005-08-13,Aston Villa,Bolton,2.0,2.0,0506,0.000000,0.467949,0.207027,...,0.000000,0.000000,0,0,0,0,0,0,0,0
1,E0,2005-08-13,Everton,Man United,0.0,2.0,0506,0.000000,0.467949,0.207027,...,0.000000,0.000000,0,0,0,0,0,0,0,0
2,E0,2005-08-13,Fulham,Birmingham,0.0,0.0,0506,0.000000,0.467949,0.207027,...,0.000000,0.000000,0,0,0,0,0,0,0,0
3,E0,2005-08-13,Man City,West Brom,0.0,0.0,0506,0.000000,0.467949,0.207027,...,0.000000,0.000000,0,0,0,0,0,0,0,0
4,E0,2005-08-13,Middlesbrough,Liverpool,0.0,0.0,0506,0.000000,0.467949,0.207027,...,0.000000,0.000000,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7215,E0,2024-05-19,Crystal Palace,Aston Villa,5.0,0.0,2324,-90.655703,0.335050,0.213414,...,72.723069,-65.930720,15,-16,28,-42,5,-5,4,1
7216,E0,2024-05-19,Liverpool,Wolves,2.0,0.0,2324,243.468628,0.806489,0.075298,...,4.430033,-47.974268,22,-6,56,-18,16,-9,-10,0
7217,E0,2024-05-19,Luton,Fulham,2.0,4.0,2324,-112.212233,0.312037,0.198755,...,-15.373054,-58.254969,-4,11,-14,35,7,4,-1,-1
7218,E0,2024-05-19,Man City,West Ham,3.0,1.0,2324,314.894768,0.864611,0.052681,...,33.032141,-20.597507,14,-28,2,-59,0,-13,-4,0


### Removing the first 5 matches for each team in each season

In [135]:
# Number of opening matches per team and season to discard (matches the section heading).
matches_to_skip = 5

# The helper receives a copy, so featureset_data_frame itself is left untouched.
# Its result is bound directly to final_data_set: the misleadingly named
# intermediate (`...removed6` although 5 matches are removed) and the bare
# expression that displayed nothing mid-cell are gone.
# The helper prints how many matches it removes.
final_data_set = util.remove_the_first_n_matches_in_a_season_for_each_team(
    featureset_data_frame.copy(), matches_to_skip
)

number of matches getting removed:  965


In [136]:
# Display the final data set that is written to disk in the next cell.
final_data_set

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,Season,ELO diff,Home_prob_ELO,Draw_prob_ELO,...,Diff_change_in_ELO,Diff_opposition_mean_ELO,Diff_shots_on_target_attempted,Diff_shots_on_target_allowed,Diff_shots_attempted,Diff_shots_allowed,Diff_corners_awarded,Diff_corners_conceded,Diff_yellow_cards,Diff_red_cards
0,E0,2005-09-17,Aston Villa,Tottenham,1.0,1.0,0506,-25.173204,0.423508,0.224320,...,-4.599564,-15.115140,-9,10,-14,16,20,18,-6,0
1,E0,2005-09-17,Portsmouth,Birmingham,1.0,1.0,0506,6.045620,0.478503,0.202921,...,26.619260,-5.363651,4,-2,4,-4,0,13,1,0
2,E0,2005-09-17,Sunderland,West Brom,1.0,1.0,0506,-32.751187,0.410018,0.229569,...,-12.177547,17.786764,9,-1,-4,8,5,0,-3,1
3,E0,2005-09-18,Blackburn,Newcastle,0.0,3.0,0506,34.014412,0.526477,0.184254,...,34.014412,3.552154,1,-13,7,-15,5,-14,1,0
4,E0,2005-09-18,Man City,Bolton,0.0,1.0,0506,33.333649,0.525329,0.184700,...,37.907289,3.634728,3,3,-8,18,-4,2,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,E0,2024-05-19,Crystal Palace,Aston Villa,5.0,0.0,2324,-90.655703,0.335050,0.213414,...,72.723069,-65.930720,15,-16,28,-42,5,-5,4,1
6251,E0,2024-05-19,Liverpool,Wolves,2.0,0.0,2324,243.468628,0.806489,0.075298,...,4.430033,-47.974268,22,-6,56,-18,16,-9,-10,0
6252,E0,2024-05-19,Luton,Fulham,2.0,4.0,2324,-112.212233,0.312037,0.198755,...,-15.373054,-58.254969,-4,11,-14,35,7,4,-1,-1
6253,E0,2024-05-19,Man City,West Ham,3.0,1.0,2324,314.894768,0.864611,0.052681,...,33.032141,-20.597507,14,-28,2,-59,0,-13,-4,0


In [137]:
# Write the model input next to the raw data. The path is built from data_folder
# (defined in the setup cell) instead of repeating the "../data" literal, so both
# locations stay in sync. index=False keeps the row index out of the file.
final_data_set.to_csv(os.path.join(data_folder, "DataForModel.csv"), index=False)