In [40]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler, OneHotEncoder


Load data:

In [42]:
df = pd.read_csv('EPL_2019_2022.csv')

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1140 entries, 0 to 1139
Columns: 107 entries, Div to season
dtypes: float64(82), int64(17), object(8)
memory usage: 953.1+ KB


In [44]:
df.head(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,season
0,E0,2019-08-09,20:00,Liverpool,Norwich,4,1,1,4,0,...,-2.25,1.91,1.99,1.94,1.98,1.99,2.07,1.9,1.99,2019-2020
1,E0,2019-08-10,12:30,West Ham,Man City,0,5,0,0,1,...,1.75,1.95,1.95,1.96,1.97,2.07,1.98,1.97,1.92,2019-2020
2,E0,2019-08-10,15:00,Bournemouth,Sheffield United,1,1,0,0,0,...,-0.5,1.95,1.95,1.98,1.95,2.0,1.96,1.96,1.92,2019-2020
3,E0,2019-08-10,15:00,Burnley,Southampton,3,0,1,0,0,...,0.0,1.87,2.03,1.89,2.03,1.9,2.07,1.86,2.02,2019-2020
4,E0,2019-08-10,15:00,Crystal Palace,Everton,0,0,0,0,0,...,0.25,1.82,2.08,1.97,1.96,2.03,2.08,1.96,1.93,2019-2020


Get weather data:

In [45]:
df['HomeTeam'].unique()

array(['Liverpool', 'West Ham', 'Bournemouth', 'Burnley',
       'Crystal Palace', 'Watford', 'Tottenham', 'Leicester', 'Newcastle',
       'Man United', 'Arsenal', 'Aston Villa', 'Brighton', 'Everton',
       'Norwich', 'Southampton', 'Man City', 'Sheffield United',
       'Chelsea', 'Wolves', 'Fulham', 'West Brom', 'Leeds', 'Brentford'],
      dtype=object)

In [None]:
team_to_stadium = {
    'Liverpool': 'Anfield',
    'West Ham': 'London Stadium',
    'Bournemouth': 'Vitality Stadium',
    'Burnley': 'Turf Moor',
    'Crystal Palace': 'Selhurst Park',
    'Watford': 'Vicarage Road',
    'Tottenham': 'Tottenham Hotspur Stadium',
    'Leicester': 'King Power Stadium',
    'Newcastle': 'St. James\' Park',
    'Man United': 'Old Trafford',
    'Arsenal': 'Emirates Stadium',
    'Aston Villa': 'Villa Park',
    'Man City': 'Etihad Stadium',
    'Brighton': 'Amex Stadium',
    'Norwich': 'Carrow Road',
    'Southampton': 'St Mary\'s Stadium',
    'Everton': 'Goodison Park',
    'Sheffield United': 'Bramall Lane',
    'Chelsea': 'Stamford Bridge',
    'Wolves': 'Molineux Stadium',
    'Fulham': 'Craven Cottage',
    'West Brom': 'The Hawthorns',
    'Leeds': 'Elland Road',
    'Brentford': 'Brentford Community Stadium'
}

In [None]:
stadium_to_coords = {
    'Anfield': (53.4308, -2.9608),
    'London Stadium': (51.5387, -0.0166),
    'Vitality Stadium': (50.7352, -1.8383),
    'Turf Moor': (53.7890, -2.2302),
    'Selhurst Park': (51.3983, -0.0855),
    'Vicarage Road': (51.6498, -0.4017),
    'Tottenham Hotspur Stadium': (51.6043, -0.0674),
    'King Power Stadium': (52.6204, -1.1422),
    'St. James\' Park': (54.9756, -1.6217),
    'Old Trafford': (53.4631, -2.2913),
    'Emirates Stadium': (51.5549, -0.1084),
    'Villa Park': (52.5092, -1.8849),
    'Etihad Stadium': (53.4831, -2.2003),
    'Amex Stadium': (50.8616, -0.0837),
    'Carrow Road': (52.6221, 1.3086),
    'St Mary\'s Stadium': (50.9058, -1.3911),
    'Goodison Park': (53.4388, -2.9663),
    'Bramall Lane': (53.3703, -1.4714),
    'Stamford Bridge': (51.4817, -0.1910),
    'Molineux Stadium': (52.5902, -2.1304),
    'Craven Cottage': (51.4753, -0.2216),
    'The Hawthorns': (52.5091, -1.9639),
    'Elland Road': (53.7776, -1.5724),
    'Brentford Community Stadium': (51.4908, -0.2887)
}

In [None]:
def add_stadium_lat_long(df):
    """Add stadium name and coordinates to the dataframe based on HomeTeam."""
    df['Stadium'] = df['HomeTeam'].map(team_to_stadium).fillna('Unknown Stadium')
    df['Longitude'] = df['Stadium'].map(lambda x: stadium_to_coords.get(x, (None, None))[1])
    df['Latitude'] = df['Stadium'].map(lambda x: stadium_to_coords.get(x, (None, None))[0])
    return df

In [None]:
df_generated = add_stadium_lat_long(df)

In [None]:
import requests
import time

# Sample API key (replace with your own)
API_KEY = 'bc067b9f91e829eae4aa92b4f319acd3'
BASE_URL = 'https://api.openweathermap.org/data/3.0/onecall/timemachine'

# Function to get weather data for a specific location and date
def get_weather(lat, lon, dt):
    params = {
        'lat': lat,
        'lon': lon,
        'dt': dt,
        'appid': API_KEY,
        'units': 'metric'
    }
    response = requests.get(BASE_URL, params=params)
    return response.json()


# Assuming the date is in the format 'yyyy-mm-dd' and time is 'hh:mm'
df_generated['datetime'] = pd.to_datetime(df_generated['Date'].dt.strftime('%Y-%m-%d') + ' ' + df_generated['Time'])

# Convert the 'datetime' to a Unix timestamp
df_generated['timestamp'] = df_generated['datetime'].apply(lambda x: int(x.timestamp()))

weather_data = []

for index, row in df_generated.iterrows():

    lat = row['Latitude']
    lon = row['Longitude']
    timestamp = row['timestamp']
    weather = get_weather(lat, lon, timestamp)
    weather_data.append(weather)


df_generated['Weather'] = weather_data

# Save the updated DataFrame with weather data to a new CSV file
df_generated.to_csv('EPL_with_weather_full.csv', index=False)

Load saved file:

In [None]:
df_generated = pd.read_csv('EPL_with_weather_full.csv')


In [None]:
df_generated.head(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,Away_Avg_AY,Away_Avg_AR,Home_Team_Rest,Away_Team_Rest,Stadium,Longitude,Latitude,datetime,timestamp,Weather
0,E0,2019-08-09,20:00,Liverpool,Norwich,4,1,1,4,0,...,,,0,0,Anfield,-2.9608,53.4308,2019-08-09 20:00:00,1565380800,"{'lat': 53.4308, 'lon': -2.9608, 'timezone': '..."
1,E0,2019-08-10,12:30,West Ham,Man City,0,5,0,0,1,...,,,0,0,London Stadium,-0.0166,51.5387,2019-08-10 12:30:00,1565440200,"{'lat': 51.5387, 'lon': -0.0166, 'timezone': '..."
2,E0,2019-08-10,15:00,Bournemouth,Sheffield United,1,1,0,0,0,...,,,0,0,Vitality Stadium,-1.8383,50.7352,2019-08-10 15:00:00,1565449200,"{'lat': 50.7352, 'lon': -1.8383, 'timezone': '..."
3,E0,2019-08-10,15:00,Burnley,Southampton,3,0,1,0,0,...,,,0,0,Turf Moor,-2.2302,53.789,2019-08-10 15:00:00,1565449200,"{'lat': 53.789, 'lon': -2.2302, 'timezone': 'E..."
4,E0,2019-08-10,15:00,Crystal Palace,Everton,0,0,0,0,0,...,,,0,0,Selhurst Park,-0.0855,51.3983,2019-08-10 15:00:00,1565449200,"{'lat': 51.3983, 'lon': -0.0855, 'timezone': '..."


In [None]:
import ast
def extract_weather_info(weather_str):
    try:
        # Convert the string to a dictionary
        weather_json = ast.literal_eval(weather_str)
        
        # Extract relevant data
        if weather_json and 'data' in weather_json and len(weather_json['data']) > 0:
            data = weather_json['data'][0]
            temp = data.get('temp')
            humidity = data.get('humidity')
            wind_speed = data.get('wind_speed')
            wind_direction = data.get('wind_deg')
            weather_code = data['weather'][0]['main'] if 'weather' in data and len(data['weather']) > 0 else None
            precipitation = data['rain'].get('1h') if 'rain' in data else 0
            return temp, humidity, wind_direction, precipitation, wind_speed, weather_code
    except (ValueError, SyntaxError):
        # Handle cases where the string cannot be parsed
        return None, None, None, None, None, None
    return None, None, None, None, None, None


In [None]:
df_generated[['Temperature', 'Humidity', 'WindDirection', 'Precipitation', 'WindSpeed', 'WeatherCode']] = df_generated['Weather'].apply(lambda x: pd.Series(extract_weather_info(x)))


Save file again:

In [69]:
df_generated.to_csv('EPL_with_weather_full.csv', index=False)

Reload from files:

In [2]:
df_generated = pd.read_csv('EPL_with_weather_full.csv')

In [3]:
df_generated.head(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,Latitude,datetime,timestamp,Weather,Temperature,Humidity,WindDirection,Precipitation,WindSpeed,WeatherCode
0,E0,2019-08-09,20:00,Liverpool,Norwich,4,1,1,4,0,...,53.4308,2019-08-09 20:00:00,1565380800,"{'lat': 53.4308, 'lon': -2.9608, 'timezone': '...",18.6,77,180,0.0,7.7,Clouds
1,E0,2019-08-10,12:30,West Ham,Man City,0,5,0,0,1,...,51.5387,2019-08-10 12:30:00,1565440200,"{'lat': 51.5387, 'lon': -0.0166, 'timezone': '...",21.39,63,230,0.0,10.3,Clouds
2,E0,2019-08-10,15:00,Bournemouth,Sheffield United,1,1,0,0,0,...,50.7352,2019-08-10 15:00:00,1565449200,"{'lat': 50.7352, 'lon': -1.8383, 'timezone': '...",19.14,74,250,0.0,12.9,Clouds
3,E0,2019-08-10,15:00,Burnley,Southampton,3,0,1,0,0,...,53.789,2019-08-10 15:00:00,1565449200,"{'lat': 53.789, 'lon': -2.2302, 'timezone': 'E...",16.33,96,236,2.29,7.05,Rain
4,E0,2019-08-10,15:00,Crystal Palace,Everton,0,0,0,0,0,...,51.3983,2019-08-10 15:00:00,1565449200,"{'lat': 51.3983, 'lon': -0.0855, 'timezone': '...",20.56,66,230,0.0,12.3,Rain


In [4]:
df_generated['WeatherCode'].value_counts()

Clouds     577
Rain       289
Clear      212
Drizzle     23
Mist        16
Haze        10
Fog          8
Snow         5
Name: WeatherCode, dtype: int64

In [5]:
def calculate_prv_stats(df):
    # Ensure Date column is in datetime format and sort the dataframe by date
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by='Date').reset_index(drop=True)

    # Create new columns to store the statistics
    df['Home_Prv_FTR'] = None
    df['Home_Prv_FTHG'] = 0.0
    df['Home_Prv_HS'] = 0.0
    df['Home_Prv_HST'] = 0.0
    df['Home_Prv_HF'] = 0.0
    df['Home_Prv_HC'] = 0.0
    df['Home_Prv_HY'] = 0.0
    df['Home_Prv_HR'] = 0.0
    df['Home_Prv_3_FTHG'] = 0.0

    df['Away_Prv_FTR'] = None
    df['Away_Prv_FTAG'] = 0.0
    df['Away_Prv_AS'] = 0.0
    df['Away_Prv_AST'] = 0.0
    df['Away_Prv_AF'] = 0.0
    df['Away_Prv_AC'] = 0.0
    df['Away_Prv_AY'] = 0.0
    df['Away_Prv_AR'] = 0.0
    df['Away_Prv_3_FTAG'] = 0.0

    # Iterate through each row to calculate statistics
    for index, row in df.iterrows():
        home_team = row['HomeTeam']
        away_team = row['AwayTeam']

        # Get the previous games for the home team
        home_prev_games = df[(df['HomeTeam'] == home_team) & (df['Date'] < row['Date'])]

        if not home_prev_games.empty:
            last_home_game = home_prev_games.iloc[-1]
            df.at[index, 'Home_Prv_FTR'] = last_home_game['FTR']
            df.at[index, 'Home_Prv_FTHG'] = last_home_game['FTHG']
            df.at[index, 'Home_Prv_HS'] = last_home_game['HS']
            df.at[index, 'Home_Prv_HST'] = last_home_game['HST']
            df.at[index, 'Home_Prv_HF'] = last_home_game['HF']
            df.at[index, 'Home_Prv_HC'] = last_home_game['HC']
            df.at[index, 'Home_Prv_HY'] = last_home_game['HY']
            df.at[index, 'Home_Prv_HR'] = last_home_game['HR']

            last_3_home_games = home_prev_games.tail(3)
            df.at[index, 'Home_Prv_3_FTHG'] = last_3_home_games['FTHG'].mean()

        # Get the previous games for the away team
        away_prev_games = df[(df['AwayTeam'] == away_team) & (df['Date'] < row['Date'])]

        if not away_prev_games.empty:
            last_away_game = away_prev_games.iloc[-1]
            df.at[index, 'Away_Prv_FTR'] = last_away_game['FTR']
            df.at[index, 'Away_Prv_FTAG'] = last_away_game['FTAG']
            df.at[index, 'Away_Prv_AS'] = last_away_game['AS']
            df.at[index, 'Away_Prv_AST'] = last_away_game['AST']
            df.at[index, 'Away_Prv_AF'] = last_away_game['AF']
            df.at[index, 'Away_Prv_AC'] = last_away_game['AC']
            df.at[index, 'Away_Prv_AY'] = last_away_game['AY']
            df.at[index, 'Away_Prv_AR'] = last_away_game['AR']

            last_3_away_games = away_prev_games.tail(3)
            df.at[index, 'Away_Prv_3_FTAG'] = last_3_away_games['FTAG'].mean()


    return df

In [6]:
# Assuming 'df' is your original DataFrame
df_generated = calculate_prv_stats(df_generated)

In [8]:
df_generated.tail(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,Home_Prv_3_FTHG,Away_Prv_FTR,Away_Prv_FTAG,Away_Prv_AS,Away_Prv_AST,Away_Prv_AF,Away_Prv_AC,Away_Prv_AY,Away_Prv_AR,Away_Prv_3_FTAG
1135,E0,2022-05-22,16:00,Liverpool,Wolves,3,1,1,1,1,...,2.333333,0,2.0,14.0,4.0,11.0,3.0,3.0,0.0,0.666667
1136,E0,2022-05-22,16:00,Crystal Palace,Man United,1,0,1,1,0,...,1.333333,1,0.0,15.0,5.0,9.0,6.0,2.0,0.0,0.333333
1137,E0,2022-05-22,16:00,Leicester,Southampton,4,1,1,0,0,...,1.333333,1,0.0,19.0,4.0,9.0,7.0,1.0,0.0,0.666667
1138,E0,2022-05-22,16:00,Burnley,Newcastle,1,2,0,0,1,...,1.333333,1,0.0,7.0,3.0,11.0,2.0,3.0,0.0,1.333333
1139,E0,2022-05-22,16:00,Norwich,Tottenham,0,5,0,0,2,...,0.666667,0,1.0,8.0,3.0,8.0,5.0,2.0,0.0,1.666667


In [9]:
df_generated[(df_generated['Home_Prv_FTR'].isna()) |(df_generated['Away_Prv_FTR'].isna()) ]

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,Home_Prv_3_FTHG,Away_Prv_FTR,Away_Prv_FTAG,Away_Prv_AS,Away_Prv_AST,Away_Prv_AF,Away_Prv_AC,Away_Prv_AY,Away_Prv_AR,Away_Prv_3_FTAG
0,E0,2019-08-09,20:00,Liverpool,Norwich,4,1,1,4,0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E0,2019-08-10,12:30,West Ham,Man City,0,5,0,0,1,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E0,2019-08-10,15:00,Bournemouth,Sheffield United,1,1,0,0,0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E0,2019-08-10,15:00,Burnley,Southampton,3,0,1,0,0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E0,2019-08-10,15:00,Crystal Palace,Everton,0,0,0,0,0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,E0,2019-08-10,15:00,Watford,Brighton,0,3,0,0,1,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,E0,2019-08-10,17:30,Tottenham,Aston Villa,3,1,1,0,1,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,E0,2019-08-11,14:00,Leicester,Wolves,0,0,0,0,0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,E0,2019-08-11,14:00,Newcastle,Arsenal,0,1,0,0,0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,E0,2019-08-11,16:30,Man United,Chelsea,4,0,1,1,0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
def add_rest_columns(df):
    # Sort the dataframe by date
    df = df.sort_values(by='Date')
    
    # Dictionary to store the last few game dates for each team
    last_game_dates = {}
    
    # Lists to store the rest days and average rest days for home and away teams
    home_team_rest = []
    home_team_avg_rest = []
    away_team_rest = []
    away_team_avg_rest = []
    
    # Iterate over each row in the dataframe
    for index, row in df.iterrows():
        home_team = row['HomeTeam']
        away_team = row['AwayTeam']
        game_date = row['Date']
        
        # Calculate home team's rest
        if home_team in last_game_dates and len(last_game_dates[home_team]) > 0:
            rest_days = (game_date - last_game_dates[home_team][-1]).days
            home_team_rest.append(rest_days)
        else:
            home_team_rest.append(None)
        
        # Calculate away team's rest
        if away_team in last_game_dates and len(last_game_dates[away_team]) > 0:
            rest_days = (game_date - last_game_dates[away_team][-1]).days
            away_team_rest.append(rest_days)
        else:
            away_team_rest.append(None)
        
        # Calculate home team's average rest over the last 3 games
        if home_team in last_game_dates and len(last_game_dates[home_team]) >= 3:
            last_three_games = last_game_dates[home_team][-3:]
            rest_days_prv_3 = sum([(last_three_games[i] - last_three_games[i-1]).days for i in range(1, 3)])
            rest_days_current = (game_date - last_three_games[-1]).days
            avg_rest_days = (rest_days_prv_3 + rest_days_current)/3
            home_team_avg_rest.append(avg_rest_days)
        else:
            home_team_avg_rest.append(None)
        
        # Calculate away team's average rest over the last 3 games
        if away_team in last_game_dates and len(last_game_dates[away_team]) >= 3:
            last_three_games = last_game_dates[away_team][-3:]
            rest_days_prv_3 = sum([(last_three_games[i] - last_three_games[i-1]).days for i in range(1, 3)])
            rest_days_current = (game_date - last_three_games[-1]).days
            avg_rest_days = (rest_days_prv_3 + rest_days_current)/3
            away_team_avg_rest.append(avg_rest_days)
        else:
            away_team_avg_rest.append(None)
        
        # Update last game dates for home and away teams
        if home_team not in last_game_dates:
            last_game_dates[home_team] = []
        if away_team not in last_game_dates:
            last_game_dates[away_team] = []
        
        last_game_dates[home_team].append(game_date)
        last_game_dates[away_team].append(game_date)
    
    # Add new columns to the dataframe
    df['Home_Team_Rest'] = home_team_rest
    df['Away_Team_Rest'] = away_team_rest
    df['Home_Team_Avg_Rest_Last_3'] = home_team_avg_rest
    df['Away_Team_Avg_Rest_Last_3'] = away_team_avg_rest
    
    return df

In [16]:
df_generated = add_rest_columns(df_generated)

In [17]:
df_generated.tail(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,Away_Prv_AST,Away_Prv_AF,Away_Prv_AC,Away_Prv_AY,Away_Prv_AR,Away_Prv_3_FTAG,Home_Team_Rest,Away_Team_Rest,Home_Team_Avg_Rest_Last_3,Away_Team_Avg_Rest_Last_3
1136,E0,2022-05-22,16:00,Crystal Palace,Man United,1,0,1,1,0,...,5.0,9.0,6.0,2.0,0.0,0.333333,3.0,15.0,5.0,8.0
1137,E0,2022-05-22,16:00,Leicester,Southampton,4,1,1,0,0,...,4.0,9.0,7.0,1.0,0.0,0.666667,3.0,5.0,3.666667,7.333333
1135,E0,2022-05-22,16:00,Liverpool,Wolves,3,1,1,1,1,...,4.0,11.0,3.0,3.0,0.0,0.666667,5.0,7.0,5.0,5.0
1134,E0,2022-05-22,16:00,Man City,Aston Villa,3,2,1,0,1,...,7.0,11.0,5.0,2.0,0.0,1.333333,7.0,3.0,4.666667,4.0
1139,E0,2022-05-22,16:00,Norwich,Tottenham,0,5,0,0,2,...,3.0,8.0,5.0,2.0,0.0,1.666667,7.0,7.0,4.666667,5.0


Encode Time of the match to Morning, Afternoon, and Evening:

In [None]:
df_generated['Time'].value_counts()

15:00    276
20:00    142
14:00    122
17:30    103
12:30     94
16:30     88
18:00     74
20:15     62
19:30     37
16:00     32
19:45     32
12:00     28
19:15     19
19:00     13
14:15      9
14:05      5
13:30      1
16:15      1
17:45      1
15:30      1
Name: Time, dtype: int64

In [None]:
def encode_time_of_day(time_str):
    # Convert time string to hours and minutes
    hour, minute = map(int, time_str.split(':'))
    
    # Define the time categories
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

# Apply the function to the 'Time' column
df_generated['Time_of_Day'] = df_generated['Time'].apply(encode_time_of_day)

# Verify the encoding
df_generated['Time_of_Day'].value_counts()

Afternoon    657
Evening      483
Name: Time_of_Day, dtype: int64

Save to file to check:

In [25]:
df_generated.to_csv('EPL_with_weather_full_features.csv', index=False)

In [31]:
df_generated = pd.read_csv('EPL_with_weather_full_features.csv')

In [32]:
df_generated.tail(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,Away_Prv_AF,Away_Prv_AC,Away_Prv_AY,Away_Prv_AR,Away_Prv_3_FTAG,Home_Team_Rest,Away_Team_Rest,Home_Team_Avg_Rest_Last_3,Away_Team_Avg_Rest_Last_3,Time_of_Day
1135,E0,2022-05-22,16:00,Crystal Palace,Man United,1,0,1,1,0,...,9.0,6.0,2.0,0.0,0.333333,3.0,15.0,5.0,8.0,Afternoon
1136,E0,2022-05-22,16:00,Leicester,Southampton,4,1,1,0,0,...,9.0,7.0,1.0,0.0,0.666667,3.0,5.0,3.666667,7.333333,Afternoon
1137,E0,2022-05-22,16:00,Liverpool,Wolves,3,1,1,1,1,...,11.0,3.0,3.0,0.0,0.666667,5.0,7.0,5.0,5.0,Afternoon
1138,E0,2022-05-22,16:00,Man City,Aston Villa,3,2,1,0,1,...,11.0,5.0,2.0,0.0,1.333333,7.0,3.0,4.666667,4.0,Afternoon
1139,E0,2022-05-22,16:00,Norwich,Tottenham,0,5,0,0,2,...,8.0,5.0,2.0,0.0,1.666667,7.0,7.0,4.666667,5.0,Afternoon


Standardizing features:

In [52]:
scale_features = ['Temperature', 'Humidity', 'WindDirection', 'Precipitation', 'WindSpeed']
numerical_features = [
    'Home_Prv_FTR', 'Home_Prv_FTHG', 'Home_Prv_HS', 'Home_Prv_HST',
    'Home_Prv_HF', 'Home_Prv_HC', 'Home_Prv_HY', 'Home_Prv_HR',
    'Home_Prv_3_FTHG', 'Away_Prv_FTR', 'Away_Prv_FTAG', 'Away_Prv_AS',
    'Away_Prv_AST', 'Away_Prv_AF', 'Away_Prv_AC', 'Away_Prv_AY',
    'Away_Prv_AR', 'Away_Prv_3_FTAG', 'Home_Team_Rest', 'Away_Team_Rest',
    'Home_Team_Avg_Rest_Last_3', 'Away_Team_Avg_Rest_Last_3'
]
encode_features = ['WeatherCode','Time_of_Day']
target_feature = ['FTR']
id_features = ['Date','Time','season']

all_features = []
all_features.extend(scale_features)
all_features.extend(numerical_features)
all_features.extend(encode_features)
all_features.extend(target_feature)
all_features.extend(id_features)

Find number of rows with missing values:

In [53]:
df_used_features = df_generated[all_features]
df_used_features.tail(5)

Unnamed: 0,Temperature,Humidity,WindDirection,Precipitation,WindSpeed,Home_Prv_FTR,Home_Prv_FTHG,Home_Prv_HS,Home_Prv_HST,Home_Prv_HF,...,Home_Team_Rest,Away_Team_Rest,Home_Team_Avg_Rest_Last_3,Away_Team_Avg_Rest_Last_3,WeatherCode,Time_of_Day,FTR,Date,Time,season
1135,22.87,35,170,0.0,4.12,1.0,1.0,15.0,7.0,11.0,...,3.0,15.0,5.0,8.0,Clear,Afternoon,1,2022-05-22,16:00,2021-2022
1136,18.69,62,219,0.0,4.44,1.0,3.0,20.0,8.0,10.0,...,3.0,5.0,3.666667,7.333333,Clouds,Afternoon,1,2022-05-22,16:00,2021-2022
1137,17.12,72,230,0.2,5.66,0.0,1.0,22.0,3.0,12.0,...,5.0,7.0,5.0,5.0,Rain,Afternoon,1,2022-05-22,16:00,2021-2022
1138,19.85,64,220,0.0,5.14,1.0,5.0,21.0,9.0,6.0,...,7.0,3.0,4.666667,4.0,Clouds,Afternoon,1,2022-05-22,16:00,2021-2022
1139,18.2,49,160,0.0,4.12,0.0,0.0,8.0,2.0,7.0,...,7.0,7.0,4.666667,5.0,Clouds,Afternoon,0,2022-05-22,16:00,2021-2022


In [54]:
rows_with_na = df_used_features[df_used_features.isna().any(axis=1)]
rows_with_na.shape

(41, 33)

Drop NA values:

In [60]:
df_cleaned = df_used_features.dropna()

Scaling features:

In [61]:
scaler = StandardScaler()
df_cleaned[scale_features] = scaler.fit_transform(df_cleaned[scale_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Encoding features:

In [62]:
df_cleaned = pd.get_dummies(df_cleaned, columns=encode_features)

In [65]:
df_cleaned['Date']

30      2019-08-31
31      2019-08-31
32      2019-08-31
33      2019-08-31
34      2019-08-31
           ...    
1135    2022-05-22
1136    2022-05-22
1137    2022-05-22
1138    2022-05-22
1139    2022-05-22
Name: Date, Length: 1099, dtype: object

In [66]:
df_cleaned.to_csv('EPL_Final_cleaned_features.csv', index=False)