### ACCESSING DATA - TEST DATA 

#### Data Quality and Tidiness Issues

In [1]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import pickle
from sklearn.metrics import r2_score, mean_squared_error
#get current season data from FPL API endpoints and identify the keys
fpl_base_url = 'https://fantasy.premierleague.com/api/'
current_season = requests.get(fpl_base_url+'bootstrap-static/').json()
#json = r.json()
current_season.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [2]:
#create dataframes for the current season dictionary keys for data exploration
#- Contains summary of Gameweek data
events_df = pd.DataFrame(current_season['events']) #
phases_df = pd.DataFrame(current_season['phases']) #Shows calendar months for game weeks
teams_df = pd.DataFrame(current_season['teams'])
players_df = pd.DataFrame(current_season['elements'])
element_stats_df = pd.DataFrame(current_season['element_stats'])
element_types_df = pd.DataFrame(current_season['element_types'])

#Code to extract weekly game data for all active players in current season
for x in players_df.index :
    print(x)
    player_id = players_df.id[x]
    url = f'https://fantasy.premierleague.com/api/element-summary/{player_id}/'
    r = requests.get(url)
    json = r.json()
    json_history_df = pd.DataFrame(json['history'])
    json_history_past_df = pd.DataFrame(json['history_past'])

       
    if x == 0 :
        all_history_df = json_history_df
        #all_history_past_df = json_history_past_df
    else : 
        all_history_df = all_history_df.append(json_history_df)
        #all_history_past_df = all_history_past_df.append(json_history_past_df)

In [3]:
#Code to save the all players game week data in current and past seasons to csvs
#all_history_df.to_csv('/home/laniolao/fpl/FantasyPremierLeague/current_season.csv')
#all_history_past_df.to_csv('/home/laniolao/fpl/FantasyPremierLeague/past_seasons.csv')

In [4]:
# Read Data
df_currentseason = pd.read_csv('current_season.csv')
df_currentseason.head()

Unnamed: 0.1,Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,...,bps,influence,creativity,threat,ict_index,value,transfers_balance,selected,transfers_in,transfers_out
0,0,1,1,7,0,False,2022-08-05T19:00:00Z,0,2,1,...,0,0.0,0.0,0.0,0.0,45,0,23970,0,0
1,1,1,11,10,0,True,2022-08-13T14:00:00Z,4,2,2,...,0,0.0,0.0,0.0,0.0,44,-5169,24193,1361,6530
2,2,1,21,3,0,False,2022-08-20T16:30:00Z,0,3,3,...,0,0.0,0.0,0.0,0.0,44,-4337,20960,879,5216
3,3,1,31,9,0,True,2022-08-27T16:30:00Z,2,1,4,...,0,0.0,0.0,0.0,0.0,43,-2988,18825,577,3565
4,4,1,41,2,0,True,2022-08-31T18:30:00Z,2,1,5,...,0,0.0,0.0,0.0,0.0,43,-1611,17790,405,2016


In [5]:
# Print all columns
df_currentseason.columns

Index(['Unnamed: 0', 'element', 'fixture', 'opponent_team', 'total_points',
       'was_home', 'kickoff_time', 'team_h_score', 'team_a_score', 'round',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out'],
      dtype='object')

In [6]:
#Descriptive information on features
df_currentseason.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4019 entries, 0 to 4018
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4019 non-null   int64  
 1   element            4019 non-null   int64  
 2   fixture            4019 non-null   int64  
 3   opponent_team      4019 non-null   int64  
 4   total_points       4019 non-null   int64  
 5   was_home           4019 non-null   bool   
 6   kickoff_time       4019 non-null   object 
 7   team_h_score       4019 non-null   int64  
 8   team_a_score       4019 non-null   int64  
 9   round              4019 non-null   int64  
 10  minutes            4019 non-null   int64  
 11  goals_scored       4019 non-null   int64  
 12  assists            4019 non-null   int64  
 13  clean_sheets       4019 non-null   int64  
 14  goals_conceded     4019 non-null   int64  
 15  own_goals          4019 non-null   int64  
 16  penalties_saved    4019 

In [7]:
# Check for any missing values.
df_currentseason.isnull().values.any()

False

In [8]:
#Check for duplicates on each row.
df_currentseason.duplicated().value_counts()

False    4019
dtype: int64

In [9]:
#Check for unique values
df_currentseason.nunique()

Unnamed: 0              7
element               631
fixture                67
opponent_team          20
total_points           25
was_home                2
kickoff_time           41
team_h_score            8
team_a_score            4
round                   7
minutes                83
goals_scored            4
assists                 4
clean_sheets            2
goals_conceded         10
own_goals               2
penalties_saved         2
penalties_missed        2
yellow_cards            2
red_cards               2
saves                  10
bonus                   4
bps                    67
influence             279
creativity            382
threat                 89
ict_index             164
value                  70
transfers_balance    2941
selected             3882
transfers_in         2477
transfers_out        2949
dtype: int64

In [10]:
#Descriptive Statistics
df_currentseason.describe()

Unnamed: 0.1,Unnamed: 0,element,fixture,opponent_team,total_points,team_h_score,team_a_score,round,minutes,goals_scored,...,bps,influence,creativity,threat,ict_index,value,transfers_balance,selected,transfers_in,transfers_out
count,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,...,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0,4019.0
mean,2.80219,302.236626,35.923115,10.601393,1.374222,1.818363,1.151779,4.035581,32.8736,0.04628,...,6.021647,7.342025,4.862752,5.583478,1.778253,50.458074,1377.342,235667.7,21476.83,20099.49
std,1.93792,174.78856,21.601435,5.838149,2.502222,1.74318,0.92089,2.128986,40.02644,0.239977,...,9.577682,12.881125,10.55479,13.574956,3.032116,11.64849,90477.95,691429.4,80661.01,65190.91
min,0.0,1.0,1.0,1.0,-4.0,0.0,0.0,1.0,0.0,0.0,...,-15.0,0.0,0.0,0.0,0.0,39.0,-1999266.0,0.0,0.0,0.0
25%,1.0,151.0,18.0,5.5,0.0,1.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,45.0,-3242.5,7465.5,43.5,329.0
50%,3.0,306.0,35.0,11.0,0.0,2.0,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,45.0,-327.0,29552.0,973.0,2345.0
75%,4.0,452.0,51.0,16.0,2.0,2.0,2.0,6.0,90.0,0.0,...,10.0,10.8,2.85,2.0,2.6,54.0,0.0,129463.0,6895.0,13290.0
max,6.0,631.0,80.0,20.0,22.0,9.0,3.0,8.0,90.0,3.0,...,81.0,125.8,97.3,129.0,30.4,130.0,1332069.0,8407273.0,1384843.0,2041377.0


In [11]:
# Code definition deal with white space in the dataframes
def whitespace_remover(dataframe):
   
    # iterating over the columns
    for i in dataframe.columns:
         
        # checking datatype of each columns
        if dataframe[i].dtype == 'object':
             
            # applying strip function on column
            dataframe[i] = dataframe[i].map(str.strip)
        else:
             
            # if condn. is False then it will do nothing.
            pass

In [12]:
# Extracting overall current season league table text from html_text
url = "https://fbref.com/en/comps/9/Premier-League-Stats" #current season
data_2022 = requests.get(url)
soup = BeautifulSoup(data_2022.text)
table_2022 = soup.find('table', id='results2022-202391_overall')
# Obtain every title of columns with tag <th>
league_table_2022 = []
for i in table_2022.find_all('td'):
 title = i.text
 league_table_2022.append(title)
league_table_2022 = [league_table_2022[i: i+19] for i in range(0, len(league_table_2022), 19)]
league_table_2022 = pd.DataFrame(league_table_2022, columns = ['Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP', 'xG', 'xGA', 'xGD', 'xGD/90', 'Last 5', 'Attendance', 'Top Team Scorer', 'Goalkeeper', 'Notes'])
league_table_2022['Rank'] = range(1, 1+len(league_table_2022))
league_table_2022.drop(['xG', 'xGA', 'xGD', 'xGD/90',], axis=1, inplace=True)
league_table_2022['year'] = league_table_2022.apply(lambda x: "2022-23", axis=1)
whitespace_remover(league_table_2022)
league_table_2022.head()
#data.text

Unnamed: 0,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,Last 5,Attendance,Top Team Scorer,Goalkeeper,Notes,Rank,year
0,Arsenal,8,7,0,1,20,8,12,21,2.63,W W L W W,60122,Gabriel Jesus - 5,Aaron Ramsdale,,1,2022-23
1,Manchester City,8,6,2,0,29,9,20,20,2.5,W W D W W,53362,Erling Haaland - 14,Ederson,,2,2022-23
2,Tottenham,8,5,2,1,19,10,9,17,2.13,W D W W L,61530,Harry Kane - 7,Hugo Lloris,,3,2022-23
3,Brighton,7,4,2,1,14,8,6,14,2.0,W W L W D,31230,Leandro Trossard - 5,Robert Sánchez,,4,2022-23
4,Chelsea,7,4,1,2,10,10,0,13,1.86,L W L W W,39941,Raheem Sterling - 3,Edouard Mendy,,5,2022-23


In [13]:
# Extracting home and away season league table text from html_text
#url = "https://fbref.com/en/comps/9/Premier-League-Stats" #current season
#data_2022 = requests.get(url)
soup = BeautifulSoup(data_2022.text)
table_hw_2022 = soup.find('table', id='results2022-202391_home_away')
# Obtain every title of columns with tag <th>
league_home_away_2022 = []
for i in table_hw_2022.find_all('td'):
    title = i.text
    league_home_away_2022.append(title)
league_home_away_2022 = [league_home_away_2022[i: i+27] for i in range(0, len(league_home_away_2022), 27)]
league_home_away_2022 = pd.DataFrame(league_home_away_2022, columns = ['Squad', 'H_MP', 'H_W', 'H_D', 'H_L', 'H_GF', 'H_GA', 'H_GD', 'H_Pts', 'H_Pts/MP', 'H_xG', 'H_xGA', 'H_xGD', 'H_xGD/90', 'A_MP', 'A_W', 'A_D', 'A_L', 'A_GF', 'A_GA', 'A_GD', 'A_Pts', 'A_Pts/MP', 'A_xG', 'A_xGA', 'A_xGD', 'A_xGD/90'])
league_home_away_2022.drop(['H_xG', 'H_xGA', 'H_xGD', 'H_xGD/90', 'A_xG', 'A_xGA', 'A_xGD', 'A_xGD/90'], axis=1, inplace=True)
league_home_away_2022['year'] = league_home_away_2022.apply(lambda x: "2022-23", axis=1)
whitespace_remover(league_home_away_2022)
league_home_away_2022.head()

Unnamed: 0,Squad,H_MP,H_W,H_D,H_L,H_GF,H_GA,H_GD,H_Pts,H_Pts/MP,A_MP,A_W,A_D,A_L,A_GF,A_GA,A_GD,A_Pts,A_Pts/MP,year
0,Arsenal,4,4,0,0,11,5,6,12,3.0,4,3,0,1,9,3,6,9,2.25,2022-23
1,Manchester City,4,4,0,0,20,5,15,12,3.0,4,2,2,0,9,4,5,8,2.0,2022-23
2,Tottenham,4,4,0,0,13,4,9,12,3.0,4,1,2,1,6,6,0,5,1.25,2022-23
3,Brighton,3,2,1,0,6,2,4,7,2.33,4,2,1,1,8,6,2,7,1.75,2022-23
4,Chelsea,3,2,1,0,6,4,2,7,2.33,4,2,0,2,4,6,-2,6,1.5,2022-23


In [14]:
# Convert Attendance feature to numeric datatype
league_table_2022['Attendance'] = pd.to_numeric(league_table_2022['Attendance'].str.replace(',', ''))

In [15]:
# Changing erroneus datatype
convert_dict = {'MP': int, 'W': int, 'D': int, 'L': int, 'GF': int, 'GA': int, 'GD': int, 'Pts': int, 'Pts/MP': float}
convert_dict1 = {'H_MP': int, 'H_W': int, 'H_D': int, 'H_L': int, 'H_GF': int, 'H_GA': int, 'H_GD': int, 'H_Pts': int, 'H_Pts/MP': float, 'A_MP': int, 'A_W': int, 'A_D': int, 'A_L': int, 'A_GF': int, 'A_GA': int, 'A_GD': int, 'A_Pts': int, 'A_Pts/MP': float}

league_table_2022 = league_table_2022.astype(convert_dict)

league_home_away_2022 = league_home_away_2022.astype(convert_dict1)

In [16]:
#Quality check
league_home_away_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Squad     20 non-null     object 
 1   H_MP      20 non-null     int64  
 2   H_W       20 non-null     int64  
 3   H_D       20 non-null     int64  
 4   H_L       20 non-null     int64  
 5   H_GF      20 non-null     int64  
 6   H_GA      20 non-null     int64  
 7   H_GD      20 non-null     int64  
 8   H_Pts     20 non-null     int64  
 9   H_Pts/MP  20 non-null     float64
 10  A_MP      20 non-null     int64  
 11  A_W       20 non-null     int64  
 12  A_D       20 non-null     int64  
 13  A_L       20 non-null     int64  
 14  A_GF      20 non-null     int64  
 15  A_GA      20 non-null     int64  
 16  A_GD      20 non-null     int64  
 17  A_Pts     20 non-null     int64  
 18  A_Pts/MP  20 non-null     float64
 19  year      20 non-null     object 
dtypes: float64(2), int64(16), object(2

### Extract Squad information

In [17]:
# Extracting Squad stats such as age, possession, players used from html_text
#url = "https://fbref.com/en/comps/9/Premier-League-Stats" #current season
#data_2022 = requests.get(url)
soup = BeautifulSoup(data_2022.text)
table_squad_2022 = soup.find('table', id='stats_squads_standard_for')
# Obtain every title of columns with tag <th>
stats_squad_2022 = []
for i in table_squad_2022.find_all('td'):
    title = i.text
    stats_squad_2022.append(title)
stats_squad_2022 = [stats_squad_2022[i: i+28] for i in range(0, len(stats_squad_2022), 28)]
stats_squad_2022 = pd.DataFrame(stats_squad_2022, columns = ['# Pl', 'Age', 'Poss', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'Gls', 'Ast', 'G+A', 'G-PK', 'G+A-PK', 'xG', 'npxG', 'xA', 'npxG+xA', 'xG', 'xA', 'xG+xA', 'npxG', 'npxG+xA'])
stats_squad_2022.drop(['xG', 'npxG', 'xA', 'npxG+xA', 'xG', 'xA', 'xG+xA', 'npxG', 'npxG+xA'], axis=1, inplace=True)
stats_squad_2022['year'] = stats_squad_2022.apply(lambda x: "2022-23", axis=1)
stats_squad_2022.head()

Unnamed: 0,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,year
0,21,24.5,59.4,8,88,720,8.0,19,14,19,0,0,15,0,2.37,1.75,4.12,2.37,4.12,2022-23
1,21,27.1,50.9,8,88,720,8.0,6,4,6,0,0,20,0,0.75,0.5,1.25,0.75,1.25,2022-23
2,23,26.5,36.1,8,88,720,8.0,6,5,6,0,0,15,0,0.75,0.62,1.37,0.75,1.37,2022-23
3,22,26.2,46.0,8,88,720,8.0,15,8,14,1,1,10,0,1.87,1.0,2.87,1.75,2.75,2022-23
4,19,27.6,49.4,7,77,630,7.0,13,7,10,3,3,8,0,1.86,1.0,2.86,1.43,2.43,2022-23


In [18]:
# Obtain every title of columns with tag <th>
headers = []
for i in table_squad_2022.find_all('th'):
 title = i.text
 headers.append(title)
headers

['',
 'Playing Time',
 'Performance',
 'Per 90 Minutes',
 'Expected',
 'Per 90 Minutes',
 'Squad',
 '# Pl',
 'Age',
 'Poss',
 'MP',
 'Starts',
 'Min',
 '90s',
 'Gls',
 'Ast',
 'G-PK',
 'PK',
 'PKatt',
 'CrdY',
 'CrdR',
 'Gls',
 'Ast',
 'G+A',
 'G-PK',
 'G+A-PK',
 'xG',
 'npxG',
 'xA',
 'npxG+xA',
 'xG',
 'xA',
 'xG+xA',
 'npxG',
 'npxG+xA',
 'Arsenal',
 'Aston Villa',
 'Bournemouth',
 'Brentford',
 'Brighton',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Fulham',
 'Leeds United',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester Utd',
 'Newcastle Utd',
 "Nott'ham Forest",
 'Southampton',
 'Tottenham',
 'West Ham',
 'Wolves']

In [19]:
#Extract Team names for 2022 season
teams = headers[35:]
teams

stats_squad_2022['Squad'] = pd.DataFrame(teams, columns = ['Squad'])

stats_squad_2022.head()

Unnamed: 0,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,G-PK,...,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,year,Squad
0,21,24.5,59.4,8,88,720,8.0,19,14,19,...,0,15,0,2.37,1.75,4.12,2.37,4.12,2022-23,Arsenal
1,21,27.1,50.9,8,88,720,8.0,6,4,6,...,0,20,0,0.75,0.5,1.25,0.75,1.25,2022-23,Aston Villa
2,23,26.5,36.1,8,88,720,8.0,6,5,6,...,0,15,0,0.75,0.62,1.37,0.75,1.37,2022-23,Bournemouth
3,22,26.2,46.0,8,88,720,8.0,15,8,14,...,1,10,0,1.87,1.0,2.87,1.75,2.75,2022-23,Brentford
4,19,27.6,49.4,7,77,630,7.0,13,7,10,...,3,8,0,1.86,1.0,2.86,1.43,2.43,2022-23,Brighton


#### Observation
- No missing records
- No duplicate observation

In [20]:
#get current season fixtures from FPL API endpoint and create Dataframe

current_season_fixtures = requests.get(fpl_base_url+'fixtures/').json()
fixtures_df = pd.DataFrame(current_season_fixtures)
fixtures_df.head()

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id
0,2292871,,False,False,61,,0,False,,8,,1,,[],2,4,74971
1,2292921,,False,False,111,,0,False,,13,,1,,[],5,4,75021
2,2292870,,False,False,62,,0,False,,5,,3,,[],3,2,74972
3,2292882,,False,False,73,,0,False,,7,,5,,[],2,3,74983
4,2292883,,False,False,74,,0,False,,12,,6,,[],4,4,74984


In [21]:
#Print all columns
fixtures_df.columns

Index(['code', 'event', 'finished', 'finished_provisional', 'id',
       'kickoff_time', 'minutes', 'provisional_start_time', 'started',
       'team_a', 'team_a_score', 'team_h', 'team_h_score', 'stats',
       'team_h_difficulty', 'team_a_difficulty', 'pulse_id'],
      dtype='object')

In [22]:
# Descriptive information on features.
fixtures_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   code                    380 non-null    int64  
 1   event                   366 non-null    float64
 2   finished                380 non-null    bool   
 3   finished_provisional    380 non-null    bool   
 4   id                      380 non-null    int64  
 5   kickoff_time            366 non-null    object 
 6   minutes                 380 non-null    int64  
 7   provisional_start_time  380 non-null    bool   
 8   started                 366 non-null    object 
 9   team_a                  380 non-null    int64  
 10  team_a_score            77 non-null     float64
 11  team_h                  380 non-null    int64  
 12  team_h_score            77 non-null     float64
 13  stats                   380 non-null    object 
 14  team_h_difficulty       380 non-null    in

In [23]:
# Check for any missing values.
fixtures_df.isnull().values.any()

True

In [24]:
True
# Check missing values for each feature.
fixtures_df.isna().sum()

code                        0
event                      14
finished                    0
finished_provisional        0
id                          0
kickoff_time               14
minutes                     0
provisional_start_time      0
started                    14
team_a                      0
team_a_score              303
team_h                      0
team_h_score              303
stats                       0
team_h_difficulty           0
team_a_difficulty           0
pulse_id                    0
dtype: int64

In [25]:
# Descriptive statistics.
fixtures_df.describe()

Unnamed: 0,code,event,id,minutes,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id
count,380.0,366.0,380.0,380.0,380.0,77.0,380.0,77.0,380.0,380.0,380.0
mean,2293000.0,19.956284,190.5,18.236842,10.5,1.194805,10.5,1.844156,2.55,2.95,75100.5
std,109.8408,10.929826,109.840794,36.224115,5.773884,1.013578,5.773884,1.76266,0.805735,1.024824,109.840794
min,2292810.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,2.0,74911.0
25%,2292905.0,11.0,95.75,0.0,5.75,0.0,5.75,1.0,2.0,2.0,75005.75
50%,2293000.0,20.0,190.5,0.0,10.5,1.0,10.5,2.0,2.0,3.0,75100.5
75%,2293094.0,29.0,285.25,0.0,15.25,2.0,15.25,2.0,3.0,4.0,75195.25
max,2293189.0,38.0,380.0,90.0,20.0,4.0,20.0,9.0,5.0,5.0,75290.0


#### Observation
- Some missing records related to future games

### FEATURE ENGINEERING

In [26]:
# Make a copy of the original piece of test data.
df_currentseason_clean = df_currentseason.copy()
players_df_clean = players_df.copy()
fixtures_df_clean = fixtures_df.copy()
league_table_2022_clean = league_table_2022.copy()
league_home_away_2022_clean = league_home_away_2022.copy()
stats_squad_2022_clean = stats_squad_2022.copy()

In [27]:
#Function to compute overall team strength, attack strength and defence strength
def calc_team_strength(players):
    players['team_strength'] = 1000
    players['team_strength'] += (players['W'] / players['MP'])*100
    players['team_strength'] -= (players['D'] / players['MP'])*100
    players['team_strength'] -= (players['L'] / players['MP'])*100
    players['attack_strength'] = 1000 
    players['attack_strength'] += (players['GF'] / players['MP'])*100
    players['defence_strength'] = 1000 
    players['defence_strength'] -= (players['GA'] / players['MP'])*100
    return players

In [28]:
league_table_2022_clean = calc_team_strength(league_table_2022_clean)
league_table_2022_clean.head(20)

Unnamed: 0,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,Last 5,Attendance,Top Team Scorer,Goalkeeper,Notes,Rank,year,team_strength,attack_strength,defence_strength
0,Arsenal,8,7,0,1,20,8,12,21,2.63,W W L W W,60122,Gabriel Jesus - 5,Aaron Ramsdale,,1,2022-23,1075.0,1250.0,900.0
1,Manchester City,8,6,2,0,29,9,20,20,2.5,W W D W W,53362,Erling Haaland - 14,Ederson,,2,2022-23,1050.0,1362.5,887.5
2,Tottenham,8,5,2,1,19,10,9,17,2.13,W D W W L,61530,Harry Kane - 7,Hugo Lloris,,3,2022-23,1025.0,1237.5,875.0
3,Brighton,7,4,2,1,14,8,6,14,2.0,W W L W D,31230,Leandro Trossard - 5,Robert Sánchez,,4,2022-23,1014.285714,1200.0,885.714286
4,Chelsea,7,4,1,2,10,10,0,13,1.86,L W L W W,39941,Raheem Sterling - 3,Edouard Mendy,,5,2022-23,1014.285714,1142.857143,857.142857
5,Manchester Utd,7,4,0,3,11,14,-3,12,1.71,W W W W L,73763,Marcus Rashford - 3,David de Gea,,6,2022-23,1014.285714,1157.142857,800.0
6,Newcastle Utd,8,2,5,1,12,8,4,11,1.38,D L D D W,52151,"Miguel Almirón, Callum Wilson - 3",Nick Pope,,7,2022-23,950.0,1150.0,900.0
7,Fulham,8,3,2,3,13,15,-2,11,1.38,L W L W L,22286,Aleksandar Mitrović - 6,Bernd Leno,,8,2022-23,975.0,1162.5,812.5
8,Liverpool,7,2,4,1,18,9,9,10,1.43,L W W D D,53234,Roberto Firmino - 5,Alisson,,9,2022-23,957.142857,1257.142857,871.428571
9,Brentford,8,2,4,2,15,12,3,10,1.25,D D W L D,17069,Ivan Toney - 5,David Raya,,10,2022-23,950.0,1187.5,850.0


In [29]:
#Function to compute home and away overall strength, attack strength and defence strength
def calc_hw_team_strength(players):
    players['home_team_strength'] = 1000
    players['home_team_strength'] += (players['H_W'] / players['H_MP'])*100
    players['home_team_strength'] -= (players['H_D'] / players['H_MP'])*100
    players['home_team_strength'] -= (players['H_L'] / players['H_MP'])*100
    players['away_team_strength'] = 1000
    players['away_team_strength'] += (players['A_W'] / players['A_MP'])*100
    players['away_team_strength'] -= (players['A_D'] / players['A_MP'])*100
    players['away_team_strength'] -= (players['A_L'] / players['A_MP'])*100
    players['home_attack_strength'] = 1000 
    players['home_attack_strength'] += (players['H_GF'] / players['H_MP'])*100
    players['home_defence_strength'] = 1000 
    players['home_defence_strength'] -= (players['H_GA'] / players['H_MP'])*100
    players['away_attack_strength'] = 1000 
    players['away_attack_strength'] += (players['A_GF'] / players['A_MP'])*100
    players['away_defence_strength'] = 1000 
    players['away_defence_strength'] -= (players['A_GA'] / players['A_MP'])*100
    return players

In [30]:
league_home_away_2022_clean = calc_hw_team_strength(league_home_away_2022_clean)
league_home_away_2022_clean.head(20)

Unnamed: 0,Squad,H_MP,H_W,H_D,H_L,H_GF,H_GA,H_GD,H_Pts,H_Pts/MP,...,A_GD,A_Pts,A_Pts/MP,year,home_team_strength,away_team_strength,home_attack_strength,home_defence_strength,away_attack_strength,away_defence_strength
0,Arsenal,4,4,0,0,11,5,6,12,3.0,...,6,9,2.25,2022-23,1100.0,1050.0,1275.0,875.0,1225.0,925.0
1,Manchester City,4,4,0,0,20,5,15,12,3.0,...,5,8,2.0,2022-23,1100.0,1000.0,1500.0,875.0,1225.0,900.0
2,Tottenham,4,4,0,0,13,4,9,12,3.0,...,0,5,1.25,2022-23,1100.0,950.0,1325.0,900.0,1150.0,850.0
3,Brighton,3,2,1,0,6,2,4,7,2.33,...,2,7,1.75,2022-23,1033.333333,1000.0,1200.0,933.333333,1200.0,850.0
4,Chelsea,3,2,1,0,6,4,2,7,2.33,...,-2,6,1.5,2022-23,1033.333333,1000.0,1200.0,866.666667,1100.0,850.0
5,Manchester Utd,3,2,0,1,6,4,2,6,2.0,...,-5,6,1.5,2022-23,1033.333333,1000.0,1200.0,866.666667,1125.0,750.0
6,Newcastle Utd,4,1,3,0,6,4,2,6,1.5,...,2,5,1.25,2022-23,950.0,950.0,1150.0,900.0,1150.0,900.0
7,Fulham,4,2,1,1,8,9,-1,7,1.75,...,-1,4,1.0,2022-23,1000.0,950.0,1200.0,775.0,1125.0,850.0
8,Liverpool,4,2,2,0,15,5,10,8,2.0,...,-1,2,0.67,2022-23,1000.0,900.0,1375.0,875.0,1100.0,866.666667
9,Brentford,4,2,1,1,10,6,4,7,1.75,...,-1,3,0.75,2022-23,1000.0,900.0,1250.0,850.0,1125.0,850.0


In [31]:
#Code to save the scraped data to csvs
#league_table_2022_clean.to_csv('/home/laniolao/fpl/FantasyPremierLeague/league_2022-2023_standings.csv')
#league_home_away_2022_clean.to_csv('/home/laniolao/fpl/FantasyPremierLeague/league_2022-2023_standings_home_away.csv')

To engineer two new features named club_name and position, we create dictionaries with the teams dataframe and map "team" and "element_type" to engineer the features.

In [32]:
#Map the team names and the player positions into the players_df_clean dataframe
teams_now=dict(zip(teams_df.id, teams_df.short_name))
positions=dict(zip(element_types_df.id, element_types_df.singular_name_short))
players_df_clean['club_name'] = players_df_clean['team'].map(teams_now)
players_df_clean['position'] = players_df_clean['element_type'].map(positions)

In [33]:
#Update the club names from abbreviations to full names
players_df_clean["club_name"].replace({'ARS': 'Arsenal', 'AVL': 'Aston Villa', 'BOU': 'Bournemouth', 'BRE': 'Brentford', 'BHA': 'Brighton', 'CHE': 'Chelsea', 'CRY': 'Crystal Palace', 'EVE': 'Everton', 'FUL': 'Fulham',
       'LEI': 'Leicester City', 'LEE': 'Leeds United', 'LIV': 'Liverpool', 'MCI': 'Manchester City', 'MUN': 'Manchester Utd', 'NEW': 'Newcastle Utd', 'NFO': "Nott'ham Forest", 'SOU': 'Southampton', 'TOT': 'Tottenham',
       'WHU': 'West Ham', 'WOL': 'Wolves'}, inplace=True)
players_df_clean.club_name

0      Arsenal
1      Arsenal
2      Arsenal
3      Arsenal
4      Arsenal
        ...   
633     Wolves
634     Wolves
635     Wolves
636     Wolves
637     Wolves
Name: club_name, Length: 638, dtype: object

In [34]:
#create the player name feature
players_df_clean['name'] = players_df_clean['first_name'] + ' ' + players_df_clean['second_name']

In [35]:
#Create season_x feature to align with the train data
df_currentseason_clean['season_x'] = df_currentseason_clean.apply(lambda x: "2022-23", axis=1)

In [36]:
df_currentseason_clean.columns

Index(['Unnamed: 0', 'element', 'fixture', 'opponent_team', 'total_points',
       'was_home', 'kickoff_time', 'team_h_score', 'team_a_score', 'round',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out', 'season_x'],
      dtype='object')

In [37]:
players_df_clean[['id', 'first_name', 'second_name', 'name','club_name', 'minutes', 'form',  'bonus', 'bps', 'total_points', 'value_season', 'value_form']].head()

Unnamed: 0,id,first_name,second_name,name,club_name,minutes,form,bonus,bps,total_points,value_season,value_form
0,1,Cédric,Alves Soares,Cédric Alves Soares,Arsenal,0,0.0,0,0,0,0.0,0.0
1,3,Granit,Xhaka,Granit Xhaka,Arsenal,717,7.0,4,166,40,7.8,1.4
2,4,Mohamed,Elneny,Mohamed Elneny,Arsenal,90,0.0,0,15,2,0.5,0.0
3,5,Rob,Holding,Rob Holding,Arsenal,3,0.0,0,8,2,0.5,0.0
4,6,Thomas,Partey,Thomas Partey,Arsenal,419,6.5,3,91,21,4.4,1.4


We have dataframes with all the players in the league in the current season (players_df) and the current season individual players performance (df_currentseason_clean). We proceed as described below:
1. Map team strength features from the league standing dataframes to players_df
2. Map the team names, player names and form into the all current season data player dataframe
3. Drop irrelevant column
4. Drop players that have not played any game this season
5. Rename the column 'round' to 'GW' to be similar with the train data set
6. Engineer column 'game_date' feature and format to appropriate dtype
7. Engineer game season weather feature.
8. Engineer feature to highlights early and late games based on start time
9. Engineer feature to highlight the game year only.



In [38]:
#Map the team names, player names and form into the all current season data player dataframe
teams_map=dict(zip(players_df_clean.id, players_df_clean.name))
club_map=dict(zip(players_df_clean.id, players_df_clean.club_name))
opp_teams_map=dict(zip(players_df_clean.team, players_df_clean.club_name))
form_map=dict(zip(players_df_clean.id, players_df_clean.form))
position_map=dict(zip(players_df_clean.id, players_df_clean.position))
league_table_2022_clean['squad_season'] = league_table_2022_clean['Squad'] + '_' + league_table_2022_clean['year']
league_home_away_2022_clean['squad_season'] = league_home_away_2022_clean['Squad'] + '_' + league_home_away_2022_clean['year']
stats_squad_2022_clean['squad_season'] = stats_squad_2022_clean['Squad'] + '_' + stats_squad_2022_clean['year']
df_currentseason_clean['name'] = df_currentseason_clean['element'].map(teams_map)
df_currentseason_clean['club_name'] = df_currentseason_clean['element'].map(club_map)
df_currentseason_clean['opp_team_name'] = df_currentseason_clean['opponent_team'].map(opp_teams_map)
df_currentseason_clean['form'] = df_currentseason_clean['element'].map(form_map)
df_currentseason_clean['position'] = df_currentseason_clean['element'].map(position_map)
df_currentseason_clean['team_season'] = df_currentseason_clean['club_name'] + '_' + df_currentseason_clean['season_x']


In [39]:
# Verify engineered features
df_currentseason_clean[['name', 'club_name', 'opp_team_name', 'form', 'position', 'team_season', 'minutes']].head(50)

Unnamed: 0,name,club_name,opp_team_name,form,position,team_season,minutes
0,Cédric Alves Soares,Arsenal,Crystal Palace,0.0,DEF,Arsenal_2022-23,0
1,Cédric Alves Soares,Arsenal,Leicester City,0.0,DEF,Arsenal_2022-23,0
2,Cédric Alves Soares,Arsenal,Bournemouth,0.0,DEF,Arsenal_2022-23,0
3,Cédric Alves Soares,Arsenal,Fulham,0.0,DEF,Arsenal_2022-23,0
4,Cédric Alves Soares,Arsenal,Aston Villa,0.0,DEF,Arsenal_2022-23,0
5,Cédric Alves Soares,Arsenal,Manchester Utd,0.0,DEF,Arsenal_2022-23,0
6,Cédric Alves Soares,Arsenal,Brentford,0.0,DEF,Arsenal_2022-23,0
7,Granit Xhaka,Arsenal,Crystal Palace,7.0,MID,Arsenal_2022-23,90
8,Granit Xhaka,Arsenal,Leicester City,7.0,MID,Arsenal_2022-23,90
9,Granit Xhaka,Arsenal,Bournemouth,7.0,MID,Arsenal_2022-23,87


In [40]:
# Map the overall strength and defence and attack strength of each team for respective season.
teamstrength=dict(zip(league_table_2022_clean.squad_season, league_table_2022_clean.team_strength))
attackstrength=dict(zip(league_table_2022_clean.squad_season, league_table_2022_clean.attack_strength))
defencestrength=dict(zip(league_table_2022_clean.squad_season, league_table_2022_clean.defence_strength))

df_currentseason_clean['team_strength'] = df_currentseason_clean['team_season'].map(teamstrength)
df_currentseason_clean['attack_strength'] = df_currentseason_clean['team_season'].map(attackstrength)
df_currentseason_clean['defence_strength'] = df_currentseason_clean['team_season'].map(defencestrength)

In [41]:
# Map the home and away overall strength and defence and attack strength of each team for respective season.
hometeamstrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.home_team_strength))
awayteamstrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.away_team_strength))
homeattackstrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.home_attack_strength))
homedefencestrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.home_defence_strength))
awayattackstrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.away_attack_strength))
awaydefencestrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.away_defence_strength))


df_currentseason_clean['home_team_strength'] = df_currentseason_clean['team_season'].map(hometeamstrength)
df_currentseason_clean['away_team_strength'] = df_currentseason_clean['team_season'].map(awayteamstrength)
df_currentseason_clean['home_attack_strength'] = df_currentseason_clean['team_season'].map(homeattackstrength)
df_currentseason_clean['home_defence_strength'] = df_currentseason_clean['team_season'].map(homedefencestrength)
df_currentseason_clean['away_attack_strength'] = df_currentseason_clean['team_season'].map(awayattackstrength)
df_currentseason_clean['away_defence_strength'] = df_currentseason_clean['team_season'].map(awaydefencestrength)

In [42]:
# Map the squad average age of each team for respective season.
averageage=dict(zip(stats_squad_2022_clean.squad_season, stats_squad_2022_clean.Age))


df_currentseason_clean['squad_average_age'] = df_currentseason_clean['team_season'].map(averageage)

In [43]:
#Drop irrelevant feature
df_currentseason_clean.drop(['Unnamed: 0'], axis=1, inplace=True)

In [44]:
# Drop players that have not played any game this season from the df_currentseason dataframe
play_zero_minutes = df_currentseason_clean[df_currentseason_clean.minutes == 0].index
df_currentseason_clean.drop(play_zero_minutes, axis = 0, inplace=True)

In [45]:
#Rename column 'round' to 'GW' to be similar to train data
df_currentseason_clean.rename(columns= { 'round': 'GW' }, inplace=True)

In [46]:
#Preview kickoff_time column
df_currentseason_clean.kickoff_time

7       2022-08-05T19:00:00Z
8       2022-08-13T14:00:00Z
9       2022-08-20T16:30:00Z
10      2022-08-27T16:30:00Z
11      2022-08-31T18:30:00Z
                ...         
4012    2022-08-31T18:30:00Z
4013    2022-09-03T14:00:00Z
4014    2022-09-17T11:30:00Z
4015    2022-09-03T14:00:00Z
4018    2022-09-17T11:30:00Z
Name: kickoff_time, Length: 1979, dtype: object

In [47]:
# Feature Engineer column 'game_date' and format to appropriate dtype
df_currentseason_clean['game_date'] = df_currentseason_clean['kickoff_time'].str.replace('T', ' ')
df_currentseason_clean['game_date'] = df_currentseason_clean['game_date'].str.replace(':00Z', '')
df_currentseason_clean['game_date'] = pd.to_datetime(df_currentseason_clean['game_date'])
df_currentseason_clean.game_date

7      2022-08-05 19:00:00
8      2022-08-13 14:00:00
9      2022-08-20 16:30:00
10     2022-08-27 16:30:00
11     2022-08-31 18:30:00
               ...        
4012   2022-08-31 18:30:00
4013   2022-09-03 14:00:00
4014   2022-09-17 11:30:00
4015   2022-09-03 14:00:00
4018   2022-09-17 11:30:00
Name: game_date, Length: 1979, dtype: datetime64[ns]

In [48]:
# Engineer game season weather feature.
seasons_curr = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]

month_to_curr_season = dict(zip(range(1,13), seasons_curr))
df_currentseason_clean['game_weather'] = df_currentseason_clean.game_date.dt.month.map(month_to_curr_season) 

In [49]:
# Data Quality Check
df_currentseason_clean.game_weather.value_counts()

3    1446
4     533
Name: game_weather, dtype: int64

In [50]:
# Engineer feature to highlights games that started before 13:00 (early starts) and those that started after 13:00 (late
df_currentseason_clean['start_label'] = np.where((df_currentseason_clean['game_date'].dt.hour) < 13, 0, 1)

In [51]:
# Quality Check
df_currentseason_clean.start_label.value_counts()

1    1769
0     210
Name: start_label, dtype: int64

In [52]:
df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1979 entries, 7 to 4018
Data columns (total 51 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   element                1979 non-null   int64         
 1   fixture                1979 non-null   int64         
 2   opponent_team          1979 non-null   int64         
 3   total_points           1979 non-null   int64         
 4   was_home               1979 non-null   bool          
 5   kickoff_time           1979 non-null   object        
 6   team_h_score           1979 non-null   int64         
 7   team_a_score           1979 non-null   int64         
 8   GW                     1979 non-null   int64         
 9   minutes                1979 non-null   int64         
 10  goals_scored           1979 non-null   int64         
 11  assists                1979 non-null   int64         
 12  clean_sheets           1979 non-null   int64         
 13  goa

Data Quality
Data quality issues are mostly divided into four:

- Completeness: do we have all of the records that we should? Do we have missing records or not? Are there specific rows, columns, or cells missing?
- Validity: we have the records, but they're not valid, i.e., they don't conform to a defined schema. A schema is a defined set of rules for data. These rules can be real-world constraints (e.g. negative height is impossible) and table-specific constraints (e.g. unique key constraints in tables).
- Accuracy: inaccurate data is wrong data that is valid. It adheres to the defined schema, but it is still incorrect.
- Consistency: inconsistent data is both valid and accurate, but there are multiple correct ways of referring to the same thing. Consistency, i.e., a standard format, in columns that represent the same data across tables and/or within tables is desired.

After assessing the data, we have the following issues:

1. Erroneous data types in the following data frames
- df_currentseason dataframe (form).
- fixtures_df_clean (kickoff_time)
2. Redundant features in all the following data frames
- df_currentseason dataframe (game_date, season_x, opponent_team, fixture, kickoff_time, element and name)
- players_df dataframe (id, squad_number)
- fixtures_df (stats, id)
3. Null values in the following data frames
- players_df dataframe (chance_of_playing_next_round, chance_of_playing_this_round, corners_and_indirect_freekicks_order, direct_freekicks_order, penalties_order)
- fixtures_df_clean (team_a_score, team_h_score)
## Data Tidiness
There are three main requirements for tidiness.

1. Each variable forms a column,
2. Each observation forms a row, and
3. Each type of observational unit forms a table.
The three above criteria's are fairly met by the dataset.

### CLEANING DATA

Issue #1:
- Erroneous data types in respective data frames

Define
- Change form feature to appropriate data type

Code

In [53]:
#Chnage 'form' feature dtype
df_currentseason_clean.form = df_currentseason_clean.form.astype(float)

fixtures_df_clean['kickoff_time'] = pd.to_datetime(fixtures_df_clean['kickoff_time'])

Issue #2:
- Redundant features in respective data frames

Define
- Create player_details data frame to save the name and total point features for prediction validation purpose
- Drop all identified redundant features in respective data frames

Code

In [54]:
#df_currentseason_clean.columns - consider moving
#player_details = df_currentseason_clean.filter(['name','total_points'], axis=1)

In [55]:
#Drop features no longer needed in the dataframe
df_currentseason_clean.drop(['season_x', 'opponent_team', 'fixture', 'kickoff_time', 'element'], axis=1, inplace=True)
players_df_clean.drop(['id', 'squad_number'], axis=1, inplace=True)
fixtures_df_clean.drop(['stats', 'id'], axis=1, inplace=True)

Issue #3:
- Null values in respective data frames

Define
- Fill all identified features in respective data frames

Code

In [56]:
#Cleaning the current season dataframes (null values, time series operations for dates)
players_df_clean.chance_of_playing_next_round = players_df_clean.chance_of_playing_next_round.fillna(100.0)
players_df_clean.chance_of_playing_this_round = players_df_clean.chance_of_playing_this_round.fillna(100.0)
players_df_clean.corners_and_indirect_freekicks_order = players_df_clean.corners_and_indirect_freekicks_order.fillna(0)
players_df_clean.direct_freekicks_order = players_df_clean.direct_freekicks_order.fillna(0)
players_df_clean.penalties_order = players_df_clean.penalties_order.fillna(0)
fixtures_df_clean.team_a_score = fixtures_df_clean.team_a_score.fillna(0)
fixtures_df_clean.team_h_score = fixtures_df_clean.team_h_score.fillna(0)

In [57]:
#Assign year as index
df_currentseason_clean.set_index('game_date', inplace=True)

df_currentseason_clean

Unnamed: 0_level_0,total_points,was_home,team_h_score,team_a_score,GW,minutes,goals_scored,assists,clean_sheets,goals_conceded,...,defence_strength,home_team_strength,away_team_strength,home_attack_strength,home_defence_strength,away_attack_strength,away_defence_strength,squad_average_age,game_weather,start_label
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-08-05 19:00:00,2,False,0,2,1,90,0,0,1,0,...,900.0,1100.0,1050.0,1275.0,875.0,1225.0,925.0,24.5,3,1
2022-08-13 14:00:00,12,True,4,2,2,90,1,1,0,2,...,900.0,1100.0,1050.0,1275.0,875.0,1225.0,925.0,24.5,3,1
2022-08-20 16:30:00,6,False,0,3,3,87,0,1,1,0,...,900.0,1100.0,1050.0,1275.0,875.0,1225.0,925.0,24.5,3,1
2022-08-27 16:30:00,2,True,2,1,4,90,0,0,0,1,...,900.0,1100.0,1050.0,1275.0,875.0,1225.0,925.0,24.5,3,1
2022-08-31 18:30:00,2,True,2,1,5,90,0,0,0,1,...,900.0,1100.0,1050.0,1275.0,875.0,1225.0,925.0,24.5,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-31 18:30:00,3,False,0,0,5,74,0,0,1,0,...,887.5,950.0,900.0,1050.0,900.0,1025.0,875.0,26.4,3,1
2022-09-03 14:00:00,9,True,1,0,6,90,0,1,1,0,...,887.5,950.0,900.0,1050.0,900.0,1025.0,875.0,26.4,4,1
2022-09-17 11:30:00,1,True,0,3,8,90,0,0,0,3,...,887.5,950.0,900.0,1050.0,900.0,1025.0,875.0,26.4,4,0
2022-09-03 14:00:00,1,True,1,0,6,45,0,0,0,0,...,887.5,950.0,900.0,1050.0,900.0,1025.0,875.0,26.4,4,1


In [58]:
df_test = df_currentseason_clean.drop(['bps', 'total_points', 'transfers_balance', 'team_season', 'team_a_score', 'team_h_score', 'selected', 'value', 'name'],  axis = 1)

In [59]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1979 entries, 2022-08-05 19:00:00 to 2022-09-17 11:30:00
Data columns (total 36 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   was_home               1979 non-null   bool   
 1   GW                     1979 non-null   int64  
 2   minutes                1979 non-null   int64  
 3   goals_scored           1979 non-null   int64  
 4   assists                1979 non-null   int64  
 5   clean_sheets           1979 non-null   int64  
 6   goals_conceded         1979 non-null   int64  
 7   own_goals              1979 non-null   int64  
 8   penalties_saved        1979 non-null   int64  
 9   penalties_missed       1979 non-null   int64  
 10  yellow_cards           1979 non-null   int64  
 11  red_cards              1979 non-null   int64  
 12  saves                  1979 non-null   int64  
 13  bonus                  1979 non-null   int64  
 14  influence           

In [60]:
# Convert dataframe to a dictionary.
df_test_dict = df_test.to_dict(orient='records')

In [61]:
with open('./model/dv', 'rb') as f_in1:
    dv = pickle.load(f_in1)

In [62]:
# Transform data.
test_encoded = dv.transform(df_test_dict)

In [63]:
# vocabulary
vocab = dv.vocabulary_

# show vocab
vocab

{'position=GK': 86,
 'assists': 1,
 'bonus': 6,
 'clean_sheets': 7,
 'creativity': 38,
 'goals_conceded': 42,
 'goals_scored': 43,
 'ict_index': 47,
 'influence': 48,
 'minutes': 49,
 'opp_team_name=Leicester': 64,
 'own_goals': 81,
 'penalties_missed': 82,
 'penalties_saved': 83,
 'red_cards': 88,
 'saves': 89,
 'threat': 132,
 'transfers_in': 133,
 'transfers_out': 134,
 'was_home': 135,
 'yellow_cards': 136,
 'GW': 0,
 'club_name=Hull City': 20,
 'form': 40,
 'team_strength': 131,
 'attack_strength': 2,
 'defence_strength': 39,
 'home_team_strength': 46,
 'away_team_strength': 5,
 'home_attack_strength': 44,
 'home_defence_strength': 45,
 'away_attack_strength': 3,
 'away_defence_strength': 4,
 'squad_average_age=27.4': 114,
 'game_weather': 41,
 'start_label': 130,
 'position=DEF': 84,
 'opp_team_name=Hull': 62,
 'club_name=Leicester City': 22,
 'squad_average_age=27.8': 118,
 'position=MID': 87,
 'club_name=West Brom': 35,
 'squad_average_age=29.4': 129,
 'position=FWD': 85,
 'clu

In [64]:
# Convert array returned from dictvectorizer to a dataframe.
test_transformed = pd.DataFrame(test_encoded, columns=dv.feature_names_)

test_transformed.head()

Unnamed: 0,GW,assists,attack_strength,away_attack_strength,away_defence_strength,away_team_strength,bonus,clean_sheets,club_name=Arsenal,club_name=Aston Villa,...,squad_average_age=29.0,squad_average_age=29.1,squad_average_age=29.4,start_label,team_strength,threat,transfers_in,transfers_out,was_home,yellow_cards
0,1.0,0.0,1250.0,1225.0,925.0,1050.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1075.0,2.0,0.0,0.0,0.0,1.0
1,2.0,1.0,1250.0,1225.0,925.0,1050.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1075.0,28.0,9001.0,9630.0,1.0,0.0
2,3.0,1.0,1250.0,1225.0,925.0,1050.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1075.0,6.0,137326.0,25286.0,0.0,0.0
3,4.0,0.0,1250.0,1225.0,925.0,1050.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1075.0,12.0,77459.0,34699.0,1.0,0.0
4,5.0,0.0,1250.0,1225.0,925.0,1050.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1075.0,8.0,49435.0,38654.0,1.0,0.0


In [65]:
# Check the shape of the dataframe.
test_transformed.shape

# Read in scaler.
with open('./model/min_max_scaler', 'rb') as f_in2:
    scaler = pickle.load(f_in2)

In [66]:
# Transform test data.
test_norm = scaler.transform(test_transformed)

In [67]:
# Read in model.
with open('./model/rf_model.pkl', 'rb') as f_in3:
    model = pickle.load(f_in3)

In [68]:
# Utility function
def evaluate_model(model, x, y):
    """
    Utility function to print the model performance, (RMSE and R-Squared scores)
    model: Fitted model
    x: cross validation features dataset
    y: cross validation target values
    """
    predicted = model.predict(x) #get predictions
    RSME_score = mean_squared_error(y_true=y, y_pred=predicted, squared=False) #squared=False will RMSE instead of MSE
    R2_score = r2_score(y, predicted)
    
    print('RMSE:', RSME_score)
    print('R-Squared:', R2_score)
    print()

In [69]:
evaluate_model(model, test_norm, df_currentseason_clean['total_points'])

RMSE: 0.47477411453047447
R-Squared: 0.9742642822605885



In [70]:

df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1979 entries, 2022-08-05 19:00:00 to 2022-09-17 11:30:00
Data columns (total 45 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_points           1979 non-null   int64  
 1   was_home               1979 non-null   bool   
 2   team_h_score           1979 non-null   int64  
 3   team_a_score           1979 non-null   int64  
 4   GW                     1979 non-null   int64  
 5   minutes                1979 non-null   int64  
 6   goals_scored           1979 non-null   int64  
 7   assists                1979 non-null   int64  
 8   clean_sheets           1979 non-null   int64  
 9   goals_conceded         1979 non-null   int64  
 10  own_goals              1979 non-null   int64  
 11  penalties_saved        1979 non-null   int64  
 12  penalties_missed       1979 non-null   int64  
 13  yellow_cards           1979 non-null   int64  
 14  red_cards           

In [71]:
fixtures_df_clean.columns

Index(['code', 'event', 'finished', 'finished_provisional', 'kickoff_time',
       'minutes', 'provisional_start_time', 'started', 'team_a',
       'team_a_score', 'team_h', 'team_h_score', 'team_h_difficulty',
       'team_a_difficulty', 'pulse_id'],
      dtype='object')

In [72]:
#Create Next Game week fixtures
GWfixtures_df = fixtures_df_clean.loc[(fixtures_df_clean['event'] == 9.0)].copy()
GWfixtures_df.shape

(10, 15)

##To do

Code above needs to be updated to use date time to identify the next game week fixtures as opposed to current use of gameweek event

In [73]:
GWfixtures_df.head(10)

Unnamed: 0,code,event,finished,finished_provisional,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id
81,2292891,9.0,True,True,2022-10-01 11:30:00+00:00,90,False,True,18,1.0,1,3.0,3,4,74991
82,2292890,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,4,0.0,3,0.0,2,2,74992
83,2292892,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,6,2.0,7,1.0,3,3,74993
84,2292893,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,15,4.0,9,1.0,3,2,74994
85,2292896,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,5,3.0,12,3.0,3,5,74997
86,2292898,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,8,2.0,17,1.0,2,2,74999
87,2292899,9.0,True,True,2022-10-01 16:30:00+00:00,90,False,True,20,0.0,19,2.0,2,3,75000
88,2292897,9.0,True,True,2022-10-02 13:00:00+00:00,90,False,True,14,3.0,13,6.0,3,5,74998
89,2292894,9.0,True,True,2022-10-02 15:30:00+00:00,90,False,True,2,0.0,11,0.0,2,2,74995
90,2292895,9.0,True,True,2022-10-03 19:00:00+00:00,90,False,True,16,0.0,10,4.0,2,2,74996


In [74]:
#Mapping away and home teams using dictionary created earlier
GWfixtures_df['away_team'] = GWfixtures_df['team_a'].map(teams_now)
GWfixtures_df['home_team'] = GWfixtures_df['team_h'].map(teams_now)
GWfixtures_df.head(10)

Unnamed: 0,code,event,finished,finished_provisional,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team
81,2292891,9.0,True,True,2022-10-01 11:30:00+00:00,90,False,True,18,1.0,1,3.0,3,4,74991,TOT,ARS
82,2292890,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,4,0.0,3,0.0,2,2,74992,BRE,BOU
83,2292892,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,6,2.0,7,1.0,3,3,74993,CHE,CRY
84,2292893,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,15,4.0,9,1.0,3,2,74994,NEW,FUL
85,2292896,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,5,3.0,12,3.0,3,5,74997,BHA,LIV
86,2292898,9.0,True,True,2022-10-01 14:00:00+00:00,90,False,True,8,2.0,17,1.0,2,2,74999,EVE,SOU
87,2292899,9.0,True,True,2022-10-01 16:30:00+00:00,90,False,True,20,0.0,19,2.0,2,3,75000,WOL,WHU
88,2292897,9.0,True,True,2022-10-02 13:00:00+00:00,90,False,True,14,3.0,13,6.0,3,5,74998,MUN,MCI
89,2292894,9.0,True,True,2022-10-02 15:30:00+00:00,90,False,True,2,0.0,11,0.0,2,2,74995,AVL,LEE
90,2292895,9.0,True,True,2022-10-03 19:00:00+00:00,90,False,True,16,0.0,10,4.0,2,2,74996,NFO,LEI


In [75]:
#Update the away team name from abbreviations to full names
GWfixtures_df["away_team"].replace({'ARS': 'Arsenal', 'AVL': 'Aston Villa', 'BOU': 'Bournemouth', 'BRE': 'Brentford', 'BHA': 'Brighton', 'CHE': 'Chelsea', 'CRY': 'Crystal Palace', 'EVE': 'Everton', 'FUL': 'Fulham',
       'LEI': 'Leicester City', 'LEE': 'Leeds United', 'LIV': 'Liverpool', 'MCI': 'Manchester City', 'MUN': 'Manchester Utd', 'NEW': 'Newcastle Utd', 'NFO': "Nott'ham Forest", 'SOU': 'Southampton', 'TOT': 'Tottenham',
       'WHU': 'West Ham', 'WOL': 'Wolves'}, inplace=True)
GWfixtures_df.away_team

81          Tottenham
82          Brentford
83            Chelsea
84      Newcastle Utd
85           Brighton
86            Everton
87             Wolves
88     Manchester Utd
89        Aston Villa
90    Nott'ham Forest
Name: away_team, dtype: object

In [76]:
#Update the home team name from abbreviations to full names
GWfixtures_df["home_team"].replace({'ARS': 'Arsenal', 'AVL': 'Aston Villa', 'BOU': 'Bournemouth', 'BRE': 'Brentford', 'BHA': 'Brighton', 'CHE': 'Chelsea', 'CRY': 'Crystal Palace', 'EVE': 'Everton', 'FUL': 'Fulham',
       'LEI': 'Leicester City', 'LEE': 'Leeds United', 'LIV': 'Liverpool', 'MCI': 'Manchester City', 'MUN': 'Manchester Utd', 'NEW': 'Newcastle Utd', 'NFO': "Nott'ham Forest", 'SOU': 'Southampton', 'TOT': 'Tottenham',
       'WHU': 'West Ham', 'WOL': 'Wolves'}, inplace=True)
GWfixtures_df.home_team

81            Arsenal
82        Bournemouth
83     Crystal Palace
84             Fulham
85          Liverpool
86        Southampton
87           West Ham
88    Manchester City
89       Leeds United
90     Leicester City
Name: home_team, dtype: object

In [77]:
#Create features for data analysis (player-opposition team, game week fixture difficulty index, player's club )
gw_away_players = pd.merge(players_df_clean, GWfixtures_df, how="inner", left_on=["team"], right_on=["team_a"])
gw_away_players['player_opp'] = gw_away_players['web_name'].map(str) + '-' + gw_away_players['home_team'].map(str)
gw_home_players = pd.merge(players_df_clean, GWfixtures_df, how="inner", left_on=["team"], right_on=["team_h"])
gw_home_players['player_opp'] = gw_home_players['web_name'].map(str) + '-' + gw_home_players['away_team'].map(str)
players_df_clean = gw_away_players.append(gw_home_players)
players_df_clean['player_club'] = players_df_clean['web_name'].map(str) + '-' + players_df_clean['club_name'].map(str)
players_df_clean.drop(['minutes_y'], axis=1, inplace=True)
players_df_clean.rename(columns = {'minutes_x':'minutes'}, inplace = True)
players_df_clean.columns

Index(['chance_of_playing_next_round', 'chance_of_playing_this_round',
       'code_x', 'cost_change_event', 'cost_change_event_fall',
       'cost_change_start', 'cost_change_start_fall', 'dreamteam_count',
       'element_type', 'ep_next', 'ep_this', 'event_points', 'first_name',
       'form', 'in_dreamteam', 'news', 'news_added', 'now_cost', 'photo',
       'points_per_game', 'second_name', 'selected_by_percent', 'special',
       'status', 'team', 'team_code', 'total_points', 'transfers_in',
       'transfers_in_event', 'transfers_out', 'transfers_out_event',
       'value_form', 'value_season', 'web_name', 'minutes', 'goals_scored',
       'assists', 'clean_sheets', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'saves', 'bonus', 'bps', 'influence', 'creativity', 'threat',
       'ict_index', 'influence_rank', 'influence_rank_type', 'creativity_rank',
       'creativity_rank_type', 'threat_rank', 'threat_rank_type'

In [78]:
# Map the games played so far by each team for respective season to use for averaging
matches_played_map=dict(zip(league_table_2022_clean.Squad, league_table_2022_clean.MP))

players_df_clean['MP'] = players_df_clean['club_name'].map(matches_played_map)

In [79]:
#Updating `players_df_clean` with all features necessary for prediction

players_df_clean["position"].replace({'GKP': 'GK'}, inplace=True)
players_df_clean["assist_ave"] = players_df_clean['assists'] / players_df_clean['MP']
players_df_clean["bonus_ave"] = players_df_clean['bonus'] / players_df_clean['MP']
players_df_clean["clean_sheets_ave"] = players_df_clean['clean_sheets'] / players_df_clean['MP']
players_df_clean["goals_conceded_ave"] = players_df_clean['goals_conceded'] / players_df_clean['MP']
players_df_clean["goals_scored_ave"] = players_df_clean['goals_scored'] / players_df_clean['MP']
players_df_clean["minutes_ave"] = players_df_clean['minutes'] / players_df_clean['MP']
players_df_clean['opp_team_name'] = players_df_clean.apply(lambda x: x['home_team'] if x['home_team'] != x['club_name'] else x['away_team'], axis=1)
players_df_clean["own_goals_ave"] = players_df_clean['own_goals'] / players_df_clean['MP']
players_df_clean["penalties_missed_ave"] = players_df_clean['penalties_missed'] / players_df_clean['MP']
players_df_clean["penalties_saved_ave"] = players_df_clean['penalties_saved'] / players_df_clean['MP']
players_df_clean["red_cards_ave"] = players_df_clean['red_cards'] / players_df_clean['MP']
players_df_clean["saves_ave"] = players_df_clean['saves'] / players_df_clean['MP']
players_df_clean["yellow_cards_ave"] = players_df_clean['yellow_cards'] / players_df_clean['MP']
players_df_clean['was_home'] = players_df_clean.apply(lambda x: 'True' if x['home_team'] == x['club_name'] else 'False', axis=1)
players_df_clean.rename(columns = {'event':'GW'}, inplace = True)


In [80]:
players_df_clean.kickoff_time.value_counts()

2022-10-01 14:00:00+00:00    323
2022-10-03 19:00:00+00:00     68
2022-10-02 15:30:00+00:00     65
2022-10-01 11:30:00+00:00     63
2022-10-02 13:00:00+00:00     62
2022-10-01 16:30:00+00:00     57
Name: kickoff_time, dtype: int64

In [81]:
# Engineer feature to highlights games that started before 13:00 (early starts) and those that started after 13:00 (late starts)
players_df_clean['start_label'] = np.where((players_df_clean['kickoff_time'].dt.hour) < 13, 0, 1)

In [82]:
# Quality Check.
players_df_clean[['kickoff_time', 'start_label']].value_counts()

kickoff_time               start_label
2022-10-01 14:00:00+00:00  1              323
2022-10-03 19:00:00+00:00  1               68
2022-10-02 15:30:00+00:00  1               65
2022-10-01 11:30:00+00:00  0               63
2022-10-02 13:00:00+00:00  1               62
2022-10-01 16:30:00+00:00  1               57
dtype: int64

In [83]:
# Engineer game season weather feature.
seasons = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]

month_to_season = dict(zip(range(1,13), seasons))
players_df_clean['game_weather'] = players_df_clean.kickoff_time.dt.month.map(month_to_season) 

In [84]:
# Data Quality Check.
players_df_clean.game_weather.value_counts()

4    638
Name: game_weather, dtype: int64

In [85]:
# Map the overall strength and defence and attack strength of each team for respective season.
teamstrength1=dict(zip(league_table_2022_clean.Squad, league_table_2022_clean.team_strength))
attackstrength1=dict(zip(league_table_2022_clean.Squad, league_table_2022_clean.attack_strength))
defencestrength1=dict(zip(league_table_2022_clean.Squad, league_table_2022_clean.defence_strength))

players_df_clean['team_strength'] = players_df_clean['club_name'].map(teamstrength1)
players_df_clean['attack_strength'] = players_df_clean['club_name'].map(attackstrength1)
players_df_clean['defence_strength'] = players_df_clean['club_name'].map(defencestrength1)

In [86]:
# Map the home and away overall strength and defence and attack strength of each team for respective season.
hometeamstrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.home_team_strength))
awayteamstrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.away_team_strength))
homeattackstrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.home_attack_strength))
homedefencestrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.home_defence_strength))
awayattackstrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.away_attack_strength))
awaydefencestrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.away_defence_strength))


players_df_clean['home_team_strength'] = players_df_clean['club_name'].map(hometeamstrength1)
players_df_clean['away_team_strength'] = players_df_clean['club_name'].map(awayteamstrength1)
players_df_clean['home_attack_strength'] = players_df_clean['club_name'].map(homeattackstrength1)
players_df_clean['home_defence_strength'] = players_df_clean['club_name'].map(homedefencestrength1)
players_df_clean['away_attack_strength'] = players_df_clean['club_name'].map(awayattackstrength1)
players_df_clean['away_defence_strength'] = players_df_clean['club_name'].map(awaydefencestrength1)

## To consider 

We need to figure out how to engineer each clubs performance against different opponents historically

In [87]:
# Map the squad average age of each team for respective season.
averageage1=dict(zip(stats_squad_2022_clean.Squad, stats_squad_2022_clean.Age))


players_df_clean['squad_average_age'] = players_df_clean['club_name'].map(averageage1)

In [88]:
#Quality Check
players_df_clean[['name', 'MP','club_name', 'squad_average_age', 'away_team_strength', 'away_attack_strength', 'away_defence_strength', 'bonus']].sort_values(by='bonus', ascending=False).head(20)

Unnamed: 0,name,MP,club_name,squad_average_age,away_team_strength,away_attack_strength,away_defence_strength,bonus
238,Erling Haaland,8,Manchester City,27.5,1000.0,1225.0,900.0,15
269,Harry Kane,8,Tottenham,27.8,950.0,1150.0,850.0,10
78,Alexis Mac Allister,7,Brighton,27.6,1000.0,1200.0,850.0,9
23,William Saliba,8,Arsenal,24.5,1050.0,1225.0,925.0,9
192,Roberto Firmino,7,Liverpool,28.0,900.0,1100.0,866.666667,8
35,Ivan Toney,8,Brentford,26.2,900.0,1125.0,850.0,8
227,João Cancelo,8,Manchester City,27.5,1000.0,1225.0,900.0,7
103,Aleksandar Mitrović,8,Fulham,28.3,950.0,1125.0,850.0,7
69,Pascal Groß,7,Brighton,27.6,1000.0,1200.0,850.0,6
181,Diogo Dalot Teixeira,7,Manchester Utd,26.9,1000.0,1125.0,750.0,6


In [89]:
players_df_clean = players_df_clean.query('chance_of_playing_this_round > 0')

In [90]:
#Change data type for Gameweek
df_currentseason_clean['GW'] = df_currentseason_clean.GW.astype(float)
df_currentseason_clean['GW'] = df_currentseason_clean.GW.astype(str)
players_df_clean['GW'] = players_df_clean.GW.astype(str)

In [91]:
# Create feature to use for mapping Gameweek total points for validation
df_currentseason_clean['name_GW'] = df_currentseason_clean['name'] + '_' + df_currentseason_clean['GW']
players_df_clean['name_GW'] = players_df_clean['name'] + '_' + players_df_clean['GW']

In [92]:
df_currentseason_clean.name_GW

game_date
2022-08-05 19:00:00          Granit Xhaka_1.0
2022-08-13 14:00:00          Granit Xhaka_2.0
2022-08-20 16:30:00          Granit Xhaka_3.0
2022-08-27 16:30:00          Granit Xhaka_4.0
2022-08-31 18:30:00          Granit Xhaka_5.0
                                ...          
2022-08-31 18:30:00    Matheus Luiz Nunes_5.0
2022-09-03 14:00:00    Matheus Luiz Nunes_6.0
2022-09-17 11:30:00    Matheus Luiz Nunes_8.0
2022-09-03 14:00:00        Sasa Kalajdzic_6.0
2022-09-17 11:30:00       Boubacar Traoré_8.0
Name: name_GW, Length: 1979, dtype: object

In [93]:
# Map the total points for each player per gameweek
total_pointsmap=dict(zip(df_currentseason_clean.name_GW, df_currentseason_clean.total_points))


players_df_clean['GW_point'] = players_df_clean['name_GW'].map(total_pointsmap)

In [94]:
players_df_clean.GW_point.isna().sum()

490

In [97]:
fixtures_df_clean

Unnamed: 0,code,event,finished,finished_provisional,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id
0,2292871,,False,False,NaT,0,False,,8,0.0,1,0.0,2,4,74971
1,2292921,,False,False,NaT,0,False,,13,0.0,1,0.0,5,4,75021
2,2292870,,False,False,NaT,0,False,,5,0.0,3,0.0,3,2,74972
3,2292882,,False,False,NaT,0,False,,7,0.0,5,0.0,2,3,74983
4,2292883,,False,False,NaT,0,False,,12,0.0,6,0.0,4,4,74984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2293185,38.0,False,False,2023-05-28 15:00:00+00:00,0,False,False,3,0.0,8,0.0,2,2,75286
376,2293186,38.0,False,False,2023-05-28 15:00:00+00:00,0,False,False,18,0.0,11,0.0,3,2,75287
377,2293187,38.0,False,False,2023-05-28 15:00:00+00:00,0,False,False,19,0.0,10,0.0,2,2,75288
378,2293188,38.0,False,False,2023-05-28 15:00:00+00:00,0,False,False,9,0.0,14,0.0,2,4,75289


In [95]:
players_df_clean = players_df_clean.dropna(subset=['GW_point'])

In [96]:
players_df_clean[['name_GW', 'GW_point', 'chance_of_playing_next_round', 'chance_of_playing_this_round' ]].head(50)

Unnamed: 0,name_GW,GW_point,chance_of_playing_next_round,chance_of_playing_this_round


In [107]:
players_df_clean[['name', 'total_points']].head(20)

Unnamed: 0,name,total_points
1,Granit Xhaka,31
4,Thomas Partey,11
6,Kieran Tierney,13
8,Benjamin White,29
9,Eddie Nketiah,7
11,Bukayo Saka,37
12,Takehiro Tomiyasu,5
13,Aaron Ramsdale,25
14,Gabriel dos Santos Magalhães,33
16,Albert Sambi Lokonga,7


In [108]:
players_df_clean.chance_of_playing_this_round.unique()

array([100.,  75.,  50.])

## To do
Drop players with zero chance of playing??

In [109]:

prediction_df = players_df_clean[['position','assist_ave',
       'bonus_ave', 'clean_sheets_ave', 'creativity', 'ict_index', 'influence', 'goals_conceded_ave',
       'goals_scored_ave', 'minutes_ave', 'opp_team_name',
       'own_goals_ave', 'penalties_missed_ave', 'penalties_saved_ave',
       'red_cards_ave', 'saves_ave', 'yellow_cards_ave', 'transfers_in', 'transfers_out', 'threat', 'was_home',
       'start_label', 'GW', 'club_name', 'form', 'game_weather', 'team_strength', 'attack_strength',
       'defence_strength', 'home_team_strength', 'away_team_strength',
       'home_attack_strength', 'home_defence_strength',
       'away_attack_strength', 'away_defence_strength',
       'squad_average_age']]


In [110]:
prediction_df.rename(columns={'own_goals_ave': 'own_goals', 'assist_ave': 'assists', 'bonus_ave': 'bonus', 'clean_sheets_ave':'clean_sheets', 'goals_conceded_ave':'goals_conceded', 'goals_scored_ave':'goals_scored', 'minutes_ave':'minutes', 'penalties_missed_ave': 'penalties_missed', 'penalties_saved_ave':'penalties_saved', 'red_cards_ave':'red_cards', 'saves_ave':'saves', 'yellow_cards_ave':'yellow_cards'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [111]:
prediction_df.head()

Unnamed: 0,position,assists,bonus,clean_sheets,creativity,ict_index,influence,goals_conceded,goals_scored,minutes,...,team_strength,attack_strength,defence_strength,home_team_strength,away_team_strength,home_attack_strength,home_defence_strength,away_attack_strength,away_defence_strength,squad_average_age
1,MID,0.428571,0.285714,0.428571,175.6,43.9,158.2,1.0,0.142857,89.571429,...,1071.428571,1242.857143,900.0,1100.0,1050.0,1266.666667,866.666667,1225.0,925.0,24.5
4,MID,0.0,0.0,0.428571,28.4,12.9,51.4,0.285714,0.0,49.571429,...,1071.428571,1242.857143,900.0,1100.0,1050.0,1266.666667,866.666667,1225.0,925.0,24.5
6,DEF,0.0,0.0,0.142857,33.9,10.9,43.8,0.285714,0.0,37.428571,...,1071.428571,1242.857143,900.0,1100.0,1050.0,1266.666667,866.666667,1225.0,925.0,24.5
8,DEF,0.0,0.428571,0.571429,70.9,17.5,94.4,0.857143,0.0,79.428571,...,1071.428571,1242.857143,900.0,1100.0,1050.0,1266.666667,866.666667,1225.0,925.0,24.5
9,FWD,0.0,0.0,0.0,23.5,14.5,19.0,0.142857,0.0,12.571429,...,1071.428571,1242.857143,900.0,1100.0,1050.0,1266.666667,866.666667,1225.0,925.0,24.5


In [112]:
prediction_df.columns

Index(['position', 'assists', 'bonus', 'clean_sheets', 'creativity',
       'ict_index', 'influence', 'goals_conceded', 'goals_scored', 'minutes',
       'opp_team_name', 'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'saves', 'yellow_cards', 'transfers_in', 'transfers_out',
       'threat', 'was_home', 'start_label', 'GW', 'club_name', 'form',
       'game_weather', 'team_strength', 'attack_strength', 'defence_strength',
       'home_team_strength', 'away_team_strength', 'home_attack_strength',
       'home_defence_strength', 'away_attack_strength',
       'away_defence_strength', 'squad_average_age'],
      dtype='object')

In [113]:
# Convert dataframe to a dictionary.
prediction_df_dict = prediction_df.to_dict(orient='records')

In [114]:
# Transform data.
prediction_encoded = dv.transform(prediction_df_dict)


In [115]:
# Convert array returned from dictvectorizer to a dataframe.
prediction_transformed = pd.DataFrame(prediction_encoded, columns=dv.feature_names_)

prediction_transformed.head()

Unnamed: 0,GW,assists,attack_strength,away_attack_strength,away_defence_strength,away_team_strength,bonus,clean_sheets,club_name=Arsenal,club_name=Aston Villa,...,squad_average_age=29.0,squad_average_age=29.1,squad_average_age=29.4,start_label,team_strength,threat,transfers_in,transfers_out,was_home,yellow_cards
0,0.0,0.428571,1242.857143,1225.0,925.0,1050.0,0.285714,0.428571,1.0,0.0,...,0.0,0.0,0.0,0.0,1071.428571,0.0,384388.0,218943.0,0.0,0.285714
1,0.0,0.0,1242.857143,1225.0,925.0,1050.0,0.0,0.428571,1.0,0.0,...,0.0,0.0,0.0,0.0,1071.428571,0.0,26533.0,67006.0,0.0,0.0
2,0.0,0.0,1242.857143,1225.0,925.0,1050.0,0.0,0.142857,1.0,0.0,...,0.0,0.0,0.0,0.0,1071.428571,0.0,38602.0,86281.0,0.0,0.0
3,0.0,0.0,1242.857143,1225.0,925.0,1050.0,0.428571,0.571429,1.0,0.0,...,0.0,0.0,0.0,0.0,1071.428571,0.0,246797.0,314532.0,0.0,0.285714
4,0.0,0.0,1242.857143,1225.0,925.0,1050.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1071.428571,0.0,12913.0,66292.0,0.0,0.0


In [116]:
# Check the shape of the dataframe.
prediction_transformed.shape

(211, 137)

In [117]:
# Transform test data.
prediction_norm = scaler.transform(prediction_transformed)

In [118]:
predicted_New = model.predict(prediction_norm)

In [119]:
players_df_clean.GW

1      8.0
4      8.0
6      8.0
8      8.0
9      8.0
      ... 
219    8.0
224    8.0
225    8.0
226    8.0
229    8.0
Name: GW, Length: 211, dtype: object

In [120]:
player_details = players_df_clean[['name', 'name_GW', 'GW_point']].loc[(players_df_clean['GW']=='8.0')]

In [121]:
player_details.head(20)

Unnamed: 0,name,name_GW,GW_point
1,Granit Xhaka,Granit Xhaka_8.0,5.0
4,Thomas Partey,Thomas Partey_8.0,3.0
6,Kieran Tierney,Kieran Tierney_8.0,6.0
8,Benjamin White,Benjamin White_8.0,6.0
9,Eddie Nketiah,Eddie Nketiah_8.0,1.0
11,Bukayo Saka,Bukayo Saka_8.0,9.0
12,Takehiro Tomiyasu,Takehiro Tomiyasu_8.0,1.0
13,Aaron Ramsdale,Aaron Ramsdale_8.0,6.0
14,Gabriel dos Santos Magalhães,Gabriel dos Santos Magalhães_8.0,6.0
16,Albert Sambi Lokonga,Albert Sambi Lokonga_8.0,1.0


In [122]:
player_details = player_details.reset_index()

In [123]:
player_details.shape

(211, 4)

In [124]:
df_predicted = pd.Series(predicted_New)

In [125]:
df_predicted

0      2.00
1      1.00
2      1.00
3      5.28
4      1.00
       ... 
206    2.88
207    1.00
208    1.00
209    2.00
210    1.00
Length: 211, dtype: float64

In [126]:
df_all = pd.concat([player_details, df_predicted], axis=1)
df_all.head()

Unnamed: 0,index,name,name_GW,GW_point,0
0,1,Granit Xhaka,Granit Xhaka_8.0,5.0,2.0
1,4,Thomas Partey,Thomas Partey_8.0,3.0,1.0
2,6,Kieran Tierney,Kieran Tierney_8.0,6.0,1.0
3,8,Benjamin White,Benjamin White_8.0,6.0,5.28
4,9,Eddie Nketiah,Eddie Nketiah_8.0,1.0,1.0


In [133]:
df_all.tail(30)

Unnamed: 0,index,name,name_GW,GW_point,0
181,168,Hugo Lloris,Hugo Lloris_8.0,2.0,3.01
182,170,Harry Kane,Harry Kane_8.0,10.0,7.0
183,171,Son Heung-min,Son Heung-min_8.0,19.0,1.98
184,173,Eric Dier,Eric Dier_8.0,7.0,2.0
185,176,Pierre-Emile Højbjerg,Pierre-Emile Højbjerg_8.0,5.0,2.81
186,178,Davinson Sánchez,Davinson Sánchez_8.0,0.0,1.0
187,179,Ryan Sessegnon,Ryan Sessegnon_8.0,1.0,1.0
188,183,Rodrigo Bentancur,Rodrigo Bentancur_8.0,12.0,1.98
189,186,Cristian Romero,Cristian Romero_8.0,1.0,1.0
190,187,Yves Bissouma,Yves Bissouma_8.0,1.0,1.0


In [131]:
# Evaluate model.
RSME_score = mean_squared_error(y_true=df_all['GW_point'], y_pred=df_predicted, squared=False) #squared=False will RMSE instead of MSE
R2_score = r2_score(df_all['GW_point'], df_predicted)

print('RMSE:', RSME_score)
print('R-Squared:', R2_score)
print()

RMSE: 3.0486367087586084
R-Squared: -0.019424504437031365



In [128]:
df_all.head(20)

Unnamed: 0,index,name,name_GW,GW_point,0
1,1,Granit Xhaka,Granit Xhaka_8.0,5.0,2.0
4,4,Thomas Partey,Thomas Partey_8.0,3.0,1.0
6,6,Kieran Tierney,Kieran Tierney_8.0,6.0,1.0
8,8,Benjamin White,Benjamin White_8.0,6.0,5.28
9,9,Eddie Nketiah,Eddie Nketiah_8.0,1.0,1.0
11,11,Bukayo Saka,Bukayo Saka_8.0,9.0,4.9
12,12,Takehiro Tomiyasu,Takehiro Tomiyasu_8.0,1.0,1.0
13,13,Aaron Ramsdale,Aaron Ramsdale_8.0,6.0,2.0
14,14,Gabriel dos Santos Magalhães,Gabriel dos Santos Magalhães_8.0,6.0,2.0
16,16,Albert Sambi Lokonga,Albert Sambi Lokonga_8.0,1.0,1.0
