### ACCESSING DATA - TEST DATA 

#### Data Quality and Tidiness Issues

In [1]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import pickle
from sklearn.metrics import r2_score, mean_squared_error
#get current season data from FPL API endpoints and identify the keys
fpl_base_url = 'https://fantasy.premierleague.com/api/'
current_season = requests.get(fpl_base_url+'bootstrap-static/').json()
#json = r.json()
current_season.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [2]:
#create dataframes for the current season dictionary keys for data exploration
#- Contains summary of Gameweek data
events_df = pd.DataFrame(current_season['events']) #
phases_df = pd.DataFrame(current_season['phases']) #Shows calendar months for game weeks
teams_df = pd.DataFrame(current_season['teams'])
players_df = pd.DataFrame(current_season['elements'])
element_stats_df = pd.DataFrame(current_season['element_stats'])
element_types_df = pd.DataFrame(current_season['element_types'])

#Code to extract weekly game data for all active players in current season
for x in players_df.index :
    print(x)
    player_id = players_df.id[x]
    url = f'https://fantasy.premierleague.com/api/element-summary/{player_id}/'
    r = requests.get(url)
    json = r.json()
    json_history_df = pd.DataFrame(json['history'])
    json_history_past_df = pd.DataFrame(json['history_past'])

       
    if x == 0 :
        all_history_df = json_history_df
        #all_history_past_df = json_history_past_df
    else : 
        all_history_df = all_history_df.append(json_history_df)
        #all_history_past_df = all_history_past_df.append(json_history_past_df)

In [3]:
#Code to save the all players game week data in current and past seasons to csvs
#all_history_df.to_csv('/home/laniolao/fpl/FantasyPremierLeague/current_season.csv')
#all_history_past_df.to_csv('/home/laniolao/fpl/FantasyPremierLeague/past_seasons.csv')

In [4]:
# Read Data
df_currentseason = pd.read_csv('current_season.csv')
df_currentseason.head()

Unnamed: 0.1,Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,...,bps,influence,creativity,threat,ict_index,value,transfers_balance,selected,transfers_in,transfers_out
0,0,1,1,7,0,False,2022-08-05T19:00:00Z,0,2,1,...,0,0.0,0.0,0.0,0.0,45,0,23970,0,0
1,1,1,11,10,0,True,2022-08-13T14:00:00Z,4,2,2,...,0,0.0,0.0,0.0,0.0,44,-5169,24193,1361,6530
2,2,1,21,3,0,False,2022-08-20T16:30:00Z,0,3,3,...,0,0.0,0.0,0.0,0.0,44,-4337,20960,879,5216
3,3,1,31,9,0,True,2022-08-27T16:30:00Z,2,1,4,...,0,0.0,0.0,0.0,0.0,43,-2988,18825,577,3565
4,4,1,41,2,0,True,2022-08-31T18:30:00Z,2,1,5,...,0,0.0,0.0,0.0,0.0,43,-1611,17790,405,2016


In [5]:
# Print all columns
df_currentseason.columns

Index(['Unnamed: 0', 'element', 'fixture', 'opponent_team', 'total_points',
       'was_home', 'kickoff_time', 'team_h_score', 'team_a_score', 'round',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out'],
      dtype='object')

In [6]:
#Descriptive information on features
df_currentseason.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3579 entries, 0 to 3578
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         3579 non-null   int64  
 1   element            3579 non-null   int64  
 2   fixture            3579 non-null   int64  
 3   opponent_team      3579 non-null   int64  
 4   total_points       3579 non-null   int64  
 5   was_home           3579 non-null   bool   
 6   kickoff_time       3579 non-null   object 
 7   team_h_score       3579 non-null   int64  
 8   team_a_score       3579 non-null   int64  
 9   round              3579 non-null   int64  
 10  minutes            3579 non-null   int64  
 11  goals_scored       3579 non-null   int64  
 12  assists            3579 non-null   int64  
 13  clean_sheets       3579 non-null   int64  
 14  goals_conceded     3579 non-null   int64  
 15  own_goals          3579 non-null   int64  
 16  penalties_saved    3579 

In [7]:
# Check for any missing values.
df_currentseason.isnull().values.any()

False

In [8]:
#Check for duplicates on each row.
df_currentseason.duplicated().value_counts()

False    3579
dtype: int64

In [9]:
#Check for unique values
df_currentseason.nunique()

Unnamed: 0              6
element               624
fixture                60
opponent_team          20
total_points           25
was_home                2
kickoff_time           35
team_h_score            8
team_a_score            4
round                   6
minutes                81
goals_scored            4
assists                 4
clean_sheets            2
goals_conceded         10
own_goals               2
penalties_saved         2
penalties_missed        2
yellow_cards            2
red_cards               2
saves                  10
bonus                   4
bps                    65
influence             269
creativity            368
threat                 88
ict_index             161
value                  69
transfers_balance    2646
selected             3463
transfers_in         2249
transfers_out        2669
dtype: int64

In [10]:
#Descriptive Statistics
df_currentseason.describe()

Unnamed: 0.1,Unnamed: 0,element,fixture,opponent_team,total_points,team_h_score,team_a_score,round,minutes,goals_scored,...,bps,influence,creativity,threat,ict_index,value,transfers_balance,selected,transfers_in,transfers_out
count,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,...,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0
mean,2.451802,298.993294,30.993015,10.536742,1.37245,1.845488,1.08019,3.548198,33.070131,0.045543,...,6.014809,7.328416,4.911679,5.676167,1.790975,50.579771,0.005867561,233449.0,21220.16,21220.16
std,1.708692,172.640918,17.340367,5.760302,2.497205,1.721169,0.838893,1.708692,40.095865,0.23853,...,9.551605,12.799157,10.650773,13.773295,3.048137,11.746198,91879.12,675331.0,80427.54,68221.26
min,0.0,1.0,1.0,1.0,-4.0,0.0,0.0,1.0,0.0,0.0,...,-15.0,0.0,0.0,0.0,0.0,39.0,-1999266.0,0.0,0.0,0.0
25%,1.0,150.0,16.0,6.0,0.0,1.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,45.0,-3769.5,7592.0,35.0,368.5
50%,2.0,299.0,31.0,11.0,0.0,2.0,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,45.0,-414.0,29989.0,1007.0,2555.0
75%,4.0,448.0,46.0,16.0,2.0,2.0,2.0,5.0,90.0,0.0,...,10.0,10.8,3.0,3.0,2.6,54.0,0.0,131212.0,6846.0,14210.0
max,5.0,624.0,60.0,20.0,22.0,9.0,3.0,6.0,90.0,3.0,...,81.0,125.8,97.3,129.0,30.4,130.0,1332069.0,8026017.0,1384843.0,2041377.0


In [11]:
# Code definition deal with white space in the dataframes
def whitespace_remover(dataframe):
   
    # iterating over the columns
    for i in dataframe.columns:
         
        # checking datatype of each columns
        if dataframe[i].dtype == 'object':
             
            # applying strip function on column
            dataframe[i] = dataframe[i].map(str.strip)
        else:
             
            # if condn. is False then it will do nothing.
            pass

In [12]:
# Extracting overall current season league table text from html_text
url = "https://fbref.com/en/comps/9/Premier-League-Stats" #current season
data_2022 = requests.get(url)
soup = BeautifulSoup(data_2022.text)
table_2022 = soup.find('table', id='results2022-202391_overall')
# Obtain every title of columns with tag <th>
league_table_2022 = []
for i in table_2022.find_all('td'):
 title = i.text
 league_table_2022.append(title)
league_table_2022 = [league_table_2022[i: i+19] for i in range(0, len(league_table_2022), 19)]
league_table_2022 = pd.DataFrame(league_table_2022, columns = ['Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP', 'xG', 'xGA', 'xGD', 'xGD/90', 'Last 5', 'Attendance', 'Top Team Scorer', 'Goalkeeper', 'Notes'])
league_table_2022['Rank'] = range(1, 1+len(league_table_2022))
league_table_2022.drop(['xG', 'xGA', 'xGD', 'xGD/90',], axis=1, inplace=True)
league_table_2022['year'] = league_table_2022.apply(lambda x: "2022-23", axis=1)
whitespace_remover(league_table_2022)
league_table_2022.head()
#data.text

Unnamed: 0,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,Last 5,Attendance,Top Team Scorer,Goalkeeper,Notes,Rank,year
0,Arsenal,7,6,0,1,17,7,10,18,2.57,W W W L W,60070,Gabriel Jesus - 4,Aaron Ramsdale,,1,2022-23
1,Manchester City,7,5,2,0,23,6,17,17,2.43,D W W D W,53325,Erling Haaland - 11,Ederson,,2,2022-23
2,Tottenham,7,5,2,0,18,7,11,17,2.43,W W D W W,61530,Harry Kane - 6,Hugo Lloris,,3,2022-23
3,Brighton,6,4,1,1,11,5,6,13,2.17,D W W L W,31230,Alexis Mac Allister - 4,Robert Sánchez,,4,2022-23
4,Manchester Utd,6,4,0,2,8,8,0,12,2.0,L W W W W,73763,Marcus Rashford - 3,David de Gea,,5,2022-23


In [13]:
# Extracting home and away season league table text from html_text
#url = "https://fbref.com/en/comps/9/Premier-League-Stats" #current season
#data_2022 = requests.get(url)
soup = BeautifulSoup(data_2022.text)
table_hw_2022 = soup.find('table', id='results2022-202391_home_away')
# Obtain every title of columns with tag <th>
league_home_away_2022 = []
for i in table_hw_2022.find_all('td'):
    title = i.text
    league_home_away_2022.append(title)
league_home_away_2022 = [league_home_away_2022[i: i+27] for i in range(0, len(league_home_away_2022), 27)]
league_home_away_2022 = pd.DataFrame(league_home_away_2022, columns = ['Squad', 'H_MP', 'H_W', 'H_D', 'H_L', 'H_GF', 'H_GA', 'H_GD', 'H_Pts', 'H_Pts/MP', 'H_xG', 'H_xGA', 'H_xGD', 'H_xGD/90', 'A_MP', 'A_W', 'A_D', 'A_L', 'A_GF', 'A_GA', 'A_GD', 'A_Pts', 'A_Pts/MP', 'A_xG', 'A_xGA', 'A_xGD', 'A_xGD/90'])
league_home_away_2022.drop(['H_xG', 'H_xGA', 'H_xGD', 'H_xGD/90', 'A_xG', 'A_xGA', 'A_xGD', 'A_xGD/90'], axis=1, inplace=True)
league_home_away_2022['year'] = league_home_away_2022.apply(lambda x: "2022-23", axis=1)
whitespace_remover(league_home_away_2022)
league_home_away_2022.head()

Unnamed: 0,Squad,H_MP,H_W,H_D,H_L,H_GF,H_GA,H_GD,H_Pts,H_Pts/MP,A_MP,A_W,A_D,A_L,A_GF,A_GA,A_GD,A_Pts,A_Pts/MP,year
0,Arsenal,3,3,0,0,8,4,4,9,3.0,4,3,0,1,9,3,6,9,2.25,2022-23
1,Manchester City,3,3,0,0,14,2,12,9,3.0,4,2,2,0,9,4,5,8,2.0,2022-23
2,Tottenham,4,4,0,0,13,4,9,12,3.0,3,1,2,0,5,3,2,5,1.67,2022-23
3,Brighton,3,2,1,0,6,2,4,7,2.33,3,2,0,1,5,3,2,6,2.0,2022-23
4,Manchester Utd,3,2,0,1,6,4,2,6,2.0,3,2,0,1,2,4,-2,6,2.0,2022-23


In [14]:
# Convert Attendance feature to numeric datatype
league_table_2022['Attendance'] = pd.to_numeric(league_table_2022['Attendance'].str.replace(',', ''))

In [15]:
# Changing erroneus datatype
convert_dict = {'MP': int, 'W': int, 'D': int, 'L': int, 'GF': int, 'GA': int, 'GD': int, 'Pts': int, 'Pts/MP': float}
convert_dict1 = {'H_MP': int, 'H_W': int, 'H_D': int, 'H_L': int, 'H_GF': int, 'H_GA': int, 'H_GD': int, 'H_Pts': int, 'H_Pts/MP': float, 'A_MP': int, 'A_W': int, 'A_D': int, 'A_L': int, 'A_GF': int, 'A_GA': int, 'A_GD': int, 'A_Pts': int, 'A_Pts/MP': float}

league_table_2022 = league_table_2022.astype(convert_dict)

league_home_away_2022 = league_home_away_2022.astype(convert_dict1)

In [16]:
#Quality check
league_home_away_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Squad     20 non-null     object 
 1   H_MP      20 non-null     int32  
 2   H_W       20 non-null     int32  
 3   H_D       20 non-null     int32  
 4   H_L       20 non-null     int32  
 5   H_GF      20 non-null     int32  
 6   H_GA      20 non-null     int32  
 7   H_GD      20 non-null     int32  
 8   H_Pts     20 non-null     int32  
 9   H_Pts/MP  20 non-null     float64
 10  A_MP      20 non-null     int32  
 11  A_W       20 non-null     int32  
 12  A_D       20 non-null     int32  
 13  A_L       20 non-null     int32  
 14  A_GF      20 non-null     int32  
 15  A_GA      20 non-null     int32  
 16  A_GD      20 non-null     int32  
 17  A_Pts     20 non-null     int32  
 18  A_Pts/MP  20 non-null     float64
 19  year      20 non-null     object 
dtypes: float64(2), int32(16), object(2

### Extract Squad information

In [17]:
# Extracting Squad stats such as age, possession, players used from html_text
#url = "https://fbref.com/en/comps/9/Premier-League-Stats" #current season
#data_2022 = requests.get(url)
soup = BeautifulSoup(data_2022.text)
table_squad_2022 = soup.find('table', id='stats_squads_standard_for')
# Obtain every title of columns with tag <th>
stats_squad_2022 = []
for i in table_squad_2022.find_all('td'):
    title = i.text
    stats_squad_2022.append(title)
stats_squad_2022 = [stats_squad_2022[i: i+28] for i in range(0, len(stats_squad_2022), 28)]
stats_squad_2022 = pd.DataFrame(stats_squad_2022, columns = ['# Pl', 'Age', 'Poss', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'Gls', 'Ast', 'G+A', 'G-PK', 'G+A-PK', 'xG', 'npxG', 'xA', 'npxG+xA', 'xG', 'xA', 'xG+xA', 'npxG', 'npxG+xA'])
stats_squad_2022.drop(['xG', 'npxG', 'xA', 'npxG+xA', 'xG', 'xA', 'xG+xA', 'npxG', 'npxG+xA'], axis=1, inplace=True)
stats_squad_2022['year'] = stats_squad_2022.apply(lambda x: "2022-23", axis=1)
stats_squad_2022.head()

Unnamed: 0,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,year
0,21,24.5,58.6,7,77,630,7.0,16,12,16,0,0,13,0,2.29,1.71,4.0,2.29,4.0,2022-23
1,19,27.0,50.1,7,77,630,7.0,6,4,6,0,0,18,0,0.86,0.57,1.43,0.86,1.43,2022-23
2,23,26.4,35.3,7,77,630,7.0,6,5,6,0,0,14,0,0.86,0.71,1.57,0.86,1.57,2022-23
3,22,26.2,44.3,7,77,630,7.0,15,8,14,1,1,8,0,2.14,1.14,3.29,2.0,3.14,2022-23
4,19,27.6,50.0,6,66,540,6.0,10,5,7,3,3,7,0,1.67,0.83,2.5,1.17,2.0,2022-23


In [18]:
# Obtain every title of columns with tag <th>
headers = []
for i in table_squad_2022.find_all('th'):
 title = i.text
 headers.append(title)
headers

['',
 'Playing Time',
 'Performance',
 'Per 90 Minutes',
 'Expected',
 'Per 90 Minutes',
 'Squad',
 '# Pl',
 'Age',
 'Poss',
 'MP',
 'Starts',
 'Min',
 '90s',
 'Gls',
 'Ast',
 'G-PK',
 'PK',
 'PKatt',
 'CrdY',
 'CrdR',
 'Gls',
 'Ast',
 'G+A',
 'G-PK',
 'G+A-PK',
 'xG',
 'npxG',
 'xA',
 'npxG+xA',
 'xG',
 'xA',
 'xG+xA',
 'npxG',
 'npxG+xA',
 'Arsenal',
 'Aston Villa',
 'Bournemouth',
 'Brentford',
 'Brighton',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Fulham',
 'Leeds United',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester Utd',
 'Newcastle Utd',
 "Nott'ham Forest",
 'Southampton',
 'Tottenham',
 'West Ham',
 'Wolves']

In [19]:
#Extract Team names for 2022 season
teams = headers[35:]
teams

stats_squad_2022['Squad'] = pd.DataFrame(teams, columns = ['Squad'])

stats_squad_2022.head()

Unnamed: 0,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,G-PK,...,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,year,Squad
0,21,24.5,58.6,7,77,630,7.0,16,12,16,...,0,13,0,2.29,1.71,4.0,2.29,4.0,2022-23,Arsenal
1,19,27.0,50.1,7,77,630,7.0,6,4,6,...,0,18,0,0.86,0.57,1.43,0.86,1.43,2022-23,Aston Villa
2,23,26.4,35.3,7,77,630,7.0,6,5,6,...,0,14,0,0.86,0.71,1.57,0.86,1.57,2022-23,Bournemouth
3,22,26.2,44.3,7,77,630,7.0,15,8,14,...,1,8,0,2.14,1.14,3.29,2.0,3.14,2022-23,Brentford
4,19,27.6,50.0,6,66,540,6.0,10,5,7,...,3,7,0,1.67,0.83,2.5,1.17,2.0,2022-23,Brighton


#### Observation
- No missing records
- No duplicate observation

In [20]:
#get current season fixtures from FPL API endpoint and create Dataframe

current_season_fixtures = requests.get(fpl_base_url+'fixtures/').json()
fixtures_df = pd.DataFrame(current_season_fixtures)
fixtures_df.head()

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id
0,2292871,,False,False,61,,0,False,,8,,1,,[],2,4,74971
1,2292921,,False,False,111,,0,False,,13,,1,,[],5,4,75021
2,2292870,,False,False,62,,0,False,,5,,3,,[],3,2,74972
3,2292882,,False,False,73,,0,False,,7,,5,,[],2,3,74983
4,2292883,,False,False,74,,0,False,,12,,6,,[],4,4,74984


In [21]:
#Print all columns
fixtures_df.columns

Index(['code', 'event', 'finished', 'finished_provisional', 'id',
       'kickoff_time', 'minutes', 'provisional_start_time', 'started',
       'team_a', 'team_a_score', 'team_h', 'team_h_score', 'stats',
       'team_h_difficulty', 'team_a_difficulty', 'pulse_id'],
      dtype='object')

In [22]:
# Descriptive information on features.
fixtures_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   code                    380 non-null    int64  
 1   event                   366 non-null    float64
 2   finished                380 non-null    bool   
 3   finished_provisional    380 non-null    bool   
 4   id                      380 non-null    int64  
 5   kickoff_time            366 non-null    object 
 6   minutes                 380 non-null    int64  
 7   provisional_start_time  380 non-null    bool   
 8   started                 366 non-null    object 
 9   team_a                  380 non-null    int64  
 10  team_a_score            67 non-null     float64
 11  team_h                  380 non-null    int64  
 12  team_h_score            67 non-null     float64
 13  stats                   380 non-null    object 
 14  team_h_difficulty       380 non-null    in

In [23]:
# Check for any missing values.
fixtures_df.isnull().values.any()

True

In [24]:
True
# Check missing values for each feature.
fixtures_df.isna().sum()

code                        0
event                      14
finished                    0
finished_provisional        0
id                          0
kickoff_time               14
minutes                     0
provisional_start_time      0
started                    14
team_a                      0
team_a_score              313
team_h                      0
team_h_score              313
stats                       0
team_h_difficulty           0
team_a_difficulty           0
pulse_id                    0
dtype: int64

In [25]:
# Descriptive statistics.
fixtures_df.describe()

Unnamed: 0,code,event,id,minutes,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id
count,380.0,366.0,380.0,380.0,380.0,67.0,380.0,67.0,380.0,380.0,380.0
mean,2293000.0,19.956284,190.5,15.868421,10.5,1.149254,10.5,1.80597,2.55,2.95,75100.5
std,109.8408,10.929826,109.840794,34.343193,5.773884,0.925297,5.773884,1.751526,0.805735,1.024824,109.840794
min,2292810.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,2.0,74911.0
25%,2292905.0,11.0,95.75,0.0,5.75,0.0,5.75,1.0,2.0,2.0,75005.75
50%,2293000.0,20.0,190.5,0.0,10.5,1.0,10.5,2.0,2.0,3.0,75100.5
75%,2293094.0,29.0,285.25,0.0,15.25,2.0,15.25,2.0,3.0,4.0,75195.25
max,2293189.0,38.0,380.0,90.0,20.0,3.0,20.0,9.0,5.0,5.0,75290.0


#### Observation
- Some missing records related to future games

### FEATURE ENGINEERING

In [26]:
# Make a copy of the original piece of test data.
df_currentseason_clean = df_currentseason.copy()
players_df_clean = players_df.copy()
fixtures_df_clean = fixtures_df.copy()
league_table_2022_clean = league_table_2022.copy()
league_home_away_2022_clean = league_home_away_2022.copy()
stats_squad_2022_clean = stats_squad_2022.copy()

In [27]:
#Function to compute overall team strength, attack strength and defence strength
def calc_team_strength(players):
    players['team_strength'] = 1000
    players['team_strength'] += (players['W'] / players['MP'])*100
    players['team_strength'] -= (players['D'] / players['MP'])*100
    players['team_strength'] -= (players['L'] / players['MP'])*100
    players['attack_strength'] = 1000 
    players['attack_strength'] += (players['GF'] / players['MP'])*100
    players['defence_strength'] = 1000 
    players['defence_strength'] -= (players['GA'] / players['MP'])*100
    return players

In [28]:
league_table_2022_clean = calc_team_strength(league_table_2022_clean)
league_table_2022_clean.head(20)

Unnamed: 0,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,Last 5,Attendance,Top Team Scorer,Goalkeeper,Notes,Rank,year,team_strength,attack_strength,defence_strength
0,Arsenal,7,6,0,1,17,7,10,18,2.57,W W W L W,60070,Gabriel Jesus - 4,Aaron Ramsdale,,1,2022-23,1071.428571,1242.857143,900.0
1,Manchester City,7,5,2,0,23,6,17,17,2.43,D W W D W,53325,Erling Haaland - 11,Ederson,,2,2022-23,1042.857143,1328.571429,914.285714
2,Tottenham,7,5,2,0,18,7,11,17,2.43,W W D W W,61530,Harry Kane - 6,Hugo Lloris,,3,2022-23,1042.857143,1257.142857,900.0
3,Brighton,6,4,1,1,11,5,6,13,2.17,D W W L W,31230,Alexis Mac Allister - 4,Robert Sánchez,,4,2022-23,1033.333333,1183.333333,916.666667
4,Manchester Utd,6,4,0,2,8,8,0,12,2.0,L W W W W,73763,Marcus Rashford - 3,David de Gea,,5,2022-23,1033.333333,1133.333333,866.666667
5,Fulham,7,3,2,2,12,11,1,11,1.57,W L W L W,22076,Aleksandar Mitrović - 6,Bernd Leno,,6,2022-23,985.714286,1171.428571,842.857143
6,Chelsea,6,3,1,2,8,9,-1,10,1.67,D L W L W,39941,Raheem Sterling - 3,Edouard Mendy,,7,2022-23,1000.0,1133.333333,850.0
7,Liverpool,6,2,3,1,15,6,9,9,1.5,D L W W D,53205,"Luis Díaz, Roberto Firmino - 3",Alisson,,8,2022-23,966.666667,1250.0,900.0
8,Brentford,7,2,3,2,15,12,3,9,1.29,L D D W L,17069,Ivan Toney - 5,David Raya,,9,2022-23,957.142857,1214.285714,828.571429
9,Newcastle Utd,7,1,5,1,8,7,1,8,1.14,D D L D D,52151,"Alexander Isak, Callum Wilson - 2",Nick Pope,,10,2022-23,928.571429,1114.285714,900.0


In [29]:
#Function to compute home and away overall strength, attack strength and defence strength
def calc_hw_team_strength(players):
    players['home_team_strength'] = 1000
    players['home_team_strength'] += (players['H_W'] / players['H_MP'])*100
    players['home_team_strength'] -= (players['H_D'] / players['H_MP'])*100
    players['home_team_strength'] -= (players['H_L'] / players['H_MP'])*100
    players['away_team_strength'] = 1000
    players['away_team_strength'] += (players['A_W'] / players['A_MP'])*100
    players['away_team_strength'] -= (players['A_D'] / players['A_MP'])*100
    players['away_team_strength'] -= (players['A_L'] / players['A_MP'])*100
    players['home_attack_strength'] = 1000 
    players['home_attack_strength'] += (players['H_GF'] / players['H_MP'])*100
    players['home_defence_strength'] = 1000 
    players['home_defence_strength'] -= (players['H_GA'] / players['H_MP'])*100
    players['away_attack_strength'] = 1000 
    players['away_attack_strength'] += (players['A_GF'] / players['A_MP'])*100
    players['away_defence_strength'] = 1000 
    players['away_defence_strength'] -= (players['A_GA'] / players['A_MP'])*100
    return players

In [30]:
league_home_away_2022_clean = calc_hw_team_strength(league_home_away_2022_clean)
league_home_away_2022_clean.head(20)

Unnamed: 0,Squad,H_MP,H_W,H_D,H_L,H_GF,H_GA,H_GD,H_Pts,H_Pts/MP,...,A_GD,A_Pts,A_Pts/MP,year,home_team_strength,away_team_strength,home_attack_strength,home_defence_strength,away_attack_strength,away_defence_strength
0,Arsenal,3,3,0,0,8,4,4,9,3.0,...,6,9,2.25,2022-23,1100.0,1050.0,1266.666667,866.666667,1225.0,925.0
1,Manchester City,3,3,0,0,14,2,12,9,3.0,...,5,8,2.0,2022-23,1100.0,1000.0,1466.666667,933.333333,1225.0,900.0
2,Tottenham,4,4,0,0,13,4,9,12,3.0,...,2,5,1.67,2022-23,1100.0,966.666667,1325.0,900.0,1166.666667,900.0
3,Brighton,3,2,1,0,6,2,4,7,2.33,...,2,6,2.0,2022-23,1033.333333,1033.333333,1200.0,933.333333,1166.666667,900.0
4,Manchester Utd,3,2,0,1,6,4,2,6,2.0,...,-2,6,2.0,2022-23,1033.333333,1033.333333,1200.0,866.666667,1066.666667,866.666667
5,Fulham,3,2,1,0,7,5,2,7,2.33,...,-1,4,1.0,2022-23,1033.333333,950.0,1233.333333,833.333333,1125.0,850.0
6,Chelsea,3,2,1,0,6,4,2,7,2.33,...,-3,3,1.0,2022-23,1033.333333,966.666667,1200.0,866.666667,1066.666667,833.333333
7,Liverpool,3,2,1,0,12,2,10,7,2.33,...,-1,2,0.67,2022-23,1033.333333,900.0,1400.0,933.333333,1100.0,866.666667
8,Brentford,4,2,1,1,10,6,4,7,1.75,...,-1,2,0.67,2022-23,1000.0,900.0,1250.0,850.0,1166.666667,800.0
9,Newcastle Utd,4,1,3,0,6,4,2,6,1.5,...,-1,2,0.67,2022-23,950.0,900.0,1150.0,900.0,1066.666667,900.0


In [31]:
#Code to save the scraped data to csvs
#league_table_2022_clean.to_csv('/home/laniolao/fpl/FantasyPremierLeague/league_2022-2023_standings.csv')
#league_home_away_2022_clean.to_csv('/home/laniolao/fpl/FantasyPremierLeague/league_2022-2023_standings_home_away.csv')

To engineer two new features named club_name and position, we create dictionaries with the teams dataframe and map "team" and "element_type" to engineer the features.

In [32]:
#Map the team names and the player positions into the players_df_clean dataframe
teams_now=dict(zip(teams_df.id, teams_df.short_name))
positions=dict(zip(element_types_df.id, element_types_df.singular_name_short))
players_df_clean['club_name'] = players_df_clean['team'].map(teams_now)
players_df_clean['position'] = players_df_clean['element_type'].map(positions)

In [33]:
#Update the club names from abbreviations to full names
players_df_clean["club_name"].replace({'ARS': 'Arsenal', 'AVL': 'Aston Villa', 'BOU': 'Bournemouth', 'BRE': 'Brentford', 'BHA': 'Brighton', 'CHE': 'Chelsea', 'CRY': 'Crystal Palace', 'EVE': 'Everton', 'FUL': 'Fulham',
       'LEI': 'Leicester City', 'LEE': 'Leeds United', 'LIV': 'Liverpool', 'MCI': 'Manchester City', 'MUN': 'Manchester Utd', 'NEW': 'Newcastle Utd', 'NFO': "Nott'ham Forest", 'SOU': 'Southampton', 'TOT': 'Tottenham',
       'WHU': 'West Ham', 'WOL': 'Wolves'}, inplace=True)
players_df_clean.club_name

0      Arsenal
1      Arsenal
2      Arsenal
3      Arsenal
4      Arsenal
        ...   
626     Wolves
627     Wolves
628     Wolves
629     Wolves
630     Wolves
Name: club_name, Length: 631, dtype: object

In [34]:
#create the player name feature
players_df_clean['name'] = players_df_clean['first_name'] + ' ' + players_df_clean['second_name']

In [35]:
#Create season_x feature to align with the train data
df_currentseason_clean['season_x'] = df_currentseason_clean.apply(lambda x: "2022-23", axis=1)

In [36]:
df_currentseason_clean.columns

Index(['Unnamed: 0', 'element', 'fixture', 'opponent_team', 'total_points',
       'was_home', 'kickoff_time', 'team_h_score', 'team_a_score', 'round',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out', 'season_x'],
      dtype='object')

In [37]:
players_df_clean[['id', 'first_name', 'second_name', 'name','club_name', 'minutes', 'form',  'bonus', 'bps', 'total_points', 'value_season', 'value_form']].head()

Unnamed: 0,id,first_name,second_name,name,club_name,minutes,form,bonus,bps,total_points,value_season,value_form
0,1,Cédric,Alves Soares,Cédric Alves Soares,Arsenal,0,0.0,0,0,0,0.0,0.0
1,3,Granit,Xhaka,Granit Xhaka,Arsenal,627,3.0,2,137,31,6.2,0.6
2,4,Mohamed,Elneny,Mohamed Elneny,Arsenal,90,0.0,0,15,2,0.5,0.0
3,5,Rob,Holding,Rob Holding,Arsenal,3,0.3,0,8,2,0.5,0.1
4,6,Thomas,Partey,Thomas Partey,Arsenal,347,1.0,0,52,11,2.3,0.2


We have dataframes with all the players in the league in the current season (players_df) and the current season individual players performance (df_currentseason_clean). We proceed as described below:
1. Map team strength features from the league standing dataframes to players_df
2. Map the team names, player names and form into the all current season data player dataframe
3. Drop irrelevant column
4. Drop players that have not played any game this season
5. Rename the column 'round' to 'GW' to be similar with the train data set
6. Engineer column 'game_date' feature and format to appropriate dtype
7. Engineer game season weather feature.
8. Engineer feature to highlights early and late games based on start time
9. Engineer feature to highlight the game year only.



In [38]:
#Map the team names, player names and form into the all current season data player dataframe
teams_map=dict(zip(players_df_clean.id, players_df_clean.name))
club_map=dict(zip(players_df_clean.id, players_df_clean.club_name))
opp_teams_map=dict(zip(players_df_clean.team, players_df_clean.club_name))
form_map=dict(zip(players_df_clean.id, players_df_clean.form))
position_map=dict(zip(players_df_clean.id, players_df_clean.position))
league_table_2022_clean['squad_season'] = league_table_2022_clean['Squad'] + '_' + league_table_2022_clean['year']
league_home_away_2022_clean['squad_season'] = league_home_away_2022_clean['Squad'] + '_' + league_home_away_2022_clean['year']
stats_squad_2022_clean['squad_season'] = stats_squad_2022_clean['Squad'] + '_' + stats_squad_2022_clean['year']
df_currentseason_clean['name'] = df_currentseason_clean['element'].map(teams_map)
df_currentseason_clean['club_name'] = df_currentseason_clean['element'].map(club_map)
df_currentseason_clean['opp_team_name'] = df_currentseason_clean['opponent_team'].map(opp_teams_map)
df_currentseason_clean['form'] = df_currentseason_clean['element'].map(form_map)
df_currentseason_clean['position'] = df_currentseason_clean['element'].map(position_map)
df_currentseason_clean['team_season'] = df_currentseason_clean['club_name'] + '_' + df_currentseason_clean['season_x']


In [39]:
# Verify engineered features
df_currentseason_clean[['name', 'club_name', 'opp_team_name', 'form', 'position', 'team_season', 'minutes']].head(50)

Unnamed: 0,name,club_name,opp_team_name,form,position,team_season,minutes
0,Cédric Alves Soares,Arsenal,Crystal Palace,0.0,DEF,Arsenal_2022-23,0
1,Cédric Alves Soares,Arsenal,Leicester City,0.0,DEF,Arsenal_2022-23,0
2,Cédric Alves Soares,Arsenal,Bournemouth,0.0,DEF,Arsenal_2022-23,0
3,Cédric Alves Soares,Arsenal,Fulham,0.0,DEF,Arsenal_2022-23,0
4,Cédric Alves Soares,Arsenal,Aston Villa,0.0,DEF,Arsenal_2022-23,0
5,Cédric Alves Soares,Arsenal,Manchester Utd,0.0,DEF,Arsenal_2022-23,0
6,Granit Xhaka,Arsenal,Crystal Palace,3.0,MID,Arsenal_2022-23,90
7,Granit Xhaka,Arsenal,Leicester City,3.0,MID,Arsenal_2022-23,90
8,Granit Xhaka,Arsenal,Bournemouth,3.0,MID,Arsenal_2022-23,87
9,Granit Xhaka,Arsenal,Fulham,3.0,MID,Arsenal_2022-23,90


In [40]:
# Map the overall strength and defence and attack strength of each team for respective season.
teamstrength=dict(zip(league_table_2022_clean.squad_season, league_table_2022_clean.team_strength))
attackstrength=dict(zip(league_table_2022_clean.squad_season, league_table_2022_clean.attack_strength))
defencestrength=dict(zip(league_table_2022_clean.squad_season, league_table_2022_clean.defence_strength))

df_currentseason_clean['team_strength'] = df_currentseason_clean['team_season'].map(teamstrength)
df_currentseason_clean['attack_strength'] = df_currentseason_clean['team_season'].map(attackstrength)
df_currentseason_clean['defence_strength'] = df_currentseason_clean['team_season'].map(defencestrength)

In [41]:
# Map the home and away overall strength and defence and attack strength of each team for respective season.
hometeamstrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.home_team_strength))
awayteamstrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.away_team_strength))
homeattackstrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.home_attack_strength))
homedefencestrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.home_defence_strength))
awayattackstrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.away_attack_strength))
awaydefencestrength=dict(zip(league_home_away_2022_clean.squad_season, league_home_away_2022_clean.away_defence_strength))


df_currentseason_clean['home_team_strength'] = df_currentseason_clean['team_season'].map(hometeamstrength)
df_currentseason_clean['away_team_strength'] = df_currentseason_clean['team_season'].map(awayteamstrength)
df_currentseason_clean['home_attack_strength'] = df_currentseason_clean['team_season'].map(homeattackstrength)
df_currentseason_clean['home_defence_strength'] = df_currentseason_clean['team_season'].map(homedefencestrength)
df_currentseason_clean['away_attack_strength'] = df_currentseason_clean['team_season'].map(awayattackstrength)
df_currentseason_clean['away_defence_strength'] = df_currentseason_clean['team_season'].map(awaydefencestrength)

In [42]:
# Map the squad average age of each team for respective season.
averageage=dict(zip(stats_squad_2022_clean.squad_season, stats_squad_2022_clean.Age))


df_currentseason_clean['squad_average_age'] = df_currentseason_clean['team_season'].map(averageage)

In [43]:
#Drop irrelevant feature
df_currentseason_clean.drop(['Unnamed: 0'], axis=1, inplace=True)

In [44]:
# Drop players that have not played any game this season from the df_currentseason dataframe
play_zero_minutes = df_currentseason_clean[df_currentseason_clean.minutes == 0].index
df_currentseason_clean.drop(play_zero_minutes, axis = 0, inplace=True)

In [45]:
#Rename column 'round' to 'GW' to be similar to train data
df_currentseason_clean.rename(columns= { 'round': 'GW' }, inplace=True)

In [46]:
#Preview kickoff_time column
df_currentseason_clean.kickoff_time

6       2022-08-05T19:00:00Z
7       2022-08-13T14:00:00Z
8       2022-08-20T16:30:00Z
9       2022-08-27T16:30:00Z
10      2022-08-31T18:30:00Z
                ...         
3574    2022-08-20T11:30:00Z
3575    2022-08-28T13:00:00Z
3576    2022-08-31T18:30:00Z
3577    2022-09-03T14:00:00Z
3578    2022-09-03T14:00:00Z
Name: kickoff_time, Length: 1768, dtype: object

In [47]:
# Feature Engineer column 'game_date' and format to appropriate dtype
df_currentseason_clean['game_date'] = df_currentseason_clean['kickoff_time'].str.replace('T', ' ')
df_currentseason_clean['game_date'] = df_currentseason_clean['game_date'].str.replace(':00Z', '')
df_currentseason_clean['game_date'] = pd.to_datetime(df_currentseason_clean['game_date'])
df_currentseason_clean.game_date

6      2022-08-05 19:00:00
7      2022-08-13 14:00:00
8      2022-08-20 16:30:00
9      2022-08-27 16:30:00
10     2022-08-31 18:30:00
               ...        
3574   2022-08-20 11:30:00
3575   2022-08-28 13:00:00
3576   2022-08-31 18:30:00
3577   2022-09-03 14:00:00
3578   2022-09-03 14:00:00
Name: game_date, Length: 1768, dtype: datetime64[ns]

In [48]:
# Engineer game season weather feature.
seasons_curr = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]

month_to_curr_season = dict(zip(range(1,13), seasons_curr))
df_currentseason_clean['game_weather'] = df_currentseason_clean.game_date.dt.month.map(month_to_curr_season) 

In [49]:
# Data Quality Check
df_currentseason_clean.game_weather.value_counts()

3    1446
4     322
Name: game_weather, dtype: int64

In [50]:
# Engineer feature to highlights games that started before 13:00 (early starts) and those that started after 13:00 (late
df_currentseason_clean['start_label'] = np.where((df_currentseason_clean['game_date'].dt.hour) < 13, 0, 1)

In [51]:
# Quality Check
df_currentseason_clean.start_label.value_counts()

1    1620
0     148
Name: start_label, dtype: int64

In [52]:
df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 6 to 3578
Data columns (total 51 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   element                1768 non-null   int64         
 1   fixture                1768 non-null   int64         
 2   opponent_team          1768 non-null   int64         
 3   total_points           1768 non-null   int64         
 4   was_home               1768 non-null   bool          
 5   kickoff_time           1768 non-null   object        
 6   team_h_score           1768 non-null   int64         
 7   team_a_score           1768 non-null   int64         
 8   GW                     1768 non-null   int64         
 9   minutes                1768 non-null   int64         
 10  goals_scored           1768 non-null   int64         
 11  assists                1768 non-null   int64         
 12  clean_sheets           1768 non-null   int64         
 13  goa

Data Quality
Data quality issues are mostly divided into four:

- Completeness: do we have all of the records that we should? Do we have missing records or not? Are there specific rows, columns, or cells missing?
- Validity: we have the records, but they're not valid, i.e., they don't conform to a defined schema. A schema is a defined set of rules for data. These rules can be real-world constraints (e.g. negative height is impossible) and table-specific constraints (e.g. unique key constraints in tables).
- Accuracy: inaccurate data is wrong data that is valid. It adheres to the defined schema, but it is still incorrect.
- Consistency: inconsistent data is both valid and accurate, but there are multiple correct ways of referring to the same thing. Consistency, i.e., a standard format, in columns that represent the same data across tables and/or within tables is desired.

After assessing the data, we have the following issues:

1. Erroneous data types in the following data frames
- df_currentseason dataframe (form).
- fixtures_df_clean (kickoff_time)
2. Redundant features in all the following data frames
- df_currentseason dataframe (game_date, season_x, opponent_team, fixture, kickoff_time, element and name)
- players_df dataframe (id, squad_number)
- fixtures_df (stats, id)
3. Null values in the following data frames
- players_df dataframe (chance_of_playing_next_round, chance_of_playing_this_round, corners_and_indirect_freekicks_order, direct_freekicks_order, penalties_order)
- fixtures_df_clean (team_a_score, team_h_score)
## Data Tidiness
There are three main requirements for tidiness.

1. Each variable forms a column,
2. Each observation forms a row, and
3. Each type of observational unit forms a table.
The three above criteria's are fairly met by the dataset.

### CLEANING DATA

Issue #1:
- Erroneous data types in respective data frames

Define
- Change form feature to appropriate data type

Code

In [53]:
#Chnage 'form' feature dtype
df_currentseason_clean.form = df_currentseason_clean.form.astype(float)

fixtures_df_clean['kickoff_time'] = pd.to_datetime(fixtures_df_clean['kickoff_time'])

Issue #2:
- Redundant features in respective data frames

Define
- Create player_details data frame to save the name and total point features for prediction validation purpose
- Drop all identified redundant features in respective data frames

Code

In [54]:
#Drop features no longer needed in the dataframe
GWfixtures_df = fixtures_df_clean.loc[(fixtures_df_clean['event'] == 9)].copy()
player_details = df_currentseason_clean[['name', 'total_points']].loc[(df_currentseason_clean['GW']==9)]
df_currentseason_clean.drop(['season_x', 'opponent_team', 'fixture', 'kickoff_time', 'element', 'name'], axis=1, inplace=True)
players_df_clean.drop(['id', 'squad_number'], axis=1, inplace=True)
fixtures_df_clean.drop(['stats', 'id'], axis=1, inplace=True)

Issue #3:
- Null values in respective data frames

Define
- Fill all identified features in respective data frames

Code

In [55]:
#Cleaning the current season dataframes (null values, time series operations for dates)
players_df_clean.chance_of_playing_next_round = players_df_clean.chance_of_playing_next_round.fillna(100.0)
players_df_clean.chance_of_playing_this_round = players_df_clean.chance_of_playing_this_round.fillna(100.0)
players_df_clean.corners_and_indirect_freekicks_order = players_df_clean.corners_and_indirect_freekicks_order.fillna(0)
players_df_clean.direct_freekicks_order = players_df_clean.direct_freekicks_order.fillna(0)
players_df_clean.penalties_order = players_df_clean.penalties_order.fillna(0)
fixtures_df_clean.team_a_score = fixtures_df_clean.team_a_score.fillna(0)
fixtures_df_clean.team_h_score = fixtures_df_clean.team_h_score.fillna(0)

In [56]:
#Assign year as index
df_currentseason_clean.set_index('game_date', inplace=True)

df_currentseason_clean

Unnamed: 0_level_0,total_points,was_home,team_h_score,team_a_score,GW,minutes,goals_scored,assists,clean_sheets,goals_conceded,...,defence_strength,home_team_strength,away_team_strength,home_attack_strength,home_defence_strength,away_attack_strength,away_defence_strength,squad_average_age,game_weather,start_label
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-08-05 19:00:00,2,False,0,2,1,90,0,0,1,0,...,900.0,1100.0,1050.0,1266.666667,866.666667,1225.000000,925.0,24.5,3,1
2022-08-13 14:00:00,12,True,4,2,2,90,1,1,0,2,...,900.0,1100.0,1050.0,1266.666667,866.666667,1225.000000,925.0,24.5,3,1
2022-08-20 16:30:00,6,False,0,3,3,87,0,1,1,0,...,900.0,1100.0,1050.0,1266.666667,866.666667,1225.000000,925.0,24.5,3,1
2022-08-27 16:30:00,2,True,2,1,4,90,0,0,0,1,...,900.0,1100.0,1050.0,1266.666667,866.666667,1225.000000,925.0,24.5,3,1
2022-08-31 18:30:00,2,True,2,1,5,90,0,0,0,1,...,900.0,1100.0,1050.0,1266.666667,866.666667,1225.000000,925.0,24.5,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-20 11:30:00,2,False,1,0,3,90,0,0,0,1,...,900.0,950.0,900.0,1050.000000,900.000000,1033.333333,900.0,26.2,3,0
2022-08-28 13:00:00,2,True,1,1,4,90,0,0,0,1,...,900.0,950.0,900.0,1050.000000,900.000000,1033.333333,900.0,26.2,3,1
2022-08-31 18:30:00,3,False,0,0,5,74,0,0,1,0,...,900.0,950.0,900.0,1050.000000,900.000000,1033.333333,900.0,26.2,3,1
2022-09-03 14:00:00,9,True,1,0,6,90,0,1,1,0,...,900.0,950.0,900.0,1050.000000,900.000000,1033.333333,900.0,26.2,4,1


In [57]:
df_test = df_currentseason_clean.drop(['bps', 'total_points', 'transfers_balance', 'team_season', 'team_a_score', 'team_h_score', 'selected', 'value'],  axis = 1)

In [58]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1768 entries, 2022-08-05 19:00:00 to 2022-09-03 14:00:00
Data columns (total 36 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   was_home               1768 non-null   bool   
 1   GW                     1768 non-null   int64  
 2   minutes                1768 non-null   int64  
 3   goals_scored           1768 non-null   int64  
 4   assists                1768 non-null   int64  
 5   clean_sheets           1768 non-null   int64  
 6   goals_conceded         1768 non-null   int64  
 7   own_goals              1768 non-null   int64  
 8   penalties_saved        1768 non-null   int64  
 9   penalties_missed       1768 non-null   int64  
 10  yellow_cards           1768 non-null   int64  
 11  red_cards              1768 non-null   int64  
 12  saves                  1768 non-null   int64  
 13  bonus                  1768 non-null   int64  
 14  influence           

In [59]:
# Convert dataframe to a dictionary.
df_test_dict = df_test.to_dict(orient='records')

In [60]:
with open('./model/dv', 'rb') as f_in1:
    dv = pickle.load(f_in1)

In [61]:
# Transform data.
test_encoded = dv.transform(df_test_dict)

In [62]:
# vocabulary
vocab = dv.vocabulary_

# show vocab
vocab

{'position=GK': 86,
 'assists': 1,
 'bonus': 6,
 'clean_sheets': 7,
 'creativity': 38,
 'goals_conceded': 42,
 'goals_scored': 43,
 'ict_index': 47,
 'influence': 48,
 'minutes': 49,
 'opp_team_name=Leicester': 64,
 'own_goals': 81,
 'penalties_missed': 82,
 'penalties_saved': 83,
 'red_cards': 88,
 'saves': 89,
 'threat': 132,
 'transfers_in': 133,
 'transfers_out': 134,
 'was_home': 135,
 'yellow_cards': 136,
 'GW': 0,
 'club_name=Hull City': 20,
 'form': 40,
 'team_strength': 131,
 'attack_strength': 2,
 'defence_strength': 39,
 'home_team_strength': 46,
 'away_team_strength': 5,
 'home_attack_strength': 44,
 'home_defence_strength': 45,
 'away_attack_strength': 3,
 'away_defence_strength': 4,
 'squad_average_age=27.4': 114,
 'game_weather': 41,
 'start_label': 130,
 'position=DEF': 84,
 'opp_team_name=Hull': 62,
 'club_name=Leicester City': 22,
 'squad_average_age=27.8': 118,
 'position=MID': 87,
 'club_name=West Brom': 35,
 'squad_average_age=29.4': 129,
 'position=FWD': 85,
 'clu

In [63]:
# Convert array returned from dictvectorizer to a dataframe.
test_transformed = pd.DataFrame(test_encoded, columns=dv.feature_names_)

test_transformed.head()

Unnamed: 0,GW,assists,attack_strength,away_attack_strength,away_defence_strength,away_team_strength,bonus,clean_sheets,club_name=Arsenal,club_name=Aston Villa,...,squad_average_age=29.0,squad_average_age=29.1,squad_average_age=29.4,start_label,team_strength,threat,transfers_in,transfers_out,was_home,yellow_cards
0,1.0,0.0,1242.857143,1225.0,925.0,1050.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1071.428571,2.0,0.0,0.0,0.0,1.0
1,2.0,1.0,1242.857143,1225.0,925.0,1050.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1071.428571,28.0,9001.0,9630.0,1.0,0.0
2,3.0,1.0,1242.857143,1225.0,925.0,1050.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1071.428571,6.0,137326.0,25286.0,0.0,0.0
3,4.0,0.0,1242.857143,1225.0,925.0,1050.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1071.428571,12.0,77459.0,34699.0,1.0,0.0
4,5.0,0.0,1242.857143,1225.0,925.0,1050.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1071.428571,8.0,49435.0,38654.0,1.0,0.0


In [64]:
# Check the shape of the dataframe.
test_transformed.shape

# Read in scaler.
with open('./model/min_max_scaler', 'rb') as f_in2:
    scaler = pickle.load(f_in2)

In [65]:
# Transform test data.
test_norm = scaler.transform(test_transformed)

In [66]:
# Read in model.
with open('./model/rf_model.pkl', 'rb') as f_in3:
    model = pickle.load(f_in3)

In [67]:
# Utility function
def evaluate_model(model, x, y):
    """
    Utility function to print the model performance, (RMSE and R-Squared scores)
    model: Fitted model
    x: cross validation features dataset
    y: cross validation target values
    """
    predicted = model.predict(x) #get predictions
    RSME_score = mean_squared_error(y_true=y, y_pred=predicted, squared=False) #squared=False will RMSE instead of MSE
    R2_score = r2_score(y, predicted)
    
    print('RMSE:', RSME_score)
    print('R-Squared:', R2_score)
    print()

In [68]:
evaluate_model(model, test_norm, df_currentseason_clean['total_points'])

RMSE: 0.4641378910415012
R-Squared: 0.9752795823511894



In [69]:

df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1768 entries, 2022-08-05 19:00:00 to 2022-09-03 14:00:00
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_points           1768 non-null   int64  
 1   was_home               1768 non-null   bool   
 2   team_h_score           1768 non-null   int64  
 3   team_a_score           1768 non-null   int64  
 4   GW                     1768 non-null   int64  
 5   minutes                1768 non-null   int64  
 6   goals_scored           1768 non-null   int64  
 7   assists                1768 non-null   int64  
 8   clean_sheets           1768 non-null   int64  
 9   goals_conceded         1768 non-null   int64  
 10  own_goals              1768 non-null   int64  
 11  penalties_saved        1768 non-null   int64  
 12  penalties_missed       1768 non-null   int64  
 13  yellow_cards           1768 non-null   int64  
 14  red_cards           

In [70]:
fixtures_df_clean.columns

Index(['code', 'event', 'finished', 'finished_provisional', 'kickoff_time',
       'minutes', 'provisional_start_time', 'started', 'team_a',
       'team_a_score', 'team_h', 'team_h_score', 'team_h_difficulty',
       'team_a_difficulty', 'pulse_id'],
      dtype='object')

In [71]:
#Create Next Game week fixtures
GWfixtures_df = fixtures_df_clean.loc[(fixtures_df_clean['event'] == 9)].copy()
GWfixtures_df.shape

(10, 15)

##To do

Code above needs to be updated to use date time to identify the next game week fixtures as opposed to current use of gameweek event

In [72]:
GWfixtures_df.head(10)

Unnamed: 0,code,event,finished,finished_provisional,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id
81,2292891,9.0,False,False,2022-10-01 11:30:00+00:00,0,False,False,18,0.0,1,0.0,3,4,74991
82,2292890,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,4,0.0,3,0.0,2,2,74992
83,2292892,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,6,0.0,7,0.0,3,3,74993
84,2292893,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,15,0.0,9,0.0,3,2,74994
85,2292896,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,5,0.0,12,0.0,3,5,74997
86,2292898,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,8,0.0,17,0.0,2,2,74999
87,2292899,9.0,False,False,2022-10-01 16:30:00+00:00,0,False,False,20,0.0,19,0.0,2,3,75000
88,2292897,9.0,False,False,2022-10-02 13:00:00+00:00,0,False,False,14,0.0,13,0.0,3,5,74998
89,2292894,9.0,False,False,2022-10-02 15:30:00+00:00,0,False,False,2,0.0,11,0.0,2,2,74995
90,2292895,9.0,False,False,2022-10-03 19:00:00+00:00,0,False,False,16,0.0,10,0.0,2,2,74996


In [73]:
#Mapping away and home teams using dictionary created earlier
GWfixtures_df['away_team'] = GWfixtures_df['team_a'].map(teams_now)
GWfixtures_df['home_team'] = GWfixtures_df['team_h'].map(teams_now)
GWfixtures_df.head(10)

Unnamed: 0,code,event,finished,finished_provisional,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team
81,2292891,9.0,False,False,2022-10-01 11:30:00+00:00,0,False,False,18,0.0,1,0.0,3,4,74991,TOT,ARS
82,2292890,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,4,0.0,3,0.0,2,2,74992,BRE,BOU
83,2292892,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,6,0.0,7,0.0,3,3,74993,CHE,CRY
84,2292893,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,15,0.0,9,0.0,3,2,74994,NEW,FUL
85,2292896,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,5,0.0,12,0.0,3,5,74997,BHA,LIV
86,2292898,9.0,False,False,2022-10-01 14:00:00+00:00,0,False,False,8,0.0,17,0.0,2,2,74999,EVE,SOU
87,2292899,9.0,False,False,2022-10-01 16:30:00+00:00,0,False,False,20,0.0,19,0.0,2,3,75000,WOL,WHU
88,2292897,9.0,False,False,2022-10-02 13:00:00+00:00,0,False,False,14,0.0,13,0.0,3,5,74998,MUN,MCI
89,2292894,9.0,False,False,2022-10-02 15:30:00+00:00,0,False,False,2,0.0,11,0.0,2,2,74995,AVL,LEE
90,2292895,9.0,False,False,2022-10-03 19:00:00+00:00,0,False,False,16,0.0,10,0.0,2,2,74996,NFO,LEI


In [74]:
#Update the away team name from abbreviations to full names
GWfixtures_df["away_team"].replace({'ARS': 'Arsenal', 'AVL': 'Aston Villa', 'BOU': 'Bournemouth', 'BRE': 'Brentford', 'BHA': 'Brighton', 'CHE': 'Chelsea', 'CRY': 'Crystal Palace', 'EVE': 'Everton', 'FUL': 'Fulham',
       'LEI': 'Leicester City', 'LEE': 'Leeds United', 'LIV': 'Liverpool', 'MCI': 'Manchester City', 'MUN': 'Manchester Utd', 'NEW': 'Newcastle Utd', 'NFO': "Nott'ham Forest", 'SOU': 'Southampton', 'TOT': 'Tottenham',
       'WHU': 'West Ham', 'WOL': 'Wolves'}, inplace=True)
GWfixtures_df.away_team

81          Tottenham
82          Brentford
83            Chelsea
84      Newcastle Utd
85           Brighton
86            Everton
87             Wolves
88     Manchester Utd
89        Aston Villa
90    Nott'ham Forest
Name: away_team, dtype: object

In [75]:
#Update the home team name from abbreviations to full names
GWfixtures_df["home_team"].replace({'ARS': 'Arsenal', 'AVL': 'Aston Villa', 'BOU': 'Bournemouth', 'BRE': 'Brentford', 'BHA': 'Brighton', 'CHE': 'Chelsea', 'CRY': 'Crystal Palace', 'EVE': 'Everton', 'FUL': 'Fulham',
       'LEI': 'Leicester City', 'LEE': 'Leeds United', 'LIV': 'Liverpool', 'MCI': 'Manchester City', 'MUN': 'Manchester Utd', 'NEW': 'Newcastle Utd', 'NFO': "Nott'ham Forest", 'SOU': 'Southampton', 'TOT': 'Tottenham',
       'WHU': 'West Ham', 'WOL': 'Wolves'}, inplace=True)
GWfixtures_df.home_team

81            Arsenal
82        Bournemouth
83     Crystal Palace
84             Fulham
85          Liverpool
86        Southampton
87           West Ham
88    Manchester City
89       Leeds United
90     Leicester City
Name: home_team, dtype: object

In [76]:
#Create features for data analysis (player-opposition team, game week fixture difficulty index, player's club )
gw_away_players = pd.merge(players_df_clean, GWfixtures_df, how="inner", left_on=["team"], right_on=["team_a"])
gw_away_players['player_opp'] = gw_away_players['web_name'].map(str) + '-' + gw_away_players['home_team'].map(str)
gw_home_players = pd.merge(players_df_clean, GWfixtures_df, how="inner", left_on=["team"], right_on=["team_h"])
gw_home_players['player_opp'] = gw_home_players['web_name'].map(str) + '-' + gw_home_players['away_team'].map(str)
players_df_clean = gw_away_players.append(gw_home_players)
players_df_clean['player_club'] = players_df_clean['web_name'].map(str) + '-' + players_df_clean['club_name'].map(str)
players_df_clean.drop(['minutes_y'], axis=1, inplace=True)
players_df_clean.rename(columns = {'minutes_x':'minutes'}, inplace = True)
players_df_clean.columns

  players_df_clean = gw_away_players.append(gw_home_players)


Index(['chance_of_playing_next_round', 'chance_of_playing_this_round',
       'code_x', 'cost_change_event', 'cost_change_event_fall',
       'cost_change_start', 'cost_change_start_fall', 'dreamteam_count',
       'element_type', 'ep_next', 'ep_this', 'event_points', 'first_name',
       'form', 'in_dreamteam', 'news', 'news_added', 'now_cost', 'photo',
       'points_per_game', 'second_name', 'selected_by_percent', 'special',
       'status', 'team', 'team_code', 'total_points', 'transfers_in',
       'transfers_in_event', 'transfers_out', 'transfers_out_event',
       'value_form', 'value_season', 'web_name', 'minutes', 'goals_scored',
       'assists', 'clean_sheets', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'saves', 'bonus', 'bps', 'influence', 'creativity', 'threat',
       'ict_index', 'influence_rank', 'influence_rank_type', 'creativity_rank',
       'creativity_rank_type', 'threat_rank', 'threat_rank_type'

In [77]:
# Map the games played so far by each team for respective season to use for averaging
matches_played_map=dict(zip(league_table_2022_clean.Squad, league_table_2022_clean.MP))

players_df_clean['MP'] = players_df_clean['club_name'].map(matches_played_map)

In [78]:
#Updating `players_df_clean` with all features necessary for prediction

players_df_clean["position"].replace({'GKP': 'GK'}, inplace=True)
players_df_clean["assist_ave"] = players_df_clean['assists'] / players_df_clean['MP']
players_df_clean["bonus_ave"] = players_df_clean['bonus'] / players_df_clean['MP']
players_df_clean["clean_sheets_ave"] = players_df_clean['clean_sheets'] / players_df_clean['MP']
players_df_clean["goals_conceded_ave"] = players_df_clean['goals_conceded'] / players_df_clean['MP']
players_df_clean["goals_scored_ave"] = players_df_clean['goals_scored'] / players_df_clean['MP']
players_df_clean["minutes_ave"] = players_df_clean['minutes'] / players_df_clean['MP']
players_df_clean['opp_team_name'] = players_df_clean.apply(lambda x: x['home_team'] if x['home_team'] != x['club_name'] else x['away_team'], axis=1)
players_df_clean["own_goals_ave"] = players_df_clean['own_goals'] / players_df_clean['MP']
players_df_clean["penalties_missed_ave"] = players_df_clean['penalties_missed'] / players_df_clean['MP']
players_df_clean["penalties_saved_ave"] = players_df_clean['penalties_saved'] / players_df_clean['MP']
players_df_clean["red_cards_ave"] = players_df_clean['red_cards'] / players_df_clean['MP']
players_df_clean["saves_ave"] = players_df_clean['saves'] / players_df_clean['MP']
players_df_clean["yellow_cards_ave"] = players_df_clean['yellow_cards'] / players_df_clean['MP']
players_df_clean['was_home'] = players_df_clean.apply(lambda x: 'True' if x['home_team'] == x['club_name'] else 'False', axis=1)
players_df_clean.rename(columns = {'event':'GW'}, inplace = True)


In [79]:
players_df_clean.kickoff_time.value_counts()

2022-10-01 14:00:00+00:00    317
2022-10-03 19:00:00+00:00     68
2022-10-02 15:30:00+00:00     64
2022-10-01 11:30:00+00:00     63
2022-10-02 13:00:00+00:00     62
2022-10-01 16:30:00+00:00     57
Name: kickoff_time, dtype: int64

In [80]:
# Engineer feature to highlights games that started before 13:00 (early starts) and those that started after 13:00 (late starts)
players_df_clean['start_label'] = np.where((players_df_clean['kickoff_time'].dt.hour) < 13, 0, 1)

In [81]:
# Quality Check.
players_df_clean[['kickoff_time', 'start_label']].value_counts()

kickoff_time               start_label
2022-10-01 14:00:00+00:00  1              317
2022-10-03 19:00:00+00:00  1               68
2022-10-02 15:30:00+00:00  1               64
2022-10-01 11:30:00+00:00  0               63
2022-10-02 13:00:00+00:00  1               62
2022-10-01 16:30:00+00:00  1               57
dtype: int64

In [82]:
# Engineer game season weather feature.
seasons = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]

month_to_season = dict(zip(range(1,13), seasons))
players_df_clean['game_weather'] = players_df_clean.kickoff_time.dt.month.map(month_to_season) 

In [83]:
# Data Quality Check.
players_df_clean.game_weather.value_counts()

4    631
Name: game_weather, dtype: int64

In [84]:
# Map the overall strength and defence and attack strength of each team for respective season.
teamstrength1=dict(zip(league_table_2022_clean.Squad, league_table_2022_clean.team_strength))
attackstrength1=dict(zip(league_table_2022_clean.Squad, league_table_2022_clean.attack_strength))
defencestrength1=dict(zip(league_table_2022_clean.Squad, league_table_2022_clean.defence_strength))

players_df_clean['team_strength'] = players_df_clean['club_name'].map(teamstrength1)
players_df_clean['attack_strength'] = players_df_clean['club_name'].map(attackstrength1)
players_df_clean['defence_strength'] = players_df_clean['club_name'].map(defencestrength1)

In [85]:
# Map the home and away overall strength and defence and attack strength of each team for respective season.
hometeamstrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.home_team_strength))
awayteamstrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.away_team_strength))
homeattackstrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.home_attack_strength))
homedefencestrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.home_defence_strength))
awayattackstrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.away_attack_strength))
awaydefencestrength1=dict(zip(league_home_away_2022_clean.Squad, league_home_away_2022_clean.away_defence_strength))


players_df_clean['home_team_strength'] = players_df_clean['club_name'].map(hometeamstrength1)
players_df_clean['away_team_strength'] = players_df_clean['club_name'].map(awayteamstrength1)
players_df_clean['home_attack_strength'] = players_df_clean['club_name'].map(homeattackstrength1)
players_df_clean['home_defence_strength'] = players_df_clean['club_name'].map(homedefencestrength1)
players_df_clean['away_attack_strength'] = players_df_clean['club_name'].map(awayattackstrength1)
players_df_clean['away_defence_strength'] = players_df_clean['club_name'].map(awaydefencestrength1)

## To consider 

We need to figure out how to engineer each clubs performance against different opponents historically

In [86]:
# Map the squad average age of each team for respective season.
averageage1=dict(zip(stats_squad_2022_clean.Squad, stats_squad_2022_clean.Age))


players_df_clean['squad_average_age'] = players_df_clean['club_name'].map(averageage1)

In [87]:
#Quality Check
players_df_clean[['name', 'MP','club_name', 'squad_average_age', 'away_team_strength', 'away_attack_strength', 'away_defence_strength', 'bonus']].sort_values(by='bonus', ascending=False).head(20)

Unnamed: 0,name,MP,club_name,squad_average_age,away_team_strength,away_attack_strength,away_defence_strength,bonus
236,Erling Haaland,7,Manchester City,27.5,1000.0,1225.0,900.0,12
264,Harry Kane,7,Tottenham,27.7,966.666667,1166.666667,900.0,10
23,William Saliba,7,Arsenal,24.5,1050.0,1225.0,925.0,9
75,Alexis Mac Allister,6,Brighton,27.6,1033.333333,1166.666667,900.0,9
34,Ivan Toney,7,Brentford,26.2,900.0,1166.666667,800.0,8
102,Aleksandar Mitrović,7,Fulham,28.2,950.0,1125.0,850.0,7
225,João Cancelo,7,Manchester City,27.5,1000.0,1225.0,900.0,7
190,Roberto Firmino,6,Liverpool,27.9,900.0,1100.0,866.666667,6
218,Nick Pope,7,Newcastle Utd,27.7,900.0,1066.666667,900.0,6
176,Diogo Dalot Teixeira,6,Manchester Utd,26.9,1033.333333,1066.666667,866.666667,6


In [88]:
#Update the home team name from abbreviations to full names

players_df_clean.columns.values

array(['chance_of_playing_next_round', 'chance_of_playing_this_round',
       'code_x', 'cost_change_event', 'cost_change_event_fall',
       'cost_change_start', 'cost_change_start_fall', 'dreamteam_count',
       'element_type', 'ep_next', 'ep_this', 'event_points', 'first_name',
       'form', 'in_dreamteam', 'news', 'news_added', 'now_cost', 'photo',
       'points_per_game', 'second_name', 'selected_by_percent', 'special',
       'status', 'team', 'team_code', 'total_points', 'transfers_in',
       'transfers_in_event', 'transfers_out', 'transfers_out_event',
       'value_form', 'value_season', 'web_name', 'minutes',
       'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'influence_rank', 'influence_rank_type',
       'creativity_rank', 'creativity_rank_type', 'threat_rank',
       'threat_ran

In [89]:
players_df_clean.chance_of_playing_next_round.unique()

array([100.,  25.,   0.,  50.,  75.])

## To do
Drop players with zero chance of playing??

In [90]:

prediction_df = players_df_clean[['position','assist_ave',
       'bonus_ave', 'clean_sheets_ave', 'creativity', 'ict_index', 'influence', 'goals_conceded_ave',
       'goals_scored_ave', 'minutes_ave', 'opp_team_name',
       'own_goals_ave', 'penalties_missed_ave', 'penalties_saved_ave',
       'red_cards_ave', 'saves_ave', 'yellow_cards_ave', 'transfers_in', 'transfers_out', 'threat', 'was_home',
       'start_label', 'GW', 'club_name', 'form', 'game_weather', 'team_strength', 'attack_strength',
       'defence_strength', 'home_team_strength', 'away_team_strength',
       'home_attack_strength', 'home_defence_strength',
       'away_attack_strength', 'away_defence_strength',
       'squad_average_age']]


In [91]:
prediction_df.rename(columns={'own_goals_ave': 'own_goals', 'assist_ave': 'assists', 'bonus_ave': 'bonus', 'clean_sheets_ave':'clean_sheets', 'goals_conceded_ave':'goals_conceded', 'goals_scored_ave':'goals_scored', 'minutes_ave':'minutes', 'penalties_missed_ave': 'penalties_missed', 'penalties_saved_ave':'penalties_saved', 'red_cards_ave':'red_cards', 'saves_ave':'saves', 'yellow_cards_ave':'yellow_cards'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df.rename(columns={'own_goals_ave': 'own_goals', 'assist_ave': 'assists', 'bonus_ave': 'bonus', 'clean_sheets_ave':'clean_sheets', 'goals_conceded_ave':'goals_conceded', 'goals_scored_ave':'goals_scored', 'minutes_ave':'minutes', 'penalties_missed_ave': 'penalties_missed', 'penalties_saved_ave':'penalties_saved', 'red_cards_ave':'red_cards', 'saves_ave':'saves', 'yellow_cards_ave':'yellow_cards'}, inplace=True)


In [92]:
prediction_df.head()

Unnamed: 0,position,assists,bonus,clean_sheets,creativity,ict_index,influence,goals_conceded,goals_scored,minutes,...,team_strength,attack_strength,defence_strength,home_team_strength,away_team_strength,home_attack_strength,home_defence_strength,away_attack_strength,away_defence_strength,squad_average_age
0,MID,0.0,0.0,0.285714,76.0,20.9,33.8,0.428571,0.0,49.571429,...,957.142857,1085.714286,857.142857,1000.0,900.0,1100.0,925.0,1066.666667,766.666667,27.0
1,FWD,0.0,0.142857,0.142857,26.0,16.3,34.0,0.428571,0.142857,36.571429,...,957.142857,1085.714286,857.142857,1000.0,900.0,1100.0,925.0,1066.666667,766.666667,27.0
2,GK,0.0,0.285714,0.142857,0.0,19.8,197.6,1.428571,0.0,90.0,...,957.142857,1085.714286,857.142857,1000.0,900.0,1100.0,925.0,1066.666667,766.666667,27.0
3,DEF,0.0,0.0,0.0,0.6,3.1,30.0,0.142857,0.0,13.0,...,957.142857,1085.714286,857.142857,1000.0,900.0,1100.0,925.0,1066.666667,766.666667,27.0
4,DEF,0.0,0.142857,0.142857,123.2,27.3,111.6,1.428571,0.0,90.0,...,957.142857,1085.714286,857.142857,1000.0,900.0,1100.0,925.0,1066.666667,766.666667,27.0


In [93]:
prediction_df.columns

Index(['position', 'assists', 'bonus', 'clean_sheets', 'creativity',
       'ict_index', 'influence', 'goals_conceded', 'goals_scored', 'minutes',
       'opp_team_name', 'own_goals', 'penalties_missed', 'penalties_saved',
       'red_cards', 'saves', 'yellow_cards', 'transfers_in', 'transfers_out',
       'threat', 'was_home', 'start_label', 'GW', 'club_name', 'form',
       'game_weather', 'team_strength', 'attack_strength', 'defence_strength',
       'home_team_strength', 'away_team_strength', 'home_attack_strength',
       'home_defence_strength', 'away_attack_strength',
       'away_defence_strength', 'squad_average_age'],
      dtype='object')

In [None]:
# Convert dataframe to a dictionary.
prediction_df_dict = prediction_df.to_dict(orient='records')

In [None]:
# Transform data.
prediction_encoded = dv.transform(prediction_df_dict)


In [None]:
# Convert array returned from dictvectorizer to a dataframe.
prediction_transformed = pd.DataFrame(prediction_encoded, columns=dv.feature_names_)

prediction_transformed.head()

In [None]:
# Check the shape of the dataframe.
prediction_transformed.shape

In [None]:
# Transform test data.
prediction_norm = scaler.transform(prediction_transformed)

In [None]:
predicted_New = model.predict(prediction_norm)

In [None]:
player_details = player_details.reset_index()

In [None]:
df_predicted = pd.Series(predicted_New)

In [None]:
df_predicted

In [None]:
df_all = pd.concat([player_details, df_predicted], axis=1)
df_all.head()