### ACCESSING DATA - TEST DATA 

#### Data Quality and Tidiness Issues

In [1]:
import pickle
import requests
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
#get current season data from FPL API endpoints and identify the keys
fpl_base_url = 'https://fantasy.premierleague.com/api/'
current_season = requests.get(fpl_base_url+'bootstrap-static/').json()
#json = r.json()
current_season.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [3]:
#create dataframes for the current season dictionary keys for data exploration
#- Contains summary of Gameweek data
events_df = pd.DataFrame(current_season['events']) #
phases_df = pd.DataFrame(current_season['phases']) #Shows calendar months for game weeks
teams_df = pd.DataFrame(current_season['teams'])
players_df = pd.DataFrame(current_season['elements'])
element_stats_df = pd.DataFrame(current_season['element_stats'])
element_types_df = pd.DataFrame(current_season['element_types'])

#Code to extract weekly game data for all active players in current season
for x in players_df.index :
    print(x)
    player_id = players_df.id[x]
    url = f'https://fantasy.premierleague.com/api/element-summary/{player_id}/'
    r = requests.get(url)
    json = r.json()
    json_history_df = pd.DataFrame(json['history'])
    json_history_past_df = pd.DataFrame(json['history_past'])

       
    if x == 0 :
        all_history_df = json_history_df
        all_history_past_df = json_history_past_df
    else : 
        all_history_df = all_history_df.append(json_history_df)
        all_history_past_df = all_history_past_df.append(json_history_past_df)

In [4]:
#Code to save the all players game week data in current and past seasons to csvs
#all_history_df.to_csv('/home/laniolao/fpl/FantasyPremierLeague/current_season.csv')
#all_history_past_df.to_csv('/home/laniolao/fpl/FantasyPremierLeague/past_seasons.csv')

In [5]:
df_currentseason = pd.read_csv('current_season.csv')

df_currentseason.head()

Unnamed: 0.1,Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,...,bps,influence,creativity,threat,ict_index,value,transfers_balance,selected,transfers_in,transfers_out
0,0,1,1,7,0,False,2022-08-05T19:00:00Z,0,2,1,...,0,0.0,0.0,0.0,0.0,45,0,23970,0,0
1,1,1,11,10,0,True,2022-08-13T14:00:00Z,4,2,2,...,0,0.0,0.0,0.0,0.0,44,-5169,24193,1361,6530
2,2,1,21,3,0,False,2022-08-20T16:30:00Z,0,3,3,...,0,0.0,0.0,0.0,0.0,44,-4337,20960,879,5216
3,3,1,31,9,0,True,2022-08-27T16:30:00Z,2,1,4,...,0,0.0,0.0,0.0,0.0,43,-2988,18825,577,3565
4,4,1,41,2,0,True,2022-08-31T18:30:00Z,2,1,5,...,0,0.0,0.0,0.0,0.0,43,-1611,17790,405,2016


In [6]:
df_currentseason.shape #Test Data

(3579, 32)

In [7]:
#get current season fixtures from FPL API endpoint and create Dataframe

current_season_fixtures = requests.get(fpl_base_url+'fixtures/').json()
fixtures_df = pd.DataFrame(current_season_fixtures)
fixtures_df.head()

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id
0,2292871,,False,False,61,,0,False,,8,,1,,[],2,4,74971
1,2292921,,False,False,111,,0,False,,13,,1,,[],5,4,75021
2,2292870,,False,False,62,,0,False,,5,,3,,[],3,2,74972
3,2292882,,False,False,73,,0,False,,7,,5,,[],2,3,74983
4,2292883,,False,False,74,,0,False,,12,,6,,[],4,4,74984


### CLEANING DATA

In [8]:
# Make a copy of the original piece of test data.
df_currentseason_clean = df_currentseason.copy()
players_df_clean = players_df.copy()
fixtures_df_clean = fixtures_df.copy()

In [9]:
#Map the team names and the player positions into the players_df_clean dataframe
teams_now=dict(zip(teams_df.id, teams_df.short_name))
positions=dict(zip(element_types_df.id, element_types_df.singular_name_short))
players_df_clean['club_name'] = players_df_clean['team'].map(teams_now)
players_df_clean['position'] = players_df_clean['element_type'].map(positions)

In [10]:
# #Update the club names from abbreviations to full names
# players_df_clean["club_name"].replace({'ARS': 'Arsenal', 'AVL': 'Aston Villa', 'BOU': 'Bournemouth', 'BRE': 'Brentford', 'BHA': 'Brighton', 'CHE': 'Chelsea', 'CRY': 'Crystal Palace', 'EVE': 'Everton', 'FUL': 'Fulham',
#        'LEI': 'Leicester', 'LEE': 'Leeds', 'LIV': 'Liverpool', 'MCI': 'Man City', 'MUN': 'Man Utd', 'NEW': 'Newcastle', 'NFO': 'Nottingham Forest', 'SOU': 'Southampton', 'TOT': 'Spurs',
#        'WHU': 'West Ham', 'WOL': 'Wolves'}, inplace=True)
# players_df_clean.club_name

In [11]:
#create the player name feature
players_df_clean['name'] = players_df_clean['first_name'] + ' ' + players_df_clean['second_name']

In [12]:
#Create season_x feature to align with the train data
df_currentseason_clean['season_x'] = df_currentseason_clean.apply(lambda x: "2022-23", axis=1)

In [13]:
df_currentseason_clean.columns

Index(['Unnamed: 0', 'element', 'fixture', 'opponent_team', 'total_points',
       'was_home', 'kickoff_time', 'team_h_score', 'team_a_score', 'round',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out', 'season_x'],
      dtype='object')

In [14]:
players_df_clean[['id', 'first_name', 'second_name', 'name','club_name', 'minutes', 'form',  'bonus', 'bps', 'total_points', 'value_season', 'value_form']].head()

Unnamed: 0,id,first_name,second_name,name,club_name,minutes,form,bonus,bps,total_points,value_season,value_form
0,1,Cédric,Alves Soares,Cédric Alves Soares,ARS,0,0.0,0,0,0,0.0,0.0
1,3,Granit,Xhaka,Granit Xhaka,ARS,537,3.0,2,113,26,5.2,0.6
2,4,Mohamed,Elneny,Mohamed Elneny,ARS,90,0.5,0,15,2,0.5,0.1
3,5,Rob,Holding,Rob Holding,ARS,3,0.5,0,8,2,0.5,0.1
4,6,Thomas,Partey,Thomas Partey,ARS,270,0.8,0,39,8,1.7,0.2


In [15]:
#Map the team names, player names and form into the all current season data player dataframe
teams_map=dict(zip(players_df_clean.id, players_df_clean.name))
club_map=dict(zip(players_df_clean.id, players_df_clean.club_name))
opp_teams_map=dict(zip(players_df_clean.team, players_df_clean.club_name))
form_map=dict(zip(players_df_clean.id, players_df_clean.form))
position_map=dict(zip(players_df_clean.id, players_df_clean.position))
df_currentseason_clean['name'] = df_currentseason_clean['element'].map(teams_map)
df_currentseason_clean['club_name'] = df_currentseason_clean['element'].map(club_map)
df_currentseason_clean['opp_team_name'] = df_currentseason_clean['opponent_team'].map(opp_teams_map)
df_currentseason_clean['form'] = df_currentseason_clean['element'].map(form_map)
df_currentseason_clean['position'] = df_currentseason_clean['element'].map(form_map)

In [16]:
players_df_clean[['web_name', 'club_name']].head()

Unnamed: 0,web_name,club_name
0,Cédric,ARS
1,Xhaka,ARS
2,Elneny,ARS
3,Holding,ARS
4,Partey,ARS


In [17]:
df_currentseason_clean.drop(['Unnamed: 0'], axis=1, inplace=True)

In [18]:
play_zero_minutes = df_currentseason_clean[df_currentseason_clean.minutes == 0].index

In [19]:
df_currentseason_clean.drop(play_zero_minutes, axis = 0, inplace=True)

In [20]:
df_currentseason_clean.shape

(1768, 37)

In [21]:
df_currentseason_clean.rename(columns= { 'round': 'GW' }, inplace=True)

In [22]:
df_currentseason_clean.columns

Index(['element', 'fixture', 'opponent_team', 'total_points', 'was_home',
       'kickoff_time', 'team_h_score', 'team_a_score', 'GW', 'minutes',
       'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out', 'season_x', 'name', 'club_name',
       'opp_team_name', 'form', 'position'],
      dtype='object')

In [23]:
df_currentseason_clean.kickoff_time

6       2022-08-05T19:00:00Z
7       2022-08-13T14:00:00Z
8       2022-08-20T16:30:00Z
9       2022-08-27T16:30:00Z
10      2022-08-31T18:30:00Z
                ...         
3574    2022-08-20T11:30:00Z
3575    2022-08-28T13:00:00Z
3576    2022-08-31T18:30:00Z
3577    2022-09-03T14:00:00Z
3578    2022-09-03T14:00:00Z
Name: kickoff_time, Length: 1768, dtype: object

In [24]:
df_currentseason_clean['game_date'] = df_currentseason_clean['kickoff_time'].str.replace('T', ' ')
df_currentseason_clean['game_date'] = df_currentseason_clean['game_date'].str.replace(':00Z', '')

In [25]:
df_currentseason_clean['game_date'] = pd.to_datetime(df_currentseason_clean['game_date'])

In [26]:
df_currentseason_clean.game_date

6      2022-08-05 19:00:00
7      2022-08-13 14:00:00
8      2022-08-20 16:30:00
9      2022-08-27 16:30:00
10     2022-08-31 18:30:00
               ...        
3574   2022-08-20 11:30:00
3575   2022-08-28 13:00:00
3576   2022-08-31 18:30:00
3577   2022-09-03 14:00:00
3578   2022-09-03 14:00:00
Name: game_date, Length: 1768, dtype: datetime64[ns]

In [27]:
seasons_curr = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]

month_to_curr_season = dict(zip(range(1,13), seasons_curr))
month_to_curr_season

{1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 3, 7: 3, 8: 3, 9: 4, 10: 4, 11: 4, 12: 1}

In [28]:
df_currentseason_clean['game_weather'] = df_currentseason_clean.game_date.dt.month.map(month_to_curr_season) 

In [29]:
df_currentseason_clean.game_weather.value_counts()

3    1446
4     322
Name: game_weather, dtype: int64

In [30]:
df_currentseason_clean.game_date.value_counts()

2022-09-03 14:00:00    179
2022-08-27 14:00:00    153
2022-08-13 14:00:00    147
2022-08-06 14:00:00    118
2022-08-20 14:00:00    114
2022-08-31 18:30:00     89
2022-08-21 13:00:00     60
2022-08-30 18:30:00     59
2022-08-07 13:00:00     59
2022-08-28 13:00:00     59
2022-08-07 15:30:00     32
2022-08-13 16:30:00     32
2022-08-15 19:00:00     31
2022-08-13 11:30:00     31
2022-08-28 15:30:00     31
2022-08-27 16:30:00     30
2022-09-04 15:30:00     30
2022-08-20 16:30:00     30
2022-09-04 13:00:00     30
2022-08-06 16:30:00     30
2022-08-14 15:30:00     30
2022-08-20 11:30:00     30
2022-08-22 19:00:00     30
2022-08-06 11:30:00     29
2022-08-27 11:30:00     29
2022-08-05 19:00:00     29
2022-09-03 11:30:00     29
2022-08-30 18:45:00     29
2022-08-31 19:00:00     28
2022-08-14 13:00:00     28
2022-09-01 19:00:00     27
2022-08-21 15:30:00     27
2022-09-03 16:30:00     27
2022-08-30 19:00:00     26
2022-08-31 18:45:00     26
Name: game_date, dtype: int64

In [31]:
df_currentseason_clean.game_date.dt.hour.value_counts()

14    711
13    236
18    203
19    171
15    150
16    149
11    148
Name: game_date, dtype: int64

In [32]:
import numpy as np
df_currentseason_clean['start_label'] = np.where((df_currentseason_clean['game_date'].dt.hour) < 13, 0, 1)

In [33]:
df_currentseason_clean.start_label.value_counts()

1    1620
0     148
Name: start_label, dtype: int64

In [34]:
df_currentseason_clean[['game_date', 'start_label']].head(100)

Unnamed: 0,game_date,start_label
6,2022-08-05 19:00:00,1
7,2022-08-13 14:00:00,1
8,2022-08-20 16:30:00,1
9,2022-08-27 16:30:00,1
10,2022-08-31 18:30:00,1
...,...,...
170,2022-08-06 14:00:00,1
171,2022-08-13 11:30:00,0
172,2022-08-20 14:00:00,1
173,2022-08-28 13:00:00,1


In [35]:
# Engineer feature tp highlight the game year only.
df_currentseason_clean['year'] = df_currentseason_clean.game_date.dt.year

In [36]:
df_currentseason_clean.drop(['game_date', 'season_x'], axis=1, inplace=True)

df_currentseason_clean

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,GW,minutes,...,transfers_in,transfers_out,name,club_name,opp_team_name,form,position,game_weather,start_label,year
6,3,1,7,2,False,2022-08-05T19:00:00Z,0,2,1,90,...,0,0,Granit Xhaka,ARS,CRY,3.0,3.0,3,1,2022
7,3,11,10,12,True,2022-08-13T14:00:00Z,4,2,2,90,...,9001,9630,Granit Xhaka,ARS,LEI,3.0,3.0,3,1,2022
8,3,21,3,6,False,2022-08-20T16:30:00Z,0,3,3,87,...,137326,25286,Granit Xhaka,ARS,BOU,3.0,3.0,3,1,2022
9,3,31,9,2,True,2022-08-27T16:30:00Z,2,1,4,90,...,77459,34699,Granit Xhaka,ARS,FUL,3.0,3.0,3,1,2022
10,3,41,2,2,True,2022-08-31T18:30:00Z,2,1,5,90,...,49435,38654,Granit Xhaka,ARS,AVL,3.0,3.0,3,1,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3574,589,29,18,2,False,2022-08-20T11:30:00Z,1,0,3,90,...,6482,640,Matheus Luiz Nunes,WOL,TOT,4.0,4.0,3,0,2022
3575,589,40,15,2,True,2022-08-28T13:00:00Z,1,1,4,90,...,11496,2756,Matheus Luiz Nunes,WOL,NEW,4.0,4.0,3,1,2022
3576,589,42,3,3,False,2022-08-31T18:30:00Z,0,0,5,74,...,6340,2638,Matheus Luiz Nunes,WOL,BOU,4.0,4.0,3,1,2022
3577,589,60,17,9,True,2022-09-03T14:00:00Z,1,0,6,90,...,3796,3282,Matheus Luiz Nunes,WOL,SOU,4.0,4.0,4,1,2022


In [37]:
df_currentseason_clean.drop(['opponent_team', 'fixture', 'kickoff_time'], axis=1, inplace=True)

df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 6 to 3578
Data columns (total 36 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   element            1768 non-null   int64  
 1   total_points       1768 non-null   int64  
 2   was_home           1768 non-null   bool   
 3   team_h_score       1768 non-null   int64  
 4   team_a_score       1768 non-null   int64  
 5   GW                 1768 non-null   int64  
 6   minutes            1768 non-null   int64  
 7   goals_scored       1768 non-null   int64  
 8   assists            1768 non-null   int64  
 9   clean_sheets       1768 non-null   int64  
 10  goals_conceded     1768 non-null   int64  
 11  own_goals          1768 non-null   int64  
 12  penalties_saved    1768 non-null   int64  
 13  penalties_missed   1768 non-null   int64  
 14  yellow_cards       1768 non-null   int64  
 15  red_cards          1768 non-null   int64  
 16  saves              1768 

In [38]:
df_currentseason_clean.form = df_currentseason_clean.form.astype(float)

In [39]:
df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 6 to 3578
Data columns (total 36 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   element            1768 non-null   int64  
 1   total_points       1768 non-null   int64  
 2   was_home           1768 non-null   bool   
 3   team_h_score       1768 non-null   int64  
 4   team_a_score       1768 non-null   int64  
 5   GW                 1768 non-null   int64  
 6   minutes            1768 non-null   int64  
 7   goals_scored       1768 non-null   int64  
 8   assists            1768 non-null   int64  
 9   clean_sheets       1768 non-null   int64  
 10  goals_conceded     1768 non-null   int64  
 11  own_goals          1768 non-null   int64  
 12  penalties_saved    1768 non-null   int64  
 13  penalties_missed   1768 non-null   int64  
 14  yellow_cards       1768 non-null   int64  
 15  red_cards          1768 non-null   int64  
 16  saves              1768 

In [40]:
player_details = df_currentseason_clean[['name', 'total_points']]

In [41]:
#Assign seaso_x 
df_currentseason_clean.set_index('year', inplace=True)

df_currentseason_clean

Unnamed: 0_level_0,element,total_points,was_home,team_h_score,team_a_score,GW,minutes,goals_scored,assists,clean_sheets,...,selected,transfers_in,transfers_out,name,club_name,opp_team_name,form,position,game_weather,start_label
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022,3,2,False,0,2,1,90,0,0,1,...,48303,0,0,Granit Xhaka,ARS,CRY,3.0,3.0,3,1
2022,3,12,True,4,2,2,90,1,1,0,...,65418,9001,9630,Granit Xhaka,ARS,LEI,3.0,3.0,3,1
2022,3,6,False,0,3,3,87,0,1,1,...,216726,137326,25286,Granit Xhaka,ARS,BOU,3.0,3.0,3,1
2022,3,2,True,2,1,4,90,0,0,0,...,267951,77459,34699,Granit Xhaka,ARS,FUL,3.0,3.0,3,1
2022,3,2,True,2,1,5,90,0,0,0,...,288460,49435,38654,Granit Xhaka,ARS,AVL,3.0,3.0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,589,2,False,1,0,3,90,0,0,0,...,6801,6482,640,Matheus Luiz Nunes,WOL,TOT,4.0,4.0,3,0
2022,589,2,True,1,1,4,90,0,0,0,...,16866,11496,2756,Matheus Luiz Nunes,WOL,NEW,4.0,4.0,3,1
2022,589,3,False,0,0,5,74,0,0,1,...,21246,6340,2638,Matheus Luiz Nunes,WOL,BOU,4.0,4.0,3,1
2022,589,9,True,1,0,6,90,0,1,1,...,22523,3796,3282,Matheus Luiz Nunes,WOL,SOU,4.0,4.0,4,1


In [42]:
df_currentseason_clean[['name', 'club_name', 'element', 'opp_team_name', 'form', 'total_points', 'team_h_score', 'team_a_score', 'GW', 'was_home']].head()

Unnamed: 0_level_0,name,club_name,element,opp_team_name,form,total_points,team_h_score,team_a_score,GW,was_home
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022,Granit Xhaka,ARS,3,CRY,3.0,2,0,2,1,False
2022,Granit Xhaka,ARS,3,LEI,3.0,12,4,2,2,True
2022,Granit Xhaka,ARS,3,BOU,3.0,6,0,3,3,False
2022,Granit Xhaka,ARS,3,FUL,3.0,2,2,1,4,True
2022,Granit Xhaka,ARS,3,AVL,3.0,2,2,1,5,True


In [43]:
players_df_clean.club_name

0      ARS
1      ARS
2      ARS
3      ARS
4      ARS
      ... 
622    WOL
623    WOL
624    WOL
625    WOL
626    WOL
Name: club_name, Length: 627, dtype: object

In [44]:
#Cleaning the current season dataframes (null values, time series operations for dates)
players_df_clean.chance_of_playing_next_round = players_df_clean.chance_of_playing_next_round.fillna(100.0)
players_df_clean.chance_of_playing_this_round = players_df_clean.chance_of_playing_this_round.fillna(100.0)
players_df_clean.corners_and_indirect_freekicks_order = players_df_clean.corners_and_indirect_freekicks_order.fillna(0)
players_df_clean.direct_freekicks_order = players_df_clean.direct_freekicks_order.fillna(0)
players_df_clean.penalties_order = players_df_clean.penalties_order.fillna(0)
players_df_clean.drop(['id', 'squad_number'], axis=1, inplace=True)
fixtures_df_clean.team_a_score = fixtures_df_clean.team_a_score.fillna(0)
fixtures_df_clean.team_h_score = fixtures_df_clean.team_h_score.fillna(0)
fixtures_df_clean.drop(['stats', 'id'], axis=1, inplace=True)
fixtures_df_clean['kickoff_time'] = fixtures_df_clean['kickoff_time'].str[:-10]
fixtures_df_clean['kickoff_time'] = pd.to_datetime(fixtures_df_clean['kickoff_time'])

In [45]:
#Create Next Game week fixtures
GWfixtures_df = fixtures_df_clean.loc[(fixtures_df_clean['event'] == 8)].copy()
GWfixtures_df.shape

(7, 15)

In [46]:
#Mapping away and home team and their respective strengths
home_strength= dict(zip(teams_df.id, teams_df.strength_overall_home))
away_strength= dict(zip(teams_df.id, teams_df.strength_overall_away))
GWfixtures_df['away_team'] = GWfixtures_df['team_a'].map(teams_now)
GWfixtures_df['home_team'] = GWfixtures_df['team_h'].map(teams_now)
GWfixtures_df['away_team_strength'] = GWfixtures_df['team_a'].map(away_strength)
GWfixtures_df['home_team_strength'] = GWfixtures_df['team_h'].map(home_strength)
GWfixtures_df.head(10)

Unnamed: 0,code,event,finished,finished_provisional,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,away_team_strength,home_team_strength
74,2292880,8.0,False,False,2022-09-16,0,False,False,17,0.0,2,0.0,2,2,74981,SOU,AVL,1100,1090
75,2292887,8.0,False,False,2022-09-16,0,False,False,9,0.0,16,0.0,2,2,74988,FUL,NFO,1090,1045
76,2292889,8.0,False,False,2022-09-17,0,False,False,13,0.0,20,0.0,5,2,74990,MCI,WOL,1370,1100
77,2292886,8.0,False,False,2022-09-17,0,False,False,3,0.0,15,0.0,2,3,74987,BOU,NEW,1070,1110
78,2292888,8.0,False,False,2022-09-17,0,False,False,10,0.0,18,0.0,2,4,74989,LEI,TOT,1100,1210
79,2292881,8.0,False,False,2022-09-18,0,False,False,1,0.0,4,0.0,3,3,74982,ARS,BRE,1270,1100
80,2292884,8.0,False,False,2022-09-18,0,False,False,19,0.0,8,0.0,2,2,74985,WHU,EVE,1150,1080


In [47]:
GWfixtures_df.columns

Index(['code', 'event', 'finished', 'finished_provisional', 'kickoff_time',
       'minutes', 'provisional_start_time', 'started', 'team_a',
       'team_a_score', 'team_h', 'team_h_score', 'team_h_difficulty',
       'team_a_difficulty', 'pulse_id', 'away_team', 'home_team',
       'away_team_strength', 'home_team_strength'],
      dtype='object')

In [48]:
players_df_clean.columns

Index(['chance_of_playing_next_round', 'chance_of_playing_this_round', 'code',
       'cost_change_event', 'cost_change_event_fall', 'cost_change_start',
       'cost_change_start_fall', 'dreamteam_count', 'element_type', 'ep_next',
       'ep_this', 'event_points', 'first_name', 'form', 'in_dreamteam', 'news',
       'news_added', 'now_cost', 'photo', 'points_per_game', 'second_name',
       'selected_by_percent', 'special', 'status', 'team', 'team_code',
       'total_points', 'transfers_in', 'transfers_in_event', 'transfers_out',
       'transfers_out_event', 'value_form', 'value_season', 'web_name',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'influence_rank', 'influence_rank_type',
       'creativity_rank', 'creativity_rank_type', 'threat_rank',
       'threat_rank_type', 

In [49]:
#Create features for data analysis (player-opposition team, game week fixture difficulty index, player's club )
gw_away_players = pd.merge(players_df_clean, GWfixtures_df, how="inner", left_on=["team"], right_on=["team_a"])
gw_away_players['player_opp'] = gw_away_players['web_name'].map(str) + '-' + gw_away_players['home_team'].map(str)
gw_home_players = pd.merge(players_df_clean, GWfixtures_df, how="inner", left_on=["team"], right_on=["team_h"])
gw_home_players['player_opp'] = gw_home_players['web_name'].map(str) + '-' + gw_home_players['away_team'].map(str)
gw_away_players['diff'] = gw_away_players['away_team_strength'] - gw_away_players['home_team_strength']
gw_home_players['diff'] = gw_home_players['home_team_strength'] - gw_home_players['away_team_strength']
players_df_clean = gw_away_players.append(gw_home_players)
players_df_clean['player_club'] = players_df_clean['web_name'].map(str) + '-' + players_df_clean['club_name'].map(str)
players_df_clean.drop(['minutes_y'], axis=1, inplace=True)
players_df_clean.rename(columns = {'minutes_x':'minutes'}, inplace = True)
players_df_clean['ave_minutes'] = players_df_clean['minutes']/6

  players_df_clean = gw_away_players.append(gw_home_players)


In [50]:
df_currentseason_clean.head()

Unnamed: 0_level_0,element,total_points,was_home,team_h_score,team_a_score,GW,minutes,goals_scored,assists,clean_sheets,...,selected,transfers_in,transfers_out,name,club_name,opp_team_name,form,position,game_weather,start_label
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022,3,2,False,0,2,1,90,0,0,1,...,48303,0,0,Granit Xhaka,ARS,CRY,3.0,3.0,3,1
2022,3,12,True,4,2,2,90,1,1,0,...,65418,9001,9630,Granit Xhaka,ARS,LEI,3.0,3.0,3,1
2022,3,6,False,0,3,3,87,0,1,1,...,216726,137326,25286,Granit Xhaka,ARS,BOU,3.0,3.0,3,1
2022,3,2,True,2,1,4,90,0,0,0,...,267951,77459,34699,Granit Xhaka,ARS,FUL,3.0,3.0,3,1
2022,3,2,True,2,1,5,90,0,0,0,...,288460,49435,38654,Granit Xhaka,ARS,AVL,3.0,3.0,3,1


In [51]:
df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 2022 to 2022
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   element            1768 non-null   int64  
 1   total_points       1768 non-null   int64  
 2   was_home           1768 non-null   bool   
 3   team_h_score       1768 non-null   int64  
 4   team_a_score       1768 non-null   int64  
 5   GW                 1768 non-null   int64  
 6   minutes            1768 non-null   int64  
 7   goals_scored       1768 non-null   int64  
 8   assists            1768 non-null   int64  
 9   clean_sheets       1768 non-null   int64  
 10  goals_conceded     1768 non-null   int64  
 11  own_goals          1768 non-null   int64  
 12  penalties_saved    1768 non-null   int64  
 13  penalties_missed   1768 non-null   int64  
 14  yellow_cards       1768 non-null   int64  
 15  red_cards          1768 non-null   int64  
 16  saves              17

In [52]:
df_test = df_currentseason_clean.copy()

In [53]:
df_test.drop(['element', 'name'], axis = 1, inplace=True)

In [54]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 2022 to 2022
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   total_points       1768 non-null   int64  
 1   was_home           1768 non-null   bool   
 2   team_h_score       1768 non-null   int64  
 3   team_a_score       1768 non-null   int64  
 4   GW                 1768 non-null   int64  
 5   minutes            1768 non-null   int64  
 6   goals_scored       1768 non-null   int64  
 7   assists            1768 non-null   int64  
 8   clean_sheets       1768 non-null   int64  
 9   goals_conceded     1768 non-null   int64  
 10  own_goals          1768 non-null   int64  
 11  penalties_saved    1768 non-null   int64  
 12  penalties_missed   1768 non-null   int64  
 13  yellow_cards       1768 non-null   int64  
 14  red_cards          1768 non-null   int64  
 15  saves              1768 non-null   int64  
 16  bonus              17

In [55]:
df_test

Unnamed: 0_level_0,total_points,was_home,team_h_score,team_a_score,GW,minutes,goals_scored,assists,clean_sheets,goals_conceded,...,transfers_balance,selected,transfers_in,transfers_out,club_name,opp_team_name,form,position,game_weather,start_label
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022,2,False,0,2,1,90,0,0,1,0,...,0,48303,0,0,ARS,CRY,3.0,3.0,3,1
2022,12,True,4,2,2,90,1,1,0,2,...,-629,65418,9001,9630,ARS,LEI,3.0,3.0,3,1
2022,6,False,0,3,3,87,0,1,1,0,...,112040,216726,137326,25286,ARS,BOU,3.0,3.0,3,1
2022,2,True,2,1,4,90,0,0,0,1,...,42760,267951,77459,34699,ARS,FUL,3.0,3.0,3,1
2022,2,True,2,1,5,90,0,0,0,1,...,10781,288460,49435,38654,ARS,AVL,3.0,3.0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,2,False,1,0,3,90,0,0,0,1,...,5842,6801,6482,640,WOL,TOT,4.0,4.0,3,0
2022,2,True,1,1,4,90,0,0,0,1,...,8740,16866,11496,2756,WOL,NEW,4.0,4.0,3,1
2022,3,False,0,0,5,74,0,0,1,0,...,3702,21246,6340,2638,WOL,BOU,4.0,4.0,3,1
2022,9,True,1,0,6,90,0,1,1,0,...,514,22523,3796,3282,WOL,SOU,4.0,4.0,4,1


### ENCODING CATEGORICAL FEATURES

- Encoding will be carried out with a feature extraction class in sklearn called `dictvectorizer`

In [56]:
# Descriptive info of categorical features.
df_test[['position', 'opp_team_name', 'club_name', 'was_home']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 2022 to 2022
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   position       1768 non-null   object
 1   opp_team_name  1768 non-null   object
 2   club_name      1768 non-null   object
 3   was_home       1768 non-null   bool  
dtypes: bool(1), object(3)
memory usage: 57.0+ KB


In [57]:
# Convert dataframe to a dictionary.
df_test_dict = df_test.to_dict(orient='records')

In [58]:
df_test_dict[0]

{'total_points': 2,
 'was_home': False,
 'team_h_score': 0,
 'team_a_score': 2,
 'GW': 1,
 'minutes': 90,
 'goals_scored': 0,
 'assists': 0,
 'clean_sheets': 1,
 'goals_conceded': 0,
 'own_goals': 0,
 'penalties_saved': 0,
 'penalties_missed': 0,
 'yellow_cards': 1,
 'red_cards': 0,
 'saves': 0,
 'bonus': 0,
 'bps': 12,
 'influence': 16.6,
 'creativity': 15.0,
 'threat': 2.0,
 'ict_index': 3.4,
 'value': 50,
 'transfers_balance': 0,
 'selected': 48303,
 'transfers_in': 0,
 'transfers_out': 0,
 'club_name': 'ARS',
 'opp_team_name': 'CRY',
 'form': 3.0,
 'position': '3.0',
 'game_weather': 3,
 'start_label': 1}

In [59]:
# Read dictvectorizer.
with open('./model/dv', 'rb') as f_in1:
    dv = pickle.load(f_in1)

In [60]:
# Transform data.
test_encoded = dv.transform(df_test_dict)

In [61]:
test_encoded

array([[ 1.,  0.,  0., ..., 50.,  0.,  1.],
       [ 2.,  1.,  2., ..., 50.,  1.,  0.],
       [ 3.,  1.,  0., ..., 50.,  0.,  0.],
       ...,
       [ 5.,  0.,  0., ..., 50.,  0.,  0.],
       [ 6.,  1.,  3., ..., 50.,  1.,  0.],
       [ 6.,  0.,  0., ..., 55.,  1.,  0.]])

In [62]:
# vocabulary
vocab = dv.vocabulary_

# show vocab
vocab

{'position=MID': 81,
 'assists': 1,
 'bonus': 2,
 'bps': 3,
 'clean_sheets': 4,
 'creativity': 36,
 'goals_conceded': 39,
 'goals_scored': 40,
 'ict_index': 41,
 'influence': 42,
 'minutes': 43,
 'opp_team_name=Spurs': 67,
 'own_goals': 75,
 'penalties_missed': 76,
 'penalties_saved': 77,
 'red_cards': 82,
 'saves': 83,
 'selected': 84,
 'team_a_score': 86,
 'team_h_score': 87,
 'threat': 88,
 'transfers_balance': 89,
 'transfers_in': 90,
 'transfers_out': 91,
 'value': 92,
 'was_home': 93,
 'yellow_cards': 94,
 'GW': 0,
 'club_name=EVE': 14,
 'form': 37,
 'game_weather': 38,
 'start_label': 85,
 'opp_team_name=Crystal Palace': 52,
 'club_name=LEI': 19,
 'position=DEF': 78,
 'opp_team_name=Man Utd': 61,
 'club_name=CHE': 12,
 'position=GK': 80,
 'opp_team_name=Chelsea': 51,
 'club_name=MUN': 23,
 'club_name=BOU': 8,
 'opp_team_name=Bournemouth': 46,
 'club_name=TOT': 31,
 'position=FWD': 79,
 'opp_team_name=West Brom': 72,
 'club_name=LIV': 20,
 'opp_team_name=Southampton': 66,
 'club_

In [63]:
# Check feature names.
dv.feature_names_

['GW',
 'assists',
 'bonus',
 'bps',
 'clean_sheets',
 'club_name=ARS',
 'club_name=AVL',
 'club_name=BHA',
 'club_name=BOU',
 'club_name=BRE',
 'club_name=BUR',
 'club_name=CAR',
 'club_name=CHE',
 'club_name=CRY',
 'club_name=EVE',
 'club_name=FUL',
 'club_name=HUD',
 'club_name=HUL',
 'club_name=LEE',
 'club_name=LEI',
 'club_name=LIV',
 'club_name=MCI',
 'club_name=MID',
 'club_name=MUN',
 'club_name=NEW',
 'club_name=NOR',
 'club_name=SHU',
 'club_name=SOU',
 'club_name=STK',
 'club_name=SUN',
 'club_name=SWA',
 'club_name=TOT',
 'club_name=WAT',
 'club_name=WBA',
 'club_name=WHU',
 'club_name=WOL',
 'creativity',
 'form',
 'game_weather',
 'goals_conceded',
 'goals_scored',
 'ict_index',
 'influence',
 'minutes',
 'opp_team_name=Arsenal',
 'opp_team_name=Aston Villa',
 'opp_team_name=Bournemouth',
 'opp_team_name=Brentford',
 'opp_team_name=Brighton',
 'opp_team_name=Burnley',
 'opp_team_name=Cardiff',
 'opp_team_name=Chelsea',
 'opp_team_name=Crystal Palace',
 'opp_team_name=Eve

In [64]:
# Convert array returned from dictvectorizer to a dataframe.
test_transformed = pd.DataFrame(test_encoded, columns=dv.feature_names_)

test_transformed.head()

Unnamed: 0,GW,assists,bonus,bps,clean_sheets,club_name=ARS,club_name=AVL,club_name=BHA,club_name=BOU,club_name=BRE,...,start_label,team_a_score,team_h_score,threat,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards
0,1.0,0.0,0.0,12.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,2.0,0.0,0.0,0.0,50.0,0.0,1.0
1,2.0,1.0,2.0,35.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,2.0,4.0,28.0,-629.0,9001.0,9630.0,50.0,1.0,0.0
2,3.0,1.0,0.0,25.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,3.0,0.0,6.0,112040.0,137326.0,25286.0,50.0,0.0,0.0
3,4.0,0.0,0.0,9.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,2.0,12.0,42760.0,77459.0,34699.0,50.0,1.0,0.0
4,5.0,0.0,0.0,14.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,2.0,8.0,10781.0,49435.0,38654.0,50.0,1.0,0.0


In [65]:
# Check the shape of the dataframe.
test_transformed.shape

(1768, 95)

In [66]:
# Read in scaler.
with open('./model/min_max_scaler', 'rb') as f_in2:
    scaler = pickle.load(f_in2)

In [67]:
# Transform test data.
test_norm = scaler.transform(test_transformed)

In [68]:
test_norm

array([[0.        , 0.        , 0.        , ..., 0.13131313, 0.        ,
        1.        ],
       [0.02173913, 0.25      , 0.66666667, ..., 0.13131313, 1.        ,
        0.        ],
       [0.04347826, 0.25      , 0.        , ..., 0.13131313, 0.        ,
        0.        ],
       ...,
       [0.08695652, 0.        , 0.        , ..., 0.13131313, 0.        ,
        0.        ],
       [0.10869565, 0.25      , 1.        , ..., 0.13131313, 1.        ,
        0.        ],
       [0.10869565, 0.        , 0.        , ..., 0.18181818, 1.        ,
        0.        ]])

In [69]:
# Read in model.
with open('./model/rf_model.pkl', 'rb') as f_in3:
    model = pickle.load(f_in3)

In [70]:
# Utility function
def evaluate_model(model, x, y):
    """
    Utility function to print the model performance, (RMSE and R-Squared scores)
    model: Fitted model
    x: cross validation features dataset
    y: cross validation target values
    """
    predicted = model.predict(x) #get predictions
    RSME_score = mean_squared_error(y_true=y, y_pred=predicted, squared=False) #squared=False will RMSE instead of MSE
    R2_score = r2_score(y, predicted)
    
    print('RMSE:', RSME_score)
    print('R-Squared:', R2_score)
    print()

In [71]:
predicted = model.predict(test_norm)

In [72]:
predicted[2]

5.69

In [74]:
predicted.shape

(1768,)

In [75]:
player_details = player_details.reset_index()

In [76]:
player_details

Unnamed: 0,index,name,total_points
0,6,Granit Xhaka,2
1,7,Granit Xhaka,12
2,8,Granit Xhaka,6
3,9,Granit Xhaka,2
4,10,Granit Xhaka,2
...,...,...,...
1763,3574,Matheus Luiz Nunes,2
1764,3575,Matheus Luiz Nunes,2
1765,3576,Matheus Luiz Nunes,3
1766,3577,Matheus Luiz Nunes,9


In [77]:
player_details.drop(['index'], axis = 1, inplace=True)

In [78]:
df_predicted = pd.Series(predicted)

In [79]:
df_predicted

0        2.36
1       11.69
2        5.69
3        2.00
4        2.00
        ...  
1763     2.00
1764     2.00
1765     3.63
1766    10.83
1767     1.00
Length: 1768, dtype: float64

In [86]:
df_all = pd.concat([player_details, df_predicted], axis=1)
df_all.rename(columns={0: 'total_points_pred'}, inplace=True)

In [87]:
df_all.head(50)

Unnamed: 0,name,total_points,total_points_pred
0,Granit Xhaka,2,2.36
1,Granit Xhaka,12,11.69
2,Granit Xhaka,6,5.69
3,Granit Xhaka,2,2.0
4,Granit Xhaka,2,2.0
5,Granit Xhaka,2,2.0
6,Mohamed Elneny,2,2.0
7,Rob Holding,1,1.0
8,Rob Holding,1,1.0
9,Thomas Partey,3,4.77


In [89]:
RSME_score = mean_squared_error(y_true=df_test['total_points'], y_pred=predicted, squared=False) #squared=False will RMSE instead of MSE
R2_score = r2_score(df_test['total_points'], predicted)

print('RMSE:', RSME_score)
print('R-Squared:', R2_score)
print()

RMSE: 0.6796434885241031
R-Squared: 0.9469941167694479



In [94]:
player_details

Unnamed: 0_level_0,name,total_points
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2022,Granit Xhaka,2
2022,Granit Xhaka,12
2022,Granit Xhaka,6
2022,Granit Xhaka,2
2022,Granit Xhaka,2
...,...,...
2022,Matheus Luiz Nunes,2
2022,Matheus Luiz Nunes,2
2022,Matheus Luiz Nunes,3
2022,Matheus Luiz Nunes,9
