### ACCESSING DATA - TEST DATA 

#### Data Quality and Tidiness Issues

In [1]:
import pandas as pd
import requests
#get current season data from FPL API endpoints and identify the keys
fpl_base_url = 'https://fantasy.premierleague.com/api/'
current_season = requests.get(fpl_base_url+'bootstrap-static/').json()
#json = r.json()
current_season.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [2]:
#create dataframes for the current season dictionary keys for data exploration
#- Contains summary of Gameweek data
events_df = pd.DataFrame(current_season['events']) #
phases_df = pd.DataFrame(current_season['phases']) #Shows calendar months for game weeks
teams_df = pd.DataFrame(current_season['teams'])
players_df = pd.DataFrame(current_season['elements'])
element_stats_df = pd.DataFrame(current_season['element_stats'])
element_types_df = pd.DataFrame(current_season['element_types'])

#Code to extract weekly game data for all active players in current season
for x in players_df.index :
    print(x)
    player_id = players_df.id[x]
    url = f'https://fantasy.premierleague.com/api/element-summary/{player_id}/'
    r = requests.get(url)
    json = r.json()
    json_history_df = pd.DataFrame(json['history'])
    json_history_past_df = pd.DataFrame(json['history_past'])

       
    if x == 0 :
        all_history_df = json_history_df
        all_history_past_df = json_history_past_df
    else : 
        all_history_df = all_history_df.append(json_history_df)
        all_history_past_df = all_history_past_df.append(json_history_past_df)

In [3]:
#Code to save the all players game week data in current and past seasons to csvs
#all_history_df.to_csv('/home/laniolao/fpl/FantasyPremierLeague/current_season.csv')
#all_history_past_df.to_csv('/home/laniolao/fpl/FantasyPremierLeague/past_seasons.csv')

In [4]:
df_currentseason = pd.read_csv('current_season.csv')

df_currentseason.head()

Unnamed: 0.1,Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,...,bps,influence,creativity,threat,ict_index,value,transfers_balance,selected,transfers_in,transfers_out
0,0,1,1,7,0,False,2022-08-05T19:00:00Z,0,2,1,...,0,0.0,0.0,0.0,0.0,45,0,23970,0,0
1,1,1,11,10,0,True,2022-08-13T14:00:00Z,4,2,2,...,0,0.0,0.0,0.0,0.0,44,-5169,24193,1361,6530
2,2,1,21,3,0,False,2022-08-20T16:30:00Z,0,3,3,...,0,0.0,0.0,0.0,0.0,44,-4337,20960,879,5216
3,3,1,31,9,0,True,2022-08-27T16:30:00Z,2,1,4,...,0,0.0,0.0,0.0,0.0,43,-2988,18825,577,3565
4,4,1,41,2,0,True,2022-08-31T18:30:00Z,2,1,5,...,0,0.0,0.0,0.0,0.0,43,-1611,17790,405,2016


In [5]:
df_currentseason.shape #Test Data

(3579, 32)

In [6]:
#get current season fixtures from FPL API endpoint and create Dataframe

current_season_fixtures = requests.get(fpl_base_url+'fixtures/').json()
fixtures_df = pd.DataFrame(current_season_fixtures)
fixtures_df.head()

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id
0,2292871,,False,False,61,,0,False,,8,,1,,[],2,4,74971
1,2292870,,False,False,62,,0,False,,5,,3,,[],3,2,74972
2,2292882,,False,False,73,,0,False,,7,,5,,[],2,3,74983
3,2292872,,False,False,63,,0,False,,14,,7,,[],3,3,74973
4,2292873,,False,False,64,,0,False,,6,,9,,[],3,2,74974


### CLEANING DATA

In [7]:
# Make a copy of the original piece of test data.
df_currentseason_clean = df_currentseason.copy()
players_df_clean = players_df.copy()
fixtures_df_clean = fixtures_df.copy()

In [8]:
#Map the team names and the player positions into the players_df_clean dataframe
teams_now=dict(zip(teams_df.id, teams_df.short_name))
positions=dict(zip(element_types_df.id, element_types_df.singular_name_short))
players_df_clean['club_name'] = players_df_clean['team'].map(teams_now)
players_df_clean['position'] = players_df_clean['element_type'].map(positions)

In [9]:
#Update the club names from abbreviations to full names
players_df_clean["club_name"].replace({'ARS': 'Arsenal', 'AVL': 'Aston Villa', 'BOU': 'Bournemouth', 'BRE': 'Brentford', 'BHA': 'Brighton', 'CHE': 'Chelsea', 'CRY': 'Crystal Palace', 'EVE': 'Everton', 'FUL': 'Fulham',
       'LEI': 'Leicester', 'LEE': 'Leeds', 'LIV': 'Liverpool', 'MCI': 'Man City', 'MUN': 'Man Utd', 'NEW': 'Newcastle', 'NFO': 'Nottingham Forest', 'SOU': 'Southampton', 'TOT': 'Spurs',
       'WHU': 'West Ham', 'WOL': 'Wolves'}, inplace=True)
players_df_clean.club_name

0      Arsenal
1      Arsenal
2      Arsenal
3      Arsenal
4      Arsenal
        ...   
619     Wolves
620     Wolves
621     Wolves
622     Wolves
623     Wolves
Name: club_name, Length: 624, dtype: object

In [10]:
#create the player name feature
players_df_clean['name'] = players_df_clean['first_name'] + ' ' + players_df_clean['second_name']

In [11]:
#Create season_x feature to align with the train data
df_currentseason_clean['season_x'] = df_currentseason_clean.apply(lambda x: "2022-23", axis=1)

In [12]:
df_currentseason_clean.columns

Index(['Unnamed: 0', 'element', 'fixture', 'opponent_team', 'total_points',
       'was_home', 'kickoff_time', 'team_h_score', 'team_a_score', 'round',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out', 'season_x'],
      dtype='object')

In [13]:
players_df_clean[['id', 'first_name', 'second_name', 'name','club_name', 'minutes', 'form',  'bonus', 'bps', 'total_points', 'value_season', 'value_form']].head()

Unnamed: 0,id,first_name,second_name,name,club_name,minutes,form,bonus,bps,total_points,value_season,value_form
0,1,Cédric,Alves Soares,Cédric Alves Soares,Arsenal,0,0.0,0,0,0,0.0,0.0
1,3,Granit,Xhaka,Granit Xhaka,Arsenal,537,4.8,2,113,26,5.2,1.0
2,4,Mohamed,Elneny,Mohamed Elneny,Arsenal,90,0.4,0,15,2,0.5,0.1
3,5,Rob,Holding,Rob Holding,Arsenal,3,0.4,0,8,2,0.5,0.1
4,6,Thomas,Partey,Thomas Partey,Arsenal,270,1.0,0,39,8,1.7,0.2


In [14]:
#Map the team names, player names and form into the all current season data player dataframe
teams_map=dict(zip(players_df_clean.id, players_df_clean.name))
club_map=dict(zip(players_df_clean.id, players_df_clean.club_name))
opp_teams_map=dict(zip(players_df_clean.team, players_df_clean.club_name))
form_map=dict(zip(players_df_clean.id, players_df_clean.form))
position_map=dict(zip(players_df_clean.id, players_df_clean.position))
df_currentseason_clean['name'] = df_currentseason_clean['element'].map(teams_map)
df_currentseason_clean['club_name'] = df_currentseason_clean['element'].map(club_map)
df_currentseason_clean['opp_team_name'] = df_currentseason_clean['opponent_team'].map(opp_teams_map)
df_currentseason_clean['form'] = df_currentseason_clean['element'].map(form_map)
df_currentseason_clean['position'] = df_currentseason_clean['element'].map(form_map)

In [15]:
players_df_clean[['web_name', 'club_name']].head()

Unnamed: 0,web_name,club_name
0,Cédric,Arsenal
1,Xhaka,Arsenal
2,Elneny,Arsenal
3,Holding,Arsenal
4,Partey,Arsenal


In [16]:
df_currentseason_clean.drop(['Unnamed: 0'], axis=1, inplace=True)

In [17]:
play_zero_minutes = df_currentseason_clean[df_currentseason_clean.minutes == 0].index

In [18]:
df_currentseason_clean.drop(play_zero_minutes, axis = 0, inplace=True)

In [19]:
df_currentseason_clean.shape

(1768, 37)

In [20]:
df_currentseason_clean.rename(columns= { 'round': 'GW' }, inplace=True)

In [21]:
df_currentseason_clean.columns

Index(['element', 'fixture', 'opponent_team', 'total_points', 'was_home',
       'kickoff_time', 'team_h_score', 'team_a_score', 'GW', 'minutes',
       'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'value', 'transfers_balance', 'selected',
       'transfers_in', 'transfers_out', 'season_x', 'name', 'club_name',
       'opp_team_name', 'form', 'position'],
      dtype='object')

In [22]:
df_currentseason_clean.kickoff_time

6       2022-08-05T19:00:00Z
7       2022-08-13T14:00:00Z
8       2022-08-20T16:30:00Z
9       2022-08-27T16:30:00Z
10      2022-08-31T18:30:00Z
                ...         
3574    2022-08-20T11:30:00Z
3575    2022-08-28T13:00:00Z
3576    2022-08-31T18:30:00Z
3577    2022-09-03T14:00:00Z
3578    2022-09-03T14:00:00Z
Name: kickoff_time, Length: 1768, dtype: object

In [23]:
df_currentseason_clean['game_date'] = df_currentseason_clean['kickoff_time'].str.replace('T', ' ')
df_currentseason_clean['game_date'] = df_currentseason_clean['game_date'].str.replace(':00Z', '')

In [24]:
df_currentseason_clean['game_date'] = pd.to_datetime(df_currentseason_clean['game_date'])

In [25]:
df_currentseason_clean.game_date

6      2022-08-05 19:00:00
7      2022-08-13 14:00:00
8      2022-08-20 16:30:00
9      2022-08-27 16:30:00
10     2022-08-31 18:30:00
               ...        
3574   2022-08-20 11:30:00
3575   2022-08-28 13:00:00
3576   2022-08-31 18:30:00
3577   2022-09-03 14:00:00
3578   2022-09-03 14:00:00
Name: game_date, Length: 1768, dtype: datetime64[ns]

In [26]:
seasons_curr = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]

month_to_curr_season = dict(zip(range(1,13), seasons_curr))
month_to_curr_season

{1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 3, 7: 3, 8: 3, 9: 4, 10: 4, 11: 4, 12: 1}

In [27]:
df_currentseason_clean['game_weather'] = df_currentseason_clean.game_date.dt.month.map(month_to_curr_season) 

In [28]:
df_currentseason_clean.game_weather.value_counts()

3    1446
4     322
Name: game_weather, dtype: int64

In [29]:
df_currentseason_clean.game_date.value_counts()

2022-09-03 14:00:00    179
2022-08-27 14:00:00    153
2022-08-13 14:00:00    147
2022-08-06 14:00:00    118
2022-08-20 14:00:00    114
2022-08-31 18:30:00     89
2022-08-21 13:00:00     60
2022-08-28 13:00:00     59
2022-08-30 18:30:00     59
2022-08-07 13:00:00     59
2022-08-13 16:30:00     32
2022-08-07 15:30:00     32
2022-08-15 19:00:00     31
2022-08-28 15:30:00     31
2022-08-13 11:30:00     31
2022-09-04 13:00:00     30
2022-08-20 11:30:00     30
2022-09-04 15:30:00     30
2022-08-14 15:30:00     30
2022-08-20 16:30:00     30
2022-08-27 16:30:00     30
2022-08-06 16:30:00     30
2022-08-22 19:00:00     30
2022-08-30 18:45:00     29
2022-09-03 11:30:00     29
2022-08-05 19:00:00     29
2022-08-06 11:30:00     29
2022-08-27 11:30:00     29
2022-08-31 19:00:00     28
2022-08-14 13:00:00     28
2022-08-21 15:30:00     27
2022-09-03 16:30:00     27
2022-09-01 19:00:00     27
2022-08-30 19:00:00     26
2022-08-31 18:45:00     26
Name: game_date, dtype: int64

In [30]:
df_currentseason_clean.game_date.dt.hour.value_counts()

14    711
13    236
18    203
19    171
15    150
16    149
11    148
Name: game_date, dtype: int64

In [31]:
import numpy as np
df_currentseason_clean['start_label'] = np.where((df_currentseason_clean['game_date'].dt.hour) < 13, 0, 1)

In [32]:
df_currentseason_clean.start_label.value_counts()

1    1620
0     148
Name: start_label, dtype: int64

In [33]:
df_currentseason_clean[['game_date', 'start_label']].head(100)

Unnamed: 0,game_date,start_label
6,2022-08-05 19:00:00,1
7,2022-08-13 14:00:00,1
8,2022-08-20 16:30:00,1
9,2022-08-27 16:30:00,1
10,2022-08-31 18:30:00,1
...,...,...
170,2022-08-06 14:00:00,1
171,2022-08-13 11:30:00,0
172,2022-08-20 14:00:00,1
173,2022-08-28 13:00:00,1


In [34]:
df_currentseason_clean.drop(['game_date'], axis=1, inplace=True)

df_currentseason_clean

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,GW,minutes,...,transfers_in,transfers_out,season_x,name,club_name,opp_team_name,form,position,game_weather,start_label
6,3,1,7,2,False,2022-08-05T19:00:00Z,0,2,1,90,...,0,0,2022-23,Granit Xhaka,Arsenal,Crystal Palace,4.8,4.8,3,1
7,3,11,10,12,True,2022-08-13T14:00:00Z,4,2,2,90,...,9001,9630,2022-23,Granit Xhaka,Arsenal,Leicester,4.8,4.8,3,1
8,3,21,3,6,False,2022-08-20T16:30:00Z,0,3,3,87,...,137326,25286,2022-23,Granit Xhaka,Arsenal,Bournemouth,4.8,4.8,3,1
9,3,31,9,2,True,2022-08-27T16:30:00Z,2,1,4,90,...,77459,34699,2022-23,Granit Xhaka,Arsenal,Fulham,4.8,4.8,3,1
10,3,41,2,2,True,2022-08-31T18:30:00Z,2,1,5,90,...,49435,38654,2022-23,Granit Xhaka,Arsenal,Aston Villa,4.8,4.8,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3574,589,29,18,2,False,2022-08-20T11:30:00Z,1,0,3,90,...,6482,640,2022-23,Matheus Luiz Nunes,Wolves,Spurs,4.0,4.0,3,0
3575,589,40,15,2,True,2022-08-28T13:00:00Z,1,1,4,90,...,11496,2756,2022-23,Matheus Luiz Nunes,Wolves,Newcastle,4.0,4.0,3,1
3576,589,42,3,3,False,2022-08-31T18:30:00Z,0,0,5,74,...,6340,2638,2022-23,Matheus Luiz Nunes,Wolves,Bournemouth,4.0,4.0,3,1
3577,589,60,17,9,True,2022-09-03T14:00:00Z,1,0,6,90,...,3796,3282,2022-23,Matheus Luiz Nunes,Wolves,Southampton,4.0,4.0,4,1


In [35]:
df_currentseason_clean.drop(['opponent_team', 'fixture', 'kickoff_time'], axis=1, inplace=True)

df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 6 to 3578
Data columns (total 36 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   element            1768 non-null   int64  
 1   total_points       1768 non-null   int64  
 2   was_home           1768 non-null   bool   
 3   team_h_score       1768 non-null   int64  
 4   team_a_score       1768 non-null   int64  
 5   GW                 1768 non-null   int64  
 6   minutes            1768 non-null   int64  
 7   goals_scored       1768 non-null   int64  
 8   assists            1768 non-null   int64  
 9   clean_sheets       1768 non-null   int64  
 10  goals_conceded     1768 non-null   int64  
 11  own_goals          1768 non-null   int64  
 12  penalties_saved    1768 non-null   int64  
 13  penalties_missed   1768 non-null   int64  
 14  yellow_cards       1768 non-null   int64  
 15  red_cards          1768 non-null   int64  
 16  saves              1768 

In [36]:
df_currentseason_clean.position = df_currentseason_clean.position.astype(str)
df_currentseason_clean.opp_team_name = df_currentseason_clean.opp_team_name.astype(str)
df_currentseason_clean.club_name = df_currentseason_clean.club_name.astype(str)
df_currentseason_clean.name = df_currentseason_clean.name.astype(str)
df_currentseason_clean.was_home = df_currentseason_clean.was_home.astype(str)
df_currentseason_clean.form = df_currentseason_clean.form.astype(float)

In [37]:
df_currentseason_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 6 to 3578
Data columns (total 36 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   element            1768 non-null   int64  
 1   total_points       1768 non-null   int64  
 2   was_home           1768 non-null   object 
 3   team_h_score       1768 non-null   int64  
 4   team_a_score       1768 non-null   int64  
 5   GW                 1768 non-null   int64  
 6   minutes            1768 non-null   int64  
 7   goals_scored       1768 non-null   int64  
 8   assists            1768 non-null   int64  
 9   clean_sheets       1768 non-null   int64  
 10  goals_conceded     1768 non-null   int64  
 11  own_goals          1768 non-null   int64  
 12  penalties_saved    1768 non-null   int64  
 13  penalties_missed   1768 non-null   int64  
 14  yellow_cards       1768 non-null   int64  
 15  red_cards          1768 non-null   int64  
 16  saves              1768 

In [38]:
#Assign seaso_x 
df_currentseason_clean.set_index('season_x', inplace=True)

df_currentseason_clean

Unnamed: 0_level_0,element,total_points,was_home,team_h_score,team_a_score,GW,minutes,goals_scored,assists,clean_sheets,...,selected,transfers_in,transfers_out,name,club_name,opp_team_name,form,position,game_weather,start_label
season_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-23,3,2,False,0,2,1,90,0,0,1,...,48303,0,0,Granit Xhaka,Arsenal,Crystal Palace,4.8,4.8,3,1
2022-23,3,12,True,4,2,2,90,1,1,0,...,65418,9001,9630,Granit Xhaka,Arsenal,Leicester,4.8,4.8,3,1
2022-23,3,6,False,0,3,3,87,0,1,1,...,216726,137326,25286,Granit Xhaka,Arsenal,Bournemouth,4.8,4.8,3,1
2022-23,3,2,True,2,1,4,90,0,0,0,...,267951,77459,34699,Granit Xhaka,Arsenal,Fulham,4.8,4.8,3,1
2022-23,3,2,True,2,1,5,90,0,0,0,...,288460,49435,38654,Granit Xhaka,Arsenal,Aston Villa,4.8,4.8,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-23,589,2,False,1,0,3,90,0,0,0,...,6801,6482,640,Matheus Luiz Nunes,Wolves,Spurs,4.0,4.0,3,0
2022-23,589,2,True,1,1,4,90,0,0,0,...,16866,11496,2756,Matheus Luiz Nunes,Wolves,Newcastle,4.0,4.0,3,1
2022-23,589,3,False,0,0,5,74,0,0,1,...,21246,6340,2638,Matheus Luiz Nunes,Wolves,Bournemouth,4.0,4.0,3,1
2022-23,589,9,True,1,0,6,90,0,1,1,...,22523,3796,3282,Matheus Luiz Nunes,Wolves,Southampton,4.0,4.0,4,1


In [39]:
df_currentseason_clean[['name', 'club_name', 'element', 'opp_team_name', 'form', 'total_points', 'team_h_score', 'team_a_score', 'GW', 'was_home']].head()

Unnamed: 0_level_0,name,club_name,element,opp_team_name,form,total_points,team_h_score,team_a_score,GW,was_home
season_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-23,Granit Xhaka,Arsenal,3,Crystal Palace,4.8,2,0,2,1,False
2022-23,Granit Xhaka,Arsenal,3,Leicester,4.8,12,4,2,2,True
2022-23,Granit Xhaka,Arsenal,3,Bournemouth,4.8,6,0,3,3,False
2022-23,Granit Xhaka,Arsenal,3,Fulham,4.8,2,2,1,4,True
2022-23,Granit Xhaka,Arsenal,3,Aston Villa,4.8,2,2,1,5,True


In [40]:
players_df_clean.club_name

0      Arsenal
1      Arsenal
2      Arsenal
3      Arsenal
4      Arsenal
        ...   
619     Wolves
620     Wolves
621     Wolves
622     Wolves
623     Wolves
Name: club_name, Length: 624, dtype: object

In [41]:
#Cleaning the current season dataframes (null values, time series operations for dates)
players_df_clean.chance_of_playing_next_round = players_df_clean.chance_of_playing_next_round.fillna(100.0)
players_df_clean.chance_of_playing_this_round = players_df_clean.chance_of_playing_this_round.fillna(100.0)
players_df_clean.corners_and_indirect_freekicks_order = players_df_clean.corners_and_indirect_freekicks_order.fillna(0)
players_df_clean.direct_freekicks_order = players_df_clean.direct_freekicks_order.fillna(0)
players_df_clean.penalties_order = players_df_clean.penalties_order.fillna(0)
players_df_clean.drop(['id', 'squad_number'], axis=1, inplace=True)
fixtures_df_clean.team_a_score = fixtures_df_clean.team_a_score.fillna(0)
fixtures_df_clean.team_h_score = fixtures_df_clean.team_h_score.fillna(0)
fixtures_df_clean.drop(['stats', 'id'], axis=1, inplace=True)
fixtures_df_clean['kickoff_time'] = fixtures_df_clean['kickoff_time'].str[:-10]
fixtures_df_clean['kickoff_time'] = pd.to_datetime(fixtures_df_clean['kickoff_time'])

In [42]:
#Create Next Game week fixtures
GWfixtures_df = fixtures_df_clean.loc[(fixtures_df_clean['event'] == 8)].copy()
GWfixtures_df.shape

(9, 15)

In [43]:
#Mapping away and home team and their respective strengths
home_strength= dict(zip(teams_df.id, teams_df.strength_overall_home))
away_strength= dict(zip(teams_df.id, teams_df.strength_overall_away))
GWfixtures_df['away_team'] = GWfixtures_df['team_a'].map(teams_now)
GWfixtures_df['home_team'] = GWfixtures_df['team_h'].map(teams_now)
GWfixtures_df['away_team_strength'] = GWfixtures_df['team_a'].map(away_strength)
GWfixtures_df['home_team_strength'] = GWfixtures_df['team_h'].map(home_strength)
GWfixtures_df.head(10)

Unnamed: 0,code,event,finished,finished_provisional,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,away_team_strength,home_team_strength
71,2292880,8.0,False,False,2022-09-16,0,False,False,17,0.0,2,0.0,2,2,74981,SOU,AVL,1100,1090
72,2292887,8.0,False,False,2022-09-16,0,False,False,9,0.0,16,0.0,2,2,74988,FUL,NFO,1090,1045
73,2292889,8.0,False,False,2022-09-17,0,False,False,13,0.0,20,0.0,5,2,74990,MCI,WOL,1370,1100
74,2292886,8.0,False,False,2022-09-17,0,False,False,3,0.0,15,0.0,2,3,74987,BOU,NEW,1070,1110
75,2292888,8.0,False,False,2022-09-17,0,False,False,10,0.0,18,0.0,2,4,74989,LEI,TOT,1100,1210
76,2292881,8.0,False,False,2022-09-18,0,False,False,1,0.0,4,0.0,3,3,74982,ARS,BRE,1270,1100
77,2292884,8.0,False,False,2022-09-18,0,False,False,19,0.0,8,0.0,2,2,74985,WHU,EVE,1150,1080
78,2292885,8.0,False,False,2022-09-18,0,False,False,11,0.0,14,0.0,2,4,74986,LEE,MUN,1100,1150
79,2292883,8.0,False,False,2022-09-18,0,False,False,12,0.0,6,0.0,4,4,74984,LIV,CHE,1350,1210


In [44]:
GWfixtures_df.columns

Index(['code', 'event', 'finished', 'finished_provisional', 'kickoff_time',
       'minutes', 'provisional_start_time', 'started', 'team_a',
       'team_a_score', 'team_h', 'team_h_score', 'team_h_difficulty',
       'team_a_difficulty', 'pulse_id', 'away_team', 'home_team',
       'away_team_strength', 'home_team_strength'],
      dtype='object')

In [45]:
players_df_clean.columns

Index(['chance_of_playing_next_round', 'chance_of_playing_this_round', 'code',
       'cost_change_event', 'cost_change_event_fall', 'cost_change_start',
       'cost_change_start_fall', 'dreamteam_count', 'element_type', 'ep_next',
       'ep_this', 'event_points', 'first_name', 'form', 'in_dreamteam', 'news',
       'news_added', 'now_cost', 'photo', 'points_per_game', 'second_name',
       'selected_by_percent', 'special', 'status', 'team', 'team_code',
       'total_points', 'transfers_in', 'transfers_in_event', 'transfers_out',
       'transfers_out_event', 'value_form', 'value_season', 'web_name',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'influence_rank', 'influence_rank_type',
       'creativity_rank', 'creativity_rank_type', 'threat_rank',
       'threat_rank_type', 

In [46]:
#Create features for data analysis (player-opposition team, game week fixture difficulty index, player's club )
gw_away_players = pd.merge(players_df_clean, GWfixtures_df, how="inner", left_on=["team"], right_on=["team_a"])
gw_away_players['player_opp'] = gw_away_players['web_name'].map(str) + '-' + gw_away_players['home_team'].map(str)
gw_home_players = pd.merge(players_df_clean, GWfixtures_df, how="inner", left_on=["team"], right_on=["team_h"])
gw_home_players['player_opp'] = gw_home_players['web_name'].map(str) + '-' + gw_home_players['away_team'].map(str)
gw_away_players['diff'] = gw_away_players['away_team_strength'] - gw_away_players['home_team_strength']
gw_home_players['diff'] = gw_home_players['home_team_strength'] - gw_home_players['away_team_strength']
players_df_clean = gw_away_players.append(gw_home_players)
players_df_clean['player_club'] = players_df_clean['web_name'].map(str) + '-' + players_df_clean['club_name'].map(str)
players_df_clean.drop(['minutes_y'], axis=1, inplace=True)
players_df_clean.rename(columns = {'minutes_x':'minutes'}, inplace = True)
players_df_clean['ave_minutes'] = players_df_clean['minutes']/6