This file consists of python code to load data from the mysql database to 
pandas dataframes and perform operations to clean the data before passing it on to the feature extraction.

Installing Necessary packages

Importing Libraries and Establishing connection with the database

In [430]:
import pandas as pd
from sqlalchemy import create_engine, inspect
import os



Establishing connection with the database

In [431]:
# Database connection parameters
user = os.getenv('FPL_DB_USER')
password = os.getenv('FPL_DB_PASSWORD')
database = os.getenv('FPL_DB_NAME')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')

# Creating connection
#   Format: mysql+mysqlconnector://user:password@host:port/database
connection_string = f'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}'

# Create engine
engine = create_engine(connection_string)


Importing MySQL Data to Python Dataframes

In [432]:
dfs={}

try:
    inspector = inspect(engine)
    # Get all table names from the database
    table_names = inspector.get_table_names()
    print(f"Found {len(table_names)} tables in the database.")
    print("==Tables in the database:==\n",
          table_names,
          "\n\nStarting to import...")

    for table in table_names:
        dfs[table] = pd.read_sql_table(table, engine)
        print(f"Table '{table}' imported with {len(dfs[table])} records.")
    
    print("All tables imported with data successfully.")

except Exception as e:
    print(f"Error occurred during import: {e}")
finally:
    engine.dispose()
    print("Database connection closed.")

Found 11 tables in the database.
==Tables in the database:==
 ['fact_player_gameweeks', 'fpl_fixtures', 'fpl_player_gameweeks', 'fpl_season_players', 'fpl_season_teams', 'player_history', 'players', 'positions', 'teams', 'understat_roster_metrics', 'understat_team_metrics'] 

Starting to import...
Table 'fact_player_gameweeks' imported with 17361 records.
Table 'fpl_fixtures' imported with 2660 records.
Table 'fpl_player_gameweeks' imported with 215399 records.
Table 'fpl_season_players' imported with 6517 records.
Table 'fpl_season_teams' imported with 120 records.
Table 'player_history' imported with 17361 records.
Table 'players' imported with 802 records.
Table 'positions' imported with 4 records.
Table 'teams' imported with 20 records.
Table 'understat_roster_metrics' imported with 96091 records.
Table 'understat_team_metrics' imported with 3420 records.
All tables imported with data successfully.
Database connection closed.


In [433]:
# Show all tables in dfs
print("Tables loaded into dataframes:", list(dfs.keys()))

Tables loaded into dataframes: ['fact_player_gameweeks', 'fpl_fixtures', 'fpl_player_gameweeks', 'fpl_season_players', 'fpl_season_teams', 'player_history', 'players', 'positions', 'teams', 'understat_roster_metrics', 'understat_team_metrics']


Moving data to individual variables

In [434]:
fact_player_gameweeks_df = dfs['fact_player_gameweeks']
fact_player_gameweeks_df.head()


Unnamed: 0,player_id,player_name,team_id,team_name,event,minutes,total_points,value,goals_scored,assists,clean_sheets,goals_conceded,yellow_cards,red_cards,own_goals,opponent_id,opponent_name,home_away
0,1,David Raya Martín,1,Arsenal,1.0,90.0,10.0,55.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,14.0,Man Utd,A
1,1,David Raya Martín,1,Arsenal,2.0,90.0,6.0,55.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,11.0,Leeds,H
2,1,David Raya Martín,1,Arsenal,3.0,90.0,2.0,55.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0,Liverpool,A
3,1,David Raya Martín,1,Arsenal,4.0,90.0,6.0,55.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16.0,Nott'm Forest,H
4,1,David Raya Martín,1,Arsenal,5.0,90.0,2.0,55.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,13.0,Man City,H


In [435]:

fpl_fixtures_df = dfs['fpl_fixtures']
fpl_fixtures_df.head()


Unnamed: 0,season,fixture_id,event,team_h,team_a,team_h_score,team_a_score,finished,kickoff_time
0,2018-19,1,1,1,13,0,2,1,2018-08-12T15:00:00Z
1,2018-19,2,1,2,5,2,0,1,2018-08-11T14:00:00Z
2,2018-19,3,1,9,7,0,2,1,2018-08-11T14:00:00Z
3,2018-19,4,1,10,6,0,3,1,2018-08-11T14:00:00Z
4,2018-19,5,1,12,19,4,0,1,2018-08-12T12:30:00Z


In [436]:

fpl_season_players_df = dfs['fpl_season_players']
fpl_season_players_df.head()


Unnamed: 0,season,element_id,first_name,second_name,team_id,element_type,total_points,now_cost
0,2016-17,1,David,Ospina,1,1,2,47
1,2016-17,2,Petr,Cech,1,1,134,54
2,2016-17,3,Laurent,Koscielny,1,2,121,61
3,2016-17,4,Per,Mertesacker,1,2,1,48
4,2016-17,5,Gabriel Armando,de Abreu,1,2,45,48


In [437]:

fpl_season_teams_df = dfs['fpl_season_teams']
fpl_season_teams_df.head()


Unnamed: 0,season,team_id,team_name,short_name,strength,strength_overall_home,strength_overall_away,strength_attack_home,strength_attack_away,strength_defence_home,strength_defence_away
0,2019-20,1,Arsenal,ARS,4,1180,1240,1170,1170,1150,1200
1,2019-20,2,Aston Villa,AVL,2,1020,1050,970,980,1000,1040
2,2019-20,3,Bournemouth,BOU,2,1020,1020,990,1030,1000,1050
3,2019-20,4,Brighton,BHA,2,1050,1010,1100,1100,1040,1030
4,2019-20,5,Burnley,BUR,3,1110,1180,1130,1070,970,1110


In [438]:
player_history_df = dfs.get('player_history')
player_history_df.head()

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,modified,...,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,value,transfers_balance,selected,transfers_in,transfers_out,player_id
0,1,9,14,10,0,2025-08-17T15:30:00Z,0.0,1.0,1,0,...,0.0,0.0,0.0,1.52,55,0,1531911,0,0,1
1,1,11,11,6,1,2025-08-23T16:30:00Z,5.0,0.0,2,0,...,0.0,0.0,0.0,0.17,55,218659,2284634,277339,58680,1
2,1,25,12,2,0,2025-08-31T15:30:00Z,1.0,0.0,3,0,...,0.0,0.02,0.02,0.52,55,-12311,2406964,146739,159050,1
3,1,31,16,6,1,2025-09-13T11:30:00Z,3.0,0.0,4,0,...,0.0,0.0,0.0,0.2,55,171289,2765759,289041,117752,1
4,1,41,13,2,1,2025-09-21T15:30:00Z,1.0,1.0,5,0,...,0.0,0.01,0.01,0.89,55,-9786,2762632,98100,107886,1


In [439]:
players_df = dfs.get('players')
players_df.head()

Unnamed: 0,can_transact,can_select,chance_of_playing_next_round,chance_of_playing_this_round,code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,dreamteam_count,...,points_per_game_rank_type,selected_rank,selected_rank_type,starts_per_90,clean_sheets_per_90,defensive_contribution_per_90,scout_risks.0.property,scout_risks.0.notes,scout_risks.0.gameweek,scout_risks.0.url
0,1,1,,,154561,0,0,4,-4,1,...,4,6,1,1.0,0.5,0.0,,,,
1,1,1,,,109745,0,0,-4,4,0,...,81,271,37,0.0,0.0,0.0,,,,
2,1,0,0.0,0.0,463748,0,0,0,0,0,...,71,359,52,0.0,0.0,0.0,,,,
3,1,1,,,551221,0,0,-1,1,0,...,56,343,48,0.0,0.0,0.0,,,,
4,1,1,100.0,100.0,226597,0,0,9,-9,4,...,1,5,2,1.0,0.66,8.63,,,,


In [440]:
positions_df = dfs.get('positions')
positions_df.head()

Unnamed: 0,id,plural_name,plural_name_short,singular_name,singular_name_short,squad_select,squad_min_select,squad_max_select,squad_min_play,squad_max_play,ui_shirt_specific,sub_positions_locked.0,element_count
0,1,Goalkeepers,GKP,Goalkeeper,GKP,2,,,1,1,1,12.0,91
1,2,Defenders,DEF,Defender,DEF,5,,,3,5,0,,262
2,3,Midfielders,MID,Midfielder,MID,5,,,2,5,0,,361
3,4,Forwards,FWD,Forward,FWD,3,,,1,3,0,,88


In [441]:
teams_df = dfs.get('teams')
teams_df.head()

Unnamed: 0,code,draw,form,id,loss,name,played,points,position,short_name,...,team_division,unavailable,win,strength_overall_home,strength_overall_away,strength_attack_home,strength_attack_away,strength_defence_home,strength_defence_away,pulse_id
0,3,0,,1,0,Arsenal,0,0,1,ARS,...,,0,0,1300,1375,1340,1400,1260,1350,1
1,7,0,,2,0,Aston Villa,0,0,3,AVL,...,,0,0,1145,1185,1150,1170,1140,1200,2
2,90,0,,3,0,Burnley,0,0,19,BUR,...,,0,0,1055,1095,1010,1090,1100,1100,43
3,91,0,,4,0,Bournemouth,0,0,13,BOU,...,,0,0,1150,1220,1100,1240,1200,1200,127
4,94,0,,5,0,Brentford,0,0,8,BRE,...,,0,0,1135,1175,1100,1110,1170,1240,130


In [442]:

understat_roster_metrics_df = dfs['understat_roster_metrics']
understat_roster_metrics_df.head()


Unnamed: 0,id,goals,own_goals,shots,xg,time,player_id,team_id,position,player,...,red_card,roster_in,roster_out,key_passes,assists,xa,xgchain,xgbuildup,positionorder,match_link
0,57081,0,0,0,0.0,90,491,83,GK,Petr Cech,...,0,0,0,0,0,0.0,0.431844,0.431844,1,https://understat.com/match/88
1,57082,0,0,1,0.028445,69,967,83,DR,Mathieu Debuchy,...,0,57092,0,1,0,0.017912,0.191671,0.163225,2,https://understat.com/match/88
2,57083,0,0,1,0.017912,90,494,83,DC,Laurent Koscielny,...,0,0,0,0,0,0.0,0.097953,0.097953,3,https://understat.com/match/88
3,57084,0,0,1,0.08039,90,507,83,DC,Per Mertesacker,...,0,0,0,0,0,0.0,0.231735,0.231735,3,https://understat.com/match/88
4,57085,0,0,0,0.0,90,495,83,DL,Nacho Monreal,...,0,0,0,0,0,0.0,0.613265,0.613265,4,https://understat.com/match/88


In [443]:

understat_team_metrics_df = dfs['understat_team_metrics']
understat_team_metrics_df.head()

Unnamed: 0,id,fid,h,a,date,league_id,season,h_goals,a_goals,team_h,...,h_l,league,h_shot,a_shot,h_shotontarget,a_shotontarget,h_deep,a_deep,a_ppda,h_ppda
0,81,958431,89,82,2015-08-08 15:45:00,1,2015,1,0,Manchester United,...,0.3236,EPL,9,9,1,4,4,10,8.2188,13.8261
1,82,958427,73,71,2015-08-08 18:00:00,1,2015,0,1,Bournemouth,...,0.2958,EPL,11,7,2,3,11,2,11.8462,6.9
2,83,958429,72,90,2015-08-08 18:00:00,1,2015,2,2,Everton,...,0.2675,EPL,10,11,5,5,5,4,17.1579,6.65
3,84,958430,75,77,2015-08-08 18:00:00,1,2015,4,2,Leicester,...,0.1521,EPL,19,11,8,5,5,6,9.5556,10.88
4,85,958433,79,78,2015-08-08 18:00:00,1,2015,1,3,Norwich,...,0.638,EPL,17,11,6,7,5,10,10.625,5.7368


In [444]:
# print("========\nfact_player_gameweeks_df\n",fact_player_gameweeks_df.info(),"\n\n")
# print("========\nfpl_fixtures_df\n",fpl_fixtures_df.info(),"\n\n")
# print("========\nfpl_season_players_df\n",fpl_season_players_df.info(),"\n\n")
# print("========\nfpl_season_teams_df\n",fpl_season_teams_df.info(),"\n\n")
# print("========\nplayer_history_df\n",player_history_df.info(),"\n\n")
# print("========\nplayers_df\n",players_df.info(),"\n\n")
# print("========\npositions_df\n",positions_df.info(),"\n\n")
# print("========\nteams_df\n",teams_df.info(),"\n\n")
# print("========\nunderstat_roster_metrics_df\n",understat_roster_metrics_df.info(),"\n\n")
# print("========\nunderstat_team_metrics_df\n",understat_team_metrics_df.info())

# Cleaning Positions Data

In [445]:
# Renaming columns for better clarity
positions_df = positions_df.rename(columns={
    'id':'position_id',
    'singular_name':'position_name',
    'singular_name_short':'position_short_name',
    'sqaud_select':'squad_capacity', # Total allowed in 15-man squad
    'squad_min_play':'min_starting_size',
    'sqaud_max_play':'max_starting_size',
    'ui_shirt_specific':'is_gk_shirt'  # To be converted to boolean
})

# Type conversions
positions_df['is_gk_shirt'] = positions_df['is_gk_shirt'].astype(bool)


#sorted(positions_df.columns)
# Reordering
# positions_df = positions_df[[
#     'position_id', 'position_name', 'position_short_name', 
#     'squad_capacity', 'min_starting_size', 'max_starting_size', 
#     'element_count', 'is_gk_shirt'
# ]]

# Cleaning fact_player_gameweeks_df
A fact table.
Each row in this table represents a unique event.

In [446]:
# Normalize values
fact_player_gameweeks_df['value'] = fact_player_gameweeks_df['value'] / 10.0

# Standardize column names across dataframes 
count_cols = [
    'event',
    'minutes',
    'total_points',
    'goals_scored', 
    'assists', 
    'clean_sheets', 
    'goals_conceded', 
    'yellow_cards', 
    'red_cards', 
    'own_goals', 
    'opponent_id'
]
fact_player_gameweeks_df[count_cols] = fact_player_gameweeks_df[count_cols].fillna(0).astype(int)

# Sorting by time
fact_player_gameweeks_df = fact_player_gameweeks_df.sort_values(by = ['player_id','event'])


# Encoding location
fact_player_gameweeks_df['is_home'] = fact_player_gameweeks_df['home_away'].apply(lambda x : 1 if x == 'H' else 0)

# Resetting index
#  After sorting the index becomes scrambled. It is reset to keep the dataframe clean
fact_player_gameweeks_df = fact_player_gameweeks_df.reset_index(drop=True)


# Cleaning fpl_fixtures_df

In [447]:
# Converring kickoff_time to datetime
fpl_fixtures_df['kickoff_time'] = pd.to_datetime(fpl_fixtures_df['kickoff_time'])

# Handling missing values
#  For modeling we often fill missing scores with -1 to differentiate from a 0-0 draw
fpl_fixtures_df['team_h_score'] = fpl_fixtures_df['team_h_score'].fillna(-1).astype(int)
fpl_fixtures_df['team_a_score'] = fpl_fixtures_df['team_a_score'].fillna(-1).astype(int)

# Extracting time features
fpl_fixtures_df['kickoff_hour'] = fpl_fixtures_df['kickoff_time'].dt.hour
fpl_fixtures_df['kickoff_dayofweek'] = fpl_fixtures_df['kickoff_time'].dt.day_name()

# Creating a Match Result column
# We get the match result to the model
# H  -> Home Win, A -> Away Win, D -> Draw, U -> Unplayed
def get_match_result(row):
    if not row['finished']:
        return 'U' 
    if row['team_h_score'] > row['team_a_score']:
        return 'H' 
    elif row['team_h_score'] < row['team_a_score']:
        return 'A' 
    else: return 'D' 

fpl_fixtures_df['result'] = fpl_fixtures_df.apply(get_match_result, axis=1)

# Consistency check (sorting)
fpl_fixtures_df = fpl_fixtures_df.sort_values(by=['event','kickoff_time']).reset_index(drop=True)

# Cleaning fpl_season_players_df

In [448]:
# Concatenating name
fpl_season_players_df['full_name'] = fpl_season_players_df['first_name'] + " " + fpl_season_players_df['second_name']

# Nomralizing price
fpl_season_players_df['now_cost'] = fpl_season_players_df['now_cost'] / 10.0

# Mapping positions
position_map = positions_df.set_index('position_id')['position_name'].to_dict()
fpl_season_players_df['position'] = fpl_season_players_df['element_type'].map(position_map)

# Extracting year
fpl_season_players_df['season_start'] = fpl_season_players_df['season'].str.split('-').str[0].astype(int)

# 
cols_to_keep = [
    'season', 
    'season_start', 
    'element_id', 
    'full_name', 
    'team_id', 
    'position', 
    'total_points', 
    'now_cost'
]

fpl_season_players_df = fpl_season_players_df[cols_to_keep]

# fpl_season_players_df.head()


# Cleaning fpl_season_teams_df

In [449]:
# Renaming for clarity
fpl_season_teams_df = fpl_season_teams_df.rename(columns={
    'strength': 'fpl_difficulty_rating'
})

# Aligning Season format
# To get numeric year
fpl_season_teams_df['season_start'] = fpl_season_teams_df['season'].str.split('-').str[0].astype(int)

# Calculating Home/Away advantage
# Some teams are stroger when they are home than away
# Useful for a 'home_advantage_bias'. Some people perform better at their home stadium
fpl_season_teams_df['home_advantage_bias'] = fpl_season_teams_df['strength_overall_home'] - fpl_season_teams_df['strength_overall_away']

# Data type conversion
strength_cols = [col for col in fpl_season_teams_df.columns if 'strength' in col]
fpl_season_teams_df[strength_cols] = fpl_season_teams_df[strength_cols].astype(int)

fpl_season_teams_df.head()

Unnamed: 0,season,team_id,team_name,short_name,fpl_difficulty_rating,strength_overall_home,strength_overall_away,strength_attack_home,strength_attack_away,strength_defence_home,strength_defence_away,season_start,home_advantage_bias
0,2019-20,1,Arsenal,ARS,4,1180,1240,1170,1170,1150,1200,2019,-60
1,2019-20,2,Aston Villa,AVL,2,1020,1050,970,980,1000,1040,2019,-30
2,2019-20,3,Bournemouth,BOU,2,1020,1020,990,1030,1000,1050,2019,0
3,2019-20,4,Brighton,BHA,2,1050,1010,1100,1100,1040,1030,2019,40
4,2019-20,5,Burnley,BUR,3,1110,1180,1130,1070,970,1110,2019,-70


# Cleaning player_history_df
This table contans information about individual match performance of every player.

## Changes made
- Renaming columns




In [450]:
# Checking data types
# player_history_df.info()

# Checking columns and values.

# Find the columns with only one unique value. 
# These columns can be dropped as they do not provide any useful information.
constant_columns = [col for col in player_history_df.columns if player_history_df[col].nunique() <= 1]

# Checking ranges of data
# To confirm if we have full
# print(f"Data Range: {player_history_df['kickoff_time'].min()} to {player_history_df['kickoff_time'].max()}")


# Dropping Irrelevant columns
player_history_df = player_history_df.drop(columns=constant_columns)


# Renaming columns for better clarity
player_history_df = player_history_df.rename(columns={
    'team_h_score':'team_home_score',
    'team_a_score':'team_away_score',
    'bps':'bonus_points_system_score',
    'ict_index':'influence_creativity_threat_index'
})


# Type Conversion

# Converting 'kickoff_time' to datetime
player_history_df['kickoff_time'] = pd.to_datetime(player_history_df['kickoff_time'])

# Boolean conversion: Columns with only two values (0 and 1) can be converted to boolean type
# Converting `was_home`
player_history_df['was_home'] = player_history_df['was_home'].astype(bool)
# Converting `started`
player_history_df['starts'] = player_history_df['starts'].astype(bool)


# list all columns in ascending order
sorted(player_history_df.columns)


# Getting which team won


def get_player_team_score(row):
    return row['team_home_score'] if row['was_home'] else row['team_away_score']

def get_opponent_team_score(row):
    return row['team_away_score'] if row['was_home'] else row['team_home_score']

# Create result column (W/D/L) based on team and opponent scores
def get_match_result(row):
    player_score = get_player_team_score(row)
    opponent_score = get_opponent_team_score(row)

    if player_score > opponent_score:
        return 'W'  # Win
    elif player_score < opponent_score:
        return 'L'  # Loss
    else:
        return 'D'  # Draw

player_history_df['match_result'] = player_history_df.apply(get_match_result, axis=1)

# player_history_df.info()
player_history_df.head()

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_home_score,team_away_score,round,minutes,...,expected_assists,expected_goal_involvements,expected_goals_conceded,value,transfers_balance,selected,transfers_in,transfers_out,player_id,match_result
0,1,9,14,10,False,2025-08-17 15:30:00+00:00,0.0,1.0,1,90,...,0.0,0.0,1.52,55,0,1531911,0,0,1,W
1,1,11,11,6,True,2025-08-23 16:30:00+00:00,5.0,0.0,2,90,...,0.0,0.0,0.17,55,218659,2284634,277339,58680,1,W
2,1,25,12,2,False,2025-08-31 15:30:00+00:00,1.0,0.0,3,90,...,0.02,0.02,0.52,55,-12311,2406964,146739,159050,1,L
3,1,31,16,6,True,2025-09-13 11:30:00+00:00,3.0,0.0,4,90,...,0.0,0.0,0.2,55,171289,2765759,289041,117752,1,W
4,1,41,13,2,True,2025-09-21 15:30:00+00:00,1.0,1.0,5,90,...,0.01,0.01,0.89,55,-9786,2762632,98100,107886,1,D


# Cleaning player_df

Contains the bio, current statues, prices and performance of each player.
Many columns present.

In [451]:
sorted(players_df.columns)

['assists',
 'birth_date',
 'bonus',
 'bps',
 'can_select',
 'can_transact',
 'chance_of_playing_next_round',
 'chance_of_playing_this_round',
 'clean_sheets',
 'clean_sheets_per_90',
 'clearances_blocks_interceptions',
 'code',
 'corners_and_indirect_freekicks_order',
 'corners_and_indirect_freekicks_text',
 'cost_change_event',
 'cost_change_event_fall',
 'cost_change_start',
 'cost_change_start_fall',
 'creativity',
 'creativity_rank',
 'creativity_rank_type',
 'defensive_contribution',
 'defensive_contribution_per_90',
 'direct_freekicks_order',
 'direct_freekicks_text',
 'dreamteam_count',
 'element_type',
 'ep_next',
 'ep_this',
 'event_points',
 'expected_assists',
 'expected_assists_per_90',
 'expected_goal_involvements',
 'expected_goal_involvements_per_90',
 'expected_goals',
 'expected_goals_conceded',
 'expected_goals_conceded_per_90',
 'expected_goals_per_90',
 'first_name',
 'form',
 'form_rank',
 'form_rank_type',
 'goals_conceded',
 'goals_conceded_per_90',
 'goals_scor

In [453]:
# Renaming columns for better clarity
players_df = players_df.rename(columns={
    'now_cost': 'price',
    'element_type': 'position_id', # This is to be mapped
    'team': 'team_id',
    'web_name': 'player_name'
})

# Data type conversion and normalization
#  `price`
players_df['price'] = players_df['price'] / 10.0  # Convert to actual price in millions
# `selected_by_percent`
players_df['selected_by_percent'] = players_df['selected_by_percent'].astype(float)

# Mapping IDs to Descriptive Names
#  Use mapping tables to replace numeric IDs with actual team and position names
position_map = positions_df.set_index('position_id')['position_name'].to_dict()
# print(position_map)
players_df['position'] = players_df['position_id'].map(position_map)

team_map = teams_df.set_index('id')['name'].to_dict()
# print(team_map)
players_df['team_name'] = players_df['team_id'].map(team_map)


# Feature Selection
#  Select only relavent columns from the player_df
columns_to_keep = [
    'player_name', 
    'team_name',
    'position',
    'price', 
    'total_points', 
    'points_per_game',
    'form', 
    'status', 
    'chance_of_playing_next_round', 
    'selected_by_percent',
    'minutes', 
    'goals_scored', 
    'assists', 
    'clean_sheets', 
    'bonus', 
    'bps',
    'ict_index', 'expected_goals', 
    'expected_assists', 
    'expected_goal_involvements', 
    'expected_goals_conceded']

players_df = players_df[columns_to_keep].copy()

# Cleaning teams_df

In [454]:
# Renaming columns
teams_df = teams_df.rename(columns={
    'id':'team_id',
    'name':'team_name',
    'short_name':'team_short_name',
    'strength':'overall_strength_rating'
})

# Add a full_name column by mapping the team id with a string
team_full_name_map = {
    1: 'Arsenal',
    2: 'Aston Villa',
    3: 'Bournemouth',
    4: 'Brentford',
    5: 'Brighton & Hove Albion',
    6: 'Burnley',
    7: 'Chelsea',
    8: 'Crystal Palace',
    9: 'Everton',
    10: 'Fulham',
    11: 'Leeds United',
    12: 'Leicester City',
    13: 'Liverpool',
    14: 'Manchester City',
    15: 'Manchester United',
    16: 'Newcastle United',
    17: 'Nottingham Forest',
    18: 'Southampton',
    19: 'Tottenham Hotspur',
    20: 'West Ham United',
    21: 'Wolverhampton Wanderers'
}

teams_df['team_full_name'] = teams_df['team_id'].map(team_full_name_map)

# Selecting features
teams_df = teams_df[[
    'team_id', 
    'team_name', 
    'team_short_name', 
    'team_full_name',
    'overall_strength_rating',
    'strength_overall_home', 
    'strength_overall_away',
    'strength_attack_home', 
    'strength_attack_away',
    'strength_defence_home', 
    'strength_defence_away'
]]


# Cleaning understat_roster_metrics_df

In [455]:
# Extracting match Id
understat_roster_metrics_df['understat_match'] = understat_roster_metrics_df['match_link'].str.extract(r'(\d+)$').astype(int) 

# Renaming
understat_roster_metrics_df = understat_roster_metrics_df.rename(columns={
    'time':'minutes_played',
    'xg':'expected_goals',
    'xa':'expected_assists',
    'h_a':'location'
})

# Location encoding
understat_roster_metrics_df['is_home'] = understat_roster_metrics_df['location'].apply(lambda x: 1 if x == 'h' else 0)

# Metrics rounding
# 2 Decimal places is standard
adv_cols = [
    'expected_goals', 
    'expected_assists', 
    'xgchain', 
    'xgbuildup'
    ]
understat_roster_metrics_df[adv_cols] = understat_roster_metrics_df[adv_cols].round(2)

# Handling 'Sub' Positions
# Understat marks players as 'Sub' if they came on. However, we often
# want to know their ACTUAL position. We'll keep it for now but note 
# it's a "Role" rather than a "Position" in some cases.
understat_roster_metrics_df['is_starter'] = understat_roster_metrics_df['position'].apply(lambda x: 0 if x == 'Sub' else 1)

# Dropping data that is no longer needed
understat_roster_metrics_df = understat_roster_metrics_df.drop(columns=['match_link','id','location'])

understat_roster_metrics_df.head(10)

Unnamed: 0,goals,own_goals,shots,expected_goals,minutes_played,player_id,team_id,position,player,yellow_card,...,roster_out,key_passes,assists,expected_assists,xgchain,xgbuildup,positionorder,understat_match,is_home,is_starter
0,0,0,0,0.0,90,491,83,GK,Petr Cech,0,...,0,0,0,0.0,0.43,0.43,1,88,1,1
1,0,0,1,0.03,69,967,83,DR,Mathieu Debuchy,0,...,0,1,0,0.02,0.19,0.16,2,88,1,1
2,0,0,1,0.02,90,494,83,DC,Laurent Koscielny,0,...,0,0,0,0.0,0.1,0.1,3,88,1,1
3,0,0,1,0.08,90,507,83,DC,Per Mertesacker,0,...,0,0,0,0.0,0.23,0.23,3,88,1,1
4,0,0,0,0.0,90,495,83,DL,Nacho Monreal,1,...,0,0,0,0.0,0.61,0.61,4,88,1,1
5,0,0,0,0.0,60,497,83,DMC,Francis Coquelin,0,...,0,1,0,0.07,0.13,0.06,7,88,1,1
6,0,0,2,0.04,90,504,83,DMC,Aaron Ramsey,0,...,0,2,0,0.12,0.25,0.09,7,88,1,1
7,0,0,3,0.11,90,966,83,AMR,Alex Oxlade-Chamberlain,0,...,0,4,0,0.4,0.72,0.2,11,88,1,1
8,0,0,2,0.15,90,499,83,AMC,Mesut Özil,0,...,0,4,0,0.34,0.43,0.09,12,88,1,1
9,0,0,2,0.05,90,965,83,AML,Santiago Cazorla,0,...,0,5,0,0.17,0.76,0.64,13,88,1,1


# Cleaning understat_team_metrics_df

In [456]:
fpl_teams_df = dfs['fpl_season_teams']
fpl_team_list = sorted(fpl_teams_df['team_name'].unique())

fpl_team_list

['Arsenal',
 'Aston Villa',
 'Bournemouth',
 'Brentford',
 'Brighton',
 'Burnley',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Fulham',
 'Ipswich',
 'Leeds',
 'Leicester',
 'Liverpool',
 'Luton',
 'Man City',
 'Man Utd',
 'Newcastle',
 'Norwich',
 "Nott'm Forest",
 'Sheffield Utd',
 'Southampton',
 'Spurs',
 'Watford',
 'West Brom',
 'West Ham',
 'Wolves']

In [457]:

# Load files
fpl_teams = dfs['fpl_season_teams']
cs_teams = dfs['teams']

fpl_teams = fpl_teams[['team_id', 'team_name', 'short_name']].drop_duplicates()

cs_teams = cs_teams[['id','name','short_name']].drop_duplicates()
cs_teams = cs_teams.rename(columns={
    'id':'team_id',
    'name':'team_name'
})


primary_ids = cs_teams['team_id'].unique()
additional_teams = fpl_teams[~fpl_teams['team_id'].isin(primary_ids)]
name_map = pd.concat([cs_teams, additional_teams], ignore_index=True)


print(name_map)

    team_id       team_name short_name
0         1         Arsenal        ARS
1         2     Aston Villa        AVL
2         3         Burnley        BUR
3         4     Bournemouth        BOU
4         5       Brentford        BRE
5         6        Brighton        BHA
6         7         Chelsea        CHE
7         8  Crystal Palace        CRY
8         9         Everton        EVE
9        10          Fulham        FUL
10       11           Leeds        LEE
11       12       Liverpool        LIV
12       13        Man City        MCI
13       14         Man Utd        MUN
14       15       Newcastle        NEW
15       16   Nott'm Forest        NFO
16       17      Sunderland        SUN
17       18           Spurs        TOT
18       19        West Ham        WHU
19       20          Wolves        WOL


In [458]:
# Standardize your existing name_map
fpl_teams = dfs['fpl_season_teams'][['team_id', 'team_name', 'short_name']].drop_duplicates()
cs_teams = dfs['teams'][['id','name','short_name']].drop_duplicates().rename(columns={'id':'team_id','name':'team_name'})

primary_ids = cs_teams['team_id'].unique()
additional_teams = fpl_teams[~fpl_teams['team_id'].isin(primary_ids)]
name_map = pd.concat([cs_teams, additional_teams], ignore_index=True)

# Understat Team Metrics Dataframe
understat_team_metrics_df = dfs['understat_team_metrics']

# Create a dictionary to fix Understat name discrepancies
# This ensures "Manchester City" matches "Man City" in your name_map
name_corrections = {
    'Manchester City': 'Man City',
    'Manchester United': 'Man Utd',
    'Tottenham': 'Spurs',
    'Wolverhampton Wanderers': 'Wolves',
    'Nottingham Forest': "Nott'm Forest",
    'Newcastle United': 'Newcastle',
    'Sheffield United': 'Sheffield Utd',
    'West Bromwich Albion': 'West Brom'
}

understat_team_metrics_df['team_h_clean'] = understat_team_metrics_df['team_h'].replace(name_corrections)
understat_team_metrics_df['team_a_clean'] = understat_team_metrics_df['team_a'].replace(name_corrections)

# Map the Unique Team IDs to the metrics table
# Join for Home Team
understat_team_metrics_df = understat_team_metrics_df.merge(name_map[['team_id', 'team_name']], left_on='team_h_clean', right_on='team_name', how='left')
understat_team_metrics_df = understat_team_metrics_df.rename(columns={'team_id': 'team_h_id'}).drop(columns=['team_name'])

# Join for Away Team
understat_team_metrics_df = understat_team_metrics_df.merge(name_map[['team_id', 'team_name']], left_on='team_a_clean', right_on='team_name', how='left')
understat_team_metrics_df = understat_team_metrics_df.rename(columns={'team_id': 'team_a_id'}).drop(columns=['team_name'])

# Final Cleaning: Drop helper columns and Understat's original inconsistent IDs
understat_team_metrics_df['date'] = pd.to_datetime(understat_team_metrics_df['date'])
understat_team_metrics_df = understat_team_metrics_df.drop(columns=['h', 'a', 'team_h_clean', 'team_a_clean'])

# Sort by date for chronological analysis
understat_team_metrics_df = understat_team_metrics_df.sort_values('date').reset_index(drop=True)

understat_team_metrics_df[['date', 'team_h_id', 'team_h', 'team_a_id', 'team_a', 'h_xg', 'a_xg']].head()
understat_team_metrics_df.head(20)

Unnamed: 0,id,fid,date,league_id,season,h_goals,a_goals,team_h,team_a,h_xg,...,h_shot,a_shot,h_shotontarget,a_shotontarget,h_deep,a_deep,a_ppda,h_ppda,team_h_id,team_a_id
0,81,958431,2015-08-08 15:45:00,1,2015,1,0,Manchester United,Tottenham,0.627539,...,9,9,1,4,4,10,8.2188,13.8261,14.0,18.0
1,82,958427,2015-08-08 18:00:00,1,2015,0,1,Bournemouth,Aston Villa,0.876106,...,11,7,2,3,11,2,11.8462,6.9,4.0,2.0
2,83,958429,2015-08-08 18:00:00,1,2015,2,2,Everton,Watford,0.604226,...,10,11,5,5,5,4,17.1579,6.65,9.0,
3,84,958430,2015-08-08 18:00:00,1,2015,4,2,Leicester,Sunderland,2.56803,...,19,11,8,5,5,6,9.5556,10.88,,17.0
4,85,958433,2015-08-08 18:00:00,1,2015,1,3,Norwich,Crystal Palace,1.13076,...,17,11,6,7,5,10,10.625,5.7368,,8.0
5,86,958428,2015-08-08 20:30:00,1,2015,2,2,Chelsea,Swansea,0.64396,...,11,17,3,10,10,5,8.8333,10.3636,7.0,
6,87,958432,2015-08-09 16:30:00,1,2015,2,2,Newcastle United,Southampton,1.54613,...,9,14,4,4,3,5,7.0,12.7222,15.0,
7,88,958426,2015-08-09 16:30:00,1,2015,0,2,Arsenal,West Ham,1.33166,...,22,8,6,4,11,0,12.4545,8.1667,1.0,19.0
8,89,958434,2015-08-09 19:00:00,1,2015,0,1,Stoke,Liverpool,0.381274,...,7,8,1,3,2,5,9.3462,11.913,,12.0
9,90,958435,2015-08-10 23:00:00,1,2015,0,3,West Bromwich Albion,Manchester City,0.435238,...,9,20,2,7,4,8,8.087,23.2941,,13.0


In [459]:
# 
cleaned_dfs = [
    fact_player_gameweeks_df, 
    fpl_fixtures_df, 
    fpl_season_players_df, 
    fpl_season_teams_df, 
    player_history_df, 
    players_df, 
    positions_df, 
    teams_df, 
    understat_roster_metrics_df, 
    understat_team_metrics_df
]

%store cleaned_dfs

Stored 'cleaned_dfs' (list)
