From the fbref class, we can grab a season's worth of team data as seen on the fbref website. The team data we can extract can be split into 7 categories:
* fixtures data
* league table
* attacking data
* defense data
* passing data
* goalkeeping data
* playing time data

This notebook will look to create a cleaning function for each category as mentioned above to help with further analysis later.

# Imports

In [1]:
import os

# Move up to the parent directory so that the `src` package imported in the
# next cell can be resolved. The move is guarded so that re-running this cell
# does not climb one directory further on every execution.
# NOTE(review): assumes `src` lives directly in the parent directory — confirm.
if not os.path.isdir("src"):
    os.chdir("../")

In [2]:
import pandas as pd
from src.fbref.fbref_class import FBref
from src.etl.clean import (
    clean_fixtures_df,
    clean_league_table_df,
    clean_attacking_table_df,
    clean_defense_table_df,
    clean_passing_table_df,
    clean_goalkeeping_table_df,
    clean_playing_time_table_df,
)
# Lift the column display limit so wide dataframes are shown in full.
pd.options.display.max_columns = None

# Instantiate FBref class

In [3]:
# Create the FBref instance whose `get_seasons_dict` method is called below.
fb = FBref()

# Config

In [4]:
# Seasons to request; the first entry is the example season used below.
season_name_list = ["2022_2023"]
# NOTE(review): presumably fbref's numeric competition id matching
# `league_name` — confirm against the FBref class.
league_id = 9
league_name = "Premier-League"

# Grab data for an example season

In [5]:
# Fetch the data for all configured seasons; the result is indexed by season
# name (see the lookup below).
seasons_dict = fb.get_seasons_dict(season_name_list, league_id, league_name)

# Keep the first configured season as the worked example for this notebook.
season_dict = seasons_dict[season_name_list[0]]

# Grab dataframes to be cleaned

In [6]:
# Raw fixtures table for the season.
fixtures_df = season_dict['fixtures']

# Raw league table for the season.
league_table_df = season_dict['data']['league_table']

# One raw squad-level table per team-data category, unpacked in category order.
(
    attacking_df,
    defense_df,
    passing_df,
    goalkeeping_df,
    playing_time_df,
) = (
    season_dict['data']['team_data'][category]
    for category in (
        'attacking',
        'defense',
        'passing',
        'goalkeeping',
        'playing_time',
    )
)

# Clean fixtures table

In [7]:
# Run the fixtures cleaner on the raw fixtures table and preview the first
# three rows of the result.
cleaned_fixtures_df = clean_fixtures_df(fixtures_df)
cleaned_fixtures_df.head(3)

Unnamed: 0,week,dow,kickoff,home_team,home_score,away_score,xG_home,xG_away,away_team,attendance,referee,notes
0,1.0,Fri,2022-08-05 20:00:00,Crystal Palace,0.0,2.0,1.2,1.0,Arsenal,25286.0,Anthony Taylor,
1,1.0,Sat,2022-08-06 12:30:00,Fulham,2.0,2.0,1.2,1.2,Liverpool,22207.0,Andy Madley,
2,1.0,Sat,2022-08-06 15:00:00,Tottenham,4.0,1.0,1.5,0.5,Southampton,61732.0,Andre Marriner,


# Clean league table

In [8]:
# Run the league-table cleaner on the raw league table and preview the first
# three rows of the result.
cleaned_league_table_df = clean_league_table_df(league_table_df)
cleaned_league_table_df.head(3)

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,Home_MP,Home_W,Home_D,Home_L,Home_GF,Home_GA,Home_GD,Home_Pts,Home_Pts_Per_MP,Away_MP,Away_W,Away_D,Away_L,Away_GF,Away_GA,Away_GD,Away_Pts,Away_Pts_Per_MP,win_perc,draw_perc,loss_perc,home_win_perc,home_draw_perc,home_loss_perc,away_win_perc,away_draw_perc,away_loss_perc,goals_per_game,goals_against_per_game,home_goals_per_game,home_goals_against_per_game,away_goals_per_game,away_goals_against_per_game
0,1,Arsenal,13,11,1,1,31,11,20,34,6,6,0,0,19,7,12,18,3.0,7,5,1,1,12,4,8,16,2.29,0.846,0.077,0.077,1.0,0.0,0.0,0.714,0.143,0.143,2.385,0.846,3.167,1.167,1.714,0.571
1,2,Manchester City,13,10,2,1,39,12,27,32,7,7,0,0,29,7,22,21,3.0,6,3,2,1,10,5,5,11,1.83,0.769,0.154,0.077,1.0,0.0,0.0,0.5,0.333,0.167,3.0,0.923,4.143,1.0,1.667,0.833
2,3,Newcastle Utd,14,7,6,1,28,11,17,27,7,4,3,0,16,5,11,15,2.14,7,3,3,1,12,6,6,12,1.71,0.5,0.429,0.071,0.571,0.429,0.0,0.429,0.429,0.143,2.0,0.786,2.286,0.714,1.714,0.857


# Clean team data

### Clean attacking table

In [9]:
# Run the attacking-table cleaner on the raw attacking data and preview the
# first three rows of the result.
cleaned_attacking_df = clean_attacking_table_df(attacking_df)
cleaned_attacking_df.head(3)

Unnamed: 0,Squad,no_of_players_used,Age,possession,MP,total_goals,total_assists,total_non_penalty_goals,total_penalties,total_penalty_attempted,total_goals_per_90,total_assist_per_90,total_non_penalty_goals_per_90,total_xg,total_npxg,total_expected_assisted_goals,total_xg_per_90,total_expected_assisted_goals_per_90,total_npxg_per_90,total_shots,total_shots_on_target,shot_on_target_perc,total_shots_per_90,total_shots_on_target_per_90,goal_per_shot,goal_per_shot_on_target,average_distance_per_shot,total_shots_free_kicks,total_penalties_scored,total_penalties_attempted,npxG_per_Sh,total_goals_minus_xg,total_npg_minus_npxg,total_sca,total_sca_per_90,total_live_ball_passes_to_shots,total_dead_ball_passes_to_shots,total_successful_dribbles_to_shots,total_shots_to_shots,total_fouls_drawn_to_shots,total_defensive_actions_to_shots,total_gca,total_gca_per_90,total_live_ball_passes_to_goals,total_dead_ball_passes_to_goals,total_successful_dribbles_to_goals,total_shots_to_goals,total_fouls_drawn_to_goals,total_defensive_actions_to_goals,goal_to_assist_ratio,total_goals_per_match,total_assists_per_match,total_non_penalty_goals_per_match,total_penalties_per_match,total_penalty_attempted_per_match,total_xg_per_match,total_npxg_per_match,total_expected_assisted_goals_per_match,Expected_npxG_plus_xAG_per_match,total_shots_per_match,total_shots_on_target_per_match,total_shots_free_kicks_per_match,total_penalties_scored_per_match,total_penalties_attempted_per_match,total_goals_minus_xg_per_match,total_npg_minus_npxg_per_match,total_sca_per_match,total_live_ball_passes_to_shots_per_match,total_dead_ball_passes_to_shots_per_match,total_successful_dribbles_to_shots_per_match,total_shots_to_shots_per_match,total_fouls_drawn_to_shots_per_match,total_defensive_actions_to_shots_per_match,total_gca_per_match,total_live_ball_passes_to_goals_per_match,total_dead_ball_passes_to_goals_per_match,total_successful_dribbles_to_goals_per_match,total_shots_to_goals_per_match
0,Arsenal,23,24.7,57.5,13,30,23,29,1,1,2.31,1.77,2.23,24.1,23.6,16.9,1.85,1.3,1.82,207,74,35.7,15.92,5.69,0.14,0.39,16.1,7,1,1,0.12,5.9,5.4,364,28.0,263,31,18,27,15,10,54,4.15,44,4,0,5,1,0,1.304348,2.308,1.769,2.231,0.077,0.077,1.854,1.815,1.3,3.115,15.923,5.692,0.538,0.077,0.077,0.454,0.415,28.0,20.231,2.385,1.385,2.077,1.154,0.769,4.154,3.385,0.308,0.0,0.385
1,Aston Villa,23,27.4,48.4,14,14,8,13,1,1,1.0,0.57,0.93,15.6,15.0,11.1,1.12,0.79,1.07,155,55,35.5,11.07,3.93,0.08,0.24,18.4,6,1,1,0.1,-1.6,-2.0,274,19.57,207,15,14,16,12,10,23,1.64,16,0,1,4,2,0,1.75,1.0,0.571,0.929,0.071,0.071,1.114,1.071,0.793,1.857,11.071,3.929,0.429,0.071,0.071,-0.114,-0.143,19.571,14.786,1.071,1.0,1.143,0.857,0.714,1.643,1.143,0.0,0.071,0.286
2,Bournemouth,25,26.8,38.6,14,15,12,15,0,0,1.07,0.86,1.07,9.2,9.2,7.4,0.66,0.53,0.66,106,40,37.7,7.57,2.86,0.14,0.38,16.0,4,0,0,0.09,5.8,5.8,185,13.21,134,18,11,10,7,5,27,1.93,21,2,0,3,0,1,1.25,1.071,0.857,1.071,0.0,0.0,0.657,0.657,0.529,1.186,7.571,2.857,0.286,0.0,0.0,0.414,0.414,13.214,9.571,1.286,0.786,0.714,0.5,0.357,1.929,1.5,0.143,0.0,0.214


### Clean defense table

In [10]:
# Run the defense-table cleaner on the raw defense data and preview the first
# three rows of the result.
cleaned_defense_df = clean_defense_table_df(defense_df)
cleaned_defense_df.head(3)

Unnamed: 0,Squad,no_of_players_used,MP,total_tackles,total_tackles_won,total_tackles_won_d3,total_tackles_won_m3,total_tackles_won_a3,total_dribbles_tackled,total_dribbles_faced,dribbles_tackled_perc,total_dribbles_past,total_blocks,total_shots_blocked,total_shots_passed,total_interceptions,total_tackles_interceptions,total_clearances,total_errors_to_opp_shots,total_yellow_cards,total_red_cards,total_second_yellows,total_fouls_committed,total_fouls_drawn,total_offsides,total_crosses,total_penalties_won,total_penalties_conceded,total_own_goals,total_loose_balls_recovered,total_aerial_duels_won,total_aerial_duels_lost,Aerial_Duels_Won_perc,total_tackles_per_match,total_tackles_won_per_match,total_tackles_won_d3_per_match,total_tackles_won_m3_per_match,total_tackles_won_a3_per_match,total_dribbles_tackled_per_match,total_dribbles_faced_per_match,total_dribbles_past_per_match,total_blocks_per_match,total_shots_blocked_per_match,total_shots_passed_per_match,total_interceptions_per_match,total_tackles_interceptions_per_match,total_clearances_per_match,total_yellow_cards_per_match,total_red_cards_per_match,total_second_yellows_per_match,total_fouls_committed_per_match,total_fouls_drawn_per_match,total_offsides_per_match,total_crosses_per_match,total_penalties_won_per_match,total_penalties_conceded_per_match,total_own_goals_per_match,total_loose_balls_recovered_per_match,total_aerial_duels_won_per_match,total_aerial_duels_lost_per_match
0,Arsenal,23,13.0,196,123,90,65,41,83,148,56.1,65,123,30,93,101,297,228,9,21,0,0,133,150,18,207,1,2,1,690,145,170,46.0,15.077,9.462,6.923,5.0,3.154,6.385,11.385,5.0,9.462,2.308,7.154,7.769,22.846,17.538,1.615,0.0,0.0,10.231,11.538,1.385,15.923,0.077,0.154,0.077,53.077,11.154,13.077
1,Aston Villa,23,14.0,264,137,131,104,29,119,212,56.1,93,177,46,131,133,397,215,3,30,1,0,168,166,30,248,1,3,3,711,196,203,49.1,18.857,9.786,9.357,7.429,2.071,8.5,15.143,6.643,12.643,3.286,9.357,9.5,28.357,15.357,2.143,0.071,0.0,12.0,11.857,2.143,17.714,0.071,0.214,0.214,50.786,14.0,14.5
2,Bournemouth,25,14.0,234,146,119,94,21,91,191,47.6,100,171,68,103,108,342,387,1,22,0,0,146,151,11,177,0,5,2,677,179,204,46.7,16.714,10.429,8.5,6.714,1.5,6.5,13.643,7.143,12.214,4.857,7.357,7.714,24.429,27.643,1.571,0.0,0.0,10.429,10.786,0.786,12.643,0.0,0.357,0.143,48.357,12.786,14.571


### Clean passing table

In [11]:
# Run the passing-table cleaner on the raw passing data and preview the first
# three rows of the result.
cleaned_passing_df = clean_passing_table_df(passing_df)
cleaned_passing_df.head(3)

Unnamed: 0,Squad,no_of_players_used,MP,total_passes_completed,total_passes_attempted,pass_completion_perc,distance_covered,progressive_distance,total_short_passes_completed,total_short_passes_attempted,short_pass_completion_perc,total_medium_passes_completed,total_medium_passes_attempted,medium_pass_completion_perc,total_long_passes_completed,total_long_passes_attempted,long_pass_completion_perc,total_key_passes,total_completed_final_third_passes,total_completed_18_yard_box_passes,total_completed_18_yard_box_crosses,total_progressive_passes,total_live_ball_passes,total_dead_ball_passes,free_kick_passes_attempted,defence_to_open_space_passes_completed,passes_40_yards_width,total_crosses,total_throw_in_taken,total_corners,total_inswinging_corners,total_outswinging_corners,total_straight_corners,total_passes_offsides,total_passes_blocked,possession,total_touches,defensive_penalty_area_touches,defensive_third_touches,middle_third_touches,attacking_third_touches,attacking_penalty_area_touches,total_live_ball_touches,total_dribbles_completed,total_dribble_attempted,dribbles_success_perc,dribbles_unsuccessful_dispossession,dribbles_dispossesed,total_passes_recieved,total_progressive_passes_recieved,total_passes_completed_per_match,total_passes_attempted_per_match,distance_covered_per_match,progressive_distance_per_match,total_short_passes_completed_per_match,total_short_passes_attempted_per_match,total_medium_passes_completed_per_match,total_medium_passes_attempted_per_match,total_long_passes_completed_per_match,total_long_passes_attempted_per_match,total_key_passes_per_match,total_completed_final_third_passes_per_match,total_completed_18_yard_box_passes_per_match,total_completed_18_yard_box_crosses_per_match,total_progressive_passes_per_match,total_live_ball_passes_per_match,total_dead_ball_passes_per_match,free_kick_passes_attempted_per_match,defence_to_open_space_passes_completed_per_match,passes_40_yards_width_per_match,total_crosses_per_match,total_throw_in_taken_per_match,total_corners_per_match,total_inswinging_corners_per_match,total_outswinging_corners_per_match,total_straight_corners_per_match,total_passes_offsides_per_match,total_passes_blocked_per_match,total_touches_per_match,defensive_penalty_area_touches_per_match,defensive_third_touches_per_match,middle_third_touches_per_match,attacking_third_touches_per_match,attacking_penalty_area_touches_per_match,total_live_ball_touches_per_match,total_dribbles_completed_per_match,total_dribble_attempted_per_match,dribbles_success_perc_per_match,dribbles_unsuccessful_dispossession_per_match,dribbles_dispossesed_per_match,total_passes_recieved_per_match,total_progressive_passes_recieved_per_match
0,Arsenal,23,13.0,6002,7207,83.3,103958,35640,2767,3059,90.5,2600,2931,88.7,507,882,57.5,155,521,144,23,456,6631,558,182,14,35,207,220,73,53,4,0,18,111,57.5,8602,658,2225,3914,2534,434,8601,118,248,47.6,204,155,5906,437,461.692,554.385,7996.769,2741.538,212.846,235.308,200.0,225.462,39.0,67.846,11.923,40.077,11.077,1.769,35.077,510.077,42.923,14.0,1.077,2.692,15.923,16.923,5.615,4.077,0.308,0.0,1.385,8.538,661.692,50.615,171.154,301.077,194.923,33.385,661.615,9.077,19.077,3.662,15.692,11.923,454.308,33.615
1,Aston Villa,23,14.0,5066,6501,77.9,89985,33170,2271,2600,87.3,2128,2509,84.8,527,1027,51.3,109,403,94,31,343,5831,640,179,15,38,248,274,61,36,10,2,30,117,48.4,7945,896,2592,3567,1860,255,7943,72,192,37.5,216,127,4983,335,361.857,464.357,6427.5,2369.286,162.214,185.714,152.0,179.214,37.643,73.357,7.786,28.786,6.714,2.214,24.5,416.5,45.714,12.786,1.071,2.714,17.714,19.571,4.357,2.571,0.714,0.143,2.143,8.357,567.5,64.0,185.143,254.786,132.857,18.214,567.357,5.143,13.714,2.679,15.429,9.071,355.929,23.929
2,Bournemouth,25,14.0,4176,5496,76.0,74911,30456,1873,2174,86.2,1739,2069,84.1,440,895,49.2,84,282,74,23,234,4820,665,175,9,28,177,263,51,31,10,3,11,130,38.6,7029,1074,2986,2794,1353,207,7029,87,218,39.9,206,142,4120,226,298.286,392.571,5350.786,2175.429,133.786,155.286,124.214,147.786,31.429,63.929,6.0,20.143,5.286,1.643,16.714,344.286,47.5,12.5,0.643,2.0,12.643,18.786,3.643,2.214,0.714,0.214,0.786,9.286,502.071,76.714,213.286,199.571,96.643,14.786,502.071,6.214,15.571,2.85,14.714,10.143,294.286,16.143


### Clean goalkeeping table

In [12]:
# Run the goalkeeping-table cleaner on the raw goalkeeping data and preview
# the first three rows of the result.
cleaned_goalkeeping_df = clean_goalkeeping_table_df(goalkeeping_df)
cleaned_goalkeeping_df.head(3)

Unnamed: 0,Squad,no_of_players_used,MP,total_goals_against,total_goals_against_per_90,total_shots_on_target_against,total_saves,save_perc,total_clean_sheets,clean_sheet_perc,total_penalties_faced,total_penalties_allowed,total_penalties_saved,total_penalties_missed,penalty_save_perc,total_goals_against_penalties,total_goals_against_free_kicks,total_goals_against_corners,total_goals_against_own_goals,total_post_shot_xg,total_post_shot_xg_per_shot_on_target,post_shot_xg_minus_goals_against,post_shot_xg_minus_goals_against_per_90,total_40_yard_passes_completed,total_40_yard_passes_attempted,total_40_yard_passes_completion_perc,total_passes_attempted_excl_goal_kick,total_throws_attempted,total_passes_lauches_perc_excl_goal_kick,average_pass_length_excl_goal_kick,goal_kicks_attempted,goal_kicks_launched_perc,average_goal_kicks_length,opponent_attempted_crosses_into_penalty_area,opponent_crosses_into_penalty_area_stopped,opponent_crosses_into_penalty_area_stopped_perc,defensive_actions_outside_penalty_area,defensive_actions_outside_penalty_area_per_90,average_distance_from_goal_for_defensive_actions,total_shots_on_target_against_per_match,total_saves_per_match,total_goals_against_per_match,total_goals_against_penalties_per_match,total_goals_against_free_kicks_per_match,total_goals_against_corners_per_match,total_goals_against_own_goals_per_match,total_post_shot_xg_per_match,total_40_yard_passes_completed_per_match,total_40_yard_passes_attempted_per_match,total_passes_attempted_excl_goal_kick_per_match,total_throws_attempted_per_match,total_passes_lauches_perc_excl_goal_kick_per_match,average_pass_length_excl_goal_kick_per_match,goal_kicks_attempted_per_match,opponent_attempted_crosses_into_penalty_area_per_match,opponent_crosses_into_penalty_area_stopped_per_match,defensive_actions_outside_penalty_area_per_match
0,Arsenal,1,13,11,0.85,35,25,71.4,6,46.2,2,1,0,1,0.0,1,0,1,1,9.8,0.26,-0.2,-0.01,49,161,30.4,327,64,38.2,35.2,53,67.9,53.9,136,7,5.1,14,1.08,15.4,2.692,1.923,0.846,0.077,0.0,0.077,0.077,0.754,3.769,12.385,25.154,4.923,2.938,2.708,4.077,10.462,0.538,1.077
1,Aston Villa,2,14,21,1.5,59,40,67.8,3,21.4,3,2,1,0,33.3,2,1,3,3,17.7,0.26,-0.3,-0.02,69,199,34.7,386,57,39.6,35.4,90,51.1,42.5,163,26,16.0,6,0.43,11.1,4.214,2.857,1.5,0.143,0.071,0.214,0.214,1.264,4.929,14.214,27.571,4.071,2.829,2.529,6.429,11.643,1.857,0.429
2,Bournemouth,2,14,32,2.29,71,41,62.0,3,21.4,5,5,0,0,0.0,5,0,9,2,26.3,0.31,-3.7,-0.26,102,212,48.1,290,60,45.5,36.6,128,62.5,46.1,297,15,5.1,6,0.43,9.8,5.071,2.929,2.286,0.357,0.0,0.643,0.143,1.879,7.286,15.143,20.714,4.286,3.25,2.614,9.143,21.214,1.071,0.429


### Clean playing time

In [13]:
# Run the playing-time cleaner on the raw playing-time data and preview the
# first three rows of the result.
cleaned_playing_time_df = clean_playing_time_table_df(playing_time_df)
cleaned_playing_time_df.head(3)

Unnamed: 0,Squad,no_of_players_used,average_age,MP,no_of_subs_used,minutes_per_sub,no_of_subs_unused,goals_scored_minus_against_per_90,total_expect_goals,total_expected_goals_against,xg_minus_xga,xg_minus_xga_per_90,no_of_subs_used_per_match,no_of_subs_unused_per_match,total_expect_goals_per_match,total_expected_goals_against_per_match
0,Arsenal,23,24.7,13,51,12,66,1.54,24.1,11.2,12.9,0.99,3.923,5.077,1.854,0.862
1,Aston Villa,23,27.4,14,54,20,72,-0.5,15.6,18.6,-3.0,-0.21,3.857,5.143,1.114,1.329
2,Bournemouth,25,26.8,14,48,17,78,-1.21,9.2,23.4,-14.1,-1.01,3.429,5.571,0.657,1.671


# Conclusion

We can now clean the team data from the fbref website. This involves changing data types for specific columns, renaming columns, and creating per-match columns. The output dataframes provide all the team-level data we will need for analysis. It should be noted that many of the dataframes have a large number of columns, which can be hard to deal with. The purpose of this notebook was to clean the data and output everything we can obtain, so that these tables can be filtered later according to the needs of a specific analysis.