In [4]:
import pandas as pd
import os
from thefuzz import fuzz, process
import unicodedata

## Util functions


### Seasons with id_dict


In [107]:
def save_fpl_players(fpl_subfolders, season):
    """Copy every player's FPL gameweek file into the season's joint folder.

    Parameters
    ----------
    fpl_subfolders : iterable of str
        Paths of per-player folders. Each folder must contain a ``gw.csv``.
        The folder's own name is split on ``_``; every piece except the last
        one is joined with spaces to form the output file name.
    season : str
        Season label (e.g. ``'21-22'``) used in the output path and message.

    Side effects
    ------------
    Writes ``./data/joint/<season>/fpl/<player name>.csv`` for each folder and
    prints a completion message.
    """
    player_dir = './data/joint/' + str(season) + '/fpl/'

    for folder in fpl_subfolders:
        # Use the folder's own name instead of a fixed position in the path,
        # so the function works for input paths of any depth.
        folder_name = os.path.basename(os.path.normpath(folder))
        clean_player_name = " ".join(folder_name.split('_')[:-1])
        player_df = pd.read_csv(os.path.join(folder, 'gw.csv'))

        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(player_dir, exist_ok=True)
        # index_label=False keeps the index values but leaves the index label
        # out of the header row (see the pandas to_csv documentation).
        player_df.to_csv(player_dir + clean_player_name + '.csv', index_label=False)

    print('successfully cleaned 20' + str(season) + ' fpl data')

# Understat Files
def save_under_players(understat_files, season):
    """Copy every player's Understat file into the season's joint folder.

    Parameters
    ----------
    understat_files : iterable of str
        Paths of per-player Understat CSV files. The file name is split on
        ``_``; every piece except the last one (which carries the extension)
        is joined with spaces to form the output file name.
    season : str
        Season label (e.g. ``'21-22'``) used in the output path and message.

    Side effects
    ------------
    Writes ``./data/joint/<season>/understat/<player name>.csv`` for each file
    and prints a completion message.
    """
    player_dir = './data/joint/' + str(season) + '/understat/'

    for file_ in understat_files:
        # Use the file's own name instead of a fixed position in the path,
        # so the function works for input paths of any depth.
        file_name = os.path.basename(os.path.normpath(file_))
        clean_player_name = " ".join(file_name.split('_')[:-1])
        player_df = pd.read_csv(file_)

        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(player_dir, exist_ok=True)
        # index_label=False keeps the index values but leaves the index label
        # out of the header row (see the pandas to_csv documentation).
        player_df.to_csv(player_dir + clean_player_name + '.csv', index_label=False)

    print('successfully cleaned understat 20' + str(season) + ' data')

def joint_players_info(fpl_player_folder_path, understat_player_folder_path, season):
    """Collect the raw FPL and Understat inputs and copy them to the joint folder.

    Sub-directories of ``fpl_player_folder_path`` are handed to
    ``save_fpl_players`` and regular files of ``understat_player_folder_path``
    are handed to ``save_under_players``, both for the given ``season``.
    """
    with os.scandir(fpl_player_folder_path) as fpl_entries:
        fpl_subfolders = [entry.path for entry in fpl_entries if entry.is_dir()]

    with os.scandir(understat_player_folder_path) as understat_entries:
        under_files = [entry.path for entry in understat_entries if entry.is_file()]

    save_fpl_players(fpl_subfolders, season)
    save_under_players(under_files, season)

    print('20'+ season +' fpl and understat data now in `joint` folder')

In [108]:
def merge_fpl_understat_data(fpl_player_folder_path, understat_player_folder_path, season):
    """Join each player's FPL gameweek rows with their Understat match rows.

    The raw inputs are first copied to ``./data/joint/<season>/`` via
    ``joint_players_info``. For every Understat file whose name appears in the
    ``Understat_Name`` column of ``./data/20<season>/id_dict.csv``, the matching
    ``FPL_Name`` is resolved to an FPL file (exact name first, fuzzy match as a
    fallback) and the two tables are inner-joined on the match date.

    Parameters
    ----------
    fpl_player_folder_path : str
        Folder holding one sub-folder per FPL player.
    understat_player_folder_path : str
        Folder holding one Understat CSV per player.
    season : str
        Season label such as ``'21-22'``.

    Side effects
    ------------
    Writes ``./data/joint/<season>/merged/<FPL_Name>.csv`` for every player
    with at least one joined row and prints a completion message.
    """
    joint_players_info(fpl_player_folder_path, understat_player_folder_path, season)

    joint_fpl_data_path = "./data/joint/" + season + "/fpl/"
    joint_understat_path = "./data/joint/" + season + "/understat/"
    merged_dir = './data/joint/' + season + '/merged/'

    # The default makes a missing folder behave like an empty one
    understat_files = next(os.walk(joint_understat_path), (None, None, []))[2]
    fpl_files = next(os.walk(joint_fpl_data_path), (None, None, []))[2]
    player_ids = pd.read_csv('./data/20' + season + '/id_dict.csv')

    # splitext only strips the extension, so names containing a dot survive
    understat_names = [os.path.splitext(file_)[0] for file_ in understat_files]
    fpl_names = [os.path.splitext(file_)[0] for file_ in fpl_files]
    fpl_name_set = set(fpl_names)

    # Understat name -> FPL name, built once instead of scanning per player.
    # Rows with a missing name are dropped so NaN never reaches the matcher.
    fpl_name_by_understat = (
        player_ids.dropna(subset=['Understat_Name', 'FPL_Name'])
        .drop_duplicates(subset='Understat_Name')
        .set_index('Understat_Name')['FPL_Name']
    )

    for name in understat_names:
        indexed_fpl_name = fpl_name_by_understat.get(name)
        if not isinstance(indexed_fpl_name, str) or not indexed_fpl_name:
            continue

        if indexed_fpl_name in fpl_name_set:
            # An exact file name always beats a fuzzy guess
            fpl_file_name = indexed_fpl_name
        else:
            # Fall back to fuzzy matching to find the corresponding FPL file
            best_match = process.extractOne(
                indexed_fpl_name, fpl_names, scorer=fuzz.partial_token_sort_ratio
            )
            if best_match is None:  # no FPL files to choose from
                continue
            fpl_file_name = best_match[0]

        fpl_player_data = pd.read_csv(joint_fpl_data_path + str(fpl_file_name) + '.csv')
        understat_player_data = pd.read_csv(joint_understat_path + name + '.csv')

        # Rename 'kickoff_time' to 'date' and reduce
        # '2021-10-03T13:00:00Z' to '2021-10-03'
        fpl_player_data = fpl_player_data.rename(columns={'kickoff_time': 'date'})
        fpl_player_data['date'] = fpl_player_data['date'].str.split('T').str[0]

        first_fpl_date = pd.to_datetime(fpl_player_data['date'].min())
        last_fpl_date = pd.to_datetime(fpl_player_data['date'].max())

        # Keep Understat rows inside the FPL date range. The bounds are
        # inclusive so the first and last FPL match days are not dropped.
        understat_dates = pd.to_datetime(understat_player_data['date'])
        in_fpl_range = (understat_dates >= first_fpl_date) & (understat_dates <= last_fpl_date)
        understat_filtered = understat_player_data[in_fpl_range]

        # Merge FPL rows with Understat rows that share the same date
        player_data_merged = fpl_player_data.merge(understat_filtered, on="date")
        if player_data_merged.shape[0]:
            os.makedirs(merged_dir, exist_ok=True)
            player_data_merged.to_csv(merged_dir + indexed_fpl_name + '.csv', index_label=False)

    print('sucessfully merged 20 '+ season +' data')

In [109]:
def add_difficulty(season):
    """Attach fixture difficulty ratings to every merged player file.

    For each CSV in ``./data/joint/<season>/merged/`` the player's ``fixture``
    value is looked up against the ``id`` column of
    ``./data/20<season>/fixtures.csv`` and the fixture's
    ``team_h_difficulty`` / ``team_a_difficulty`` are added as new columns.
    Rows whose fixture is not found get NaN in both columns.

    Parameters
    ----------
    season : str
        Season label such as ``'21-22'``.

    Side effects
    ------------
    Writes the extended tables to ``./data/joint/<season>/merged_extras/``
    under the same file names and prints start/finish messages.
    """
    print('====> Starting to add difficulty features to 20'+season)
    merged = './data/joint/' + season + '/merged/'
    new_col_dir = './data/joint/' + season + '/merged_extras/'
    difficulty_cols = ['team_h_difficulty', 'team_a_difficulty']

    # The default makes a missing folder behave like an empty one
    player_names = next(os.walk(merged), (None, None, []))[2]
    fixtures = pd.read_csv('./data/20' + season + '/fixtures.csv')

    # One lookup table keyed by fixture id. The first row wins if an id is
    # repeated, matching a "take the first match" lookup.
    difficulty_by_fixture = (
        fixtures.drop_duplicates(subset='id')
        .set_index('id')[difficulty_cols]
    )

    for name in player_names:
        player = pd.read_csv(merged + name)

        # Series.map leaves NaN where the fixture id is unknown
        for col in difficulty_cols:
            player[col] = player['fixture'].map(difficulty_by_fixture[col])

        os.makedirs(new_col_dir, exist_ok=True)
        player.to_csv(new_col_dir + name, index=False)

    print('****> successfully added difficulty features to 20'+season)

In [110]:
# Run the FPL/Understat merge for each of the three seasons
for season_label in ("21-22", "22-23", "23-24"):
    merge_fpl_understat_data(
        "./data/20" + season_label + "/players/",
        "./data/20" + season_label + "/understat/",
        season_label,
    )

successfully cleaned 2021-22 fpl data
successfully cleaned understat 2021-22 data
2021-22 fpl and understat data now in `joint` folder
sucessfully merged 20 21-22 data
successfully cleaned 2022-23 fpl data
successfully cleaned understat 2022-23 data
2022-23 fpl and understat data now in `joint` folder
sucessfully merged 20 22-23 data
successfully cleaned 2023-24 fpl data
successfully cleaned understat 2023-24 data
2023-24 fpl and understat data now in `joint` folder
sucessfully merged 20 23-24 data


### Adding fixture difficulty


In [111]:
# Add the fixture difficulty columns for each of the three seasons
for season_label in ('21-22', '22-23', '23-24'):
    add_difficulty(season_label)

====> Starting to add difficulty features to 2021-22
****> successfully added difficulty features to 2021-22
====> Starting to add difficulty features to 2022-23
****> successfully added difficulty features to 2022-23
====> Starting to add difficulty features to 2023-24
****> successfully added difficulty features to 2023-24


In [68]:
# Preview one player's final table (difficulty columns plus xP).
# NOTE(review): this reads from `merged_extras_xP`, which is written by add_xP()
# further down the notebook, so on a fresh Restart & Run All this cell fails
# unless those files already exist on disk.
pd.read_csv('./data/joint/21-22/merged_extras_xP/Aaron Connolly.csv')

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,xA,assists_y,key_passes,npg,npxG,xGChain,xGBuildup,team_h_difficulty,team_a_difficulty,xP
0,0,0,-2,0,1.1,72,12,0,0,0.5,...,0.0,0,0,0,0.562996,0.591031,0.028035,2,2,1.0
1,0,0,2,0,0.3,72,54,0,0,0.1,...,0.0,0,0,0,0.0,0.0,0.0,2,2,0.3
2,0,0,7,0,10.4,72,146,0,0,2.3,...,0.380411,0,1,0,0.036493,0.416904,0.0,2,3,0.2
3,0,0,5,0,1.3,72,163,1,0,0.2,...,0.0,0,0,0,0.0,0.0,0.0,3,2,0.6


### Add Expected points


In [45]:
# Inspect the `position` column of one merged_extras file. The recorded output
# shows codes such as DL / DC. add_xP() below drops this column and takes
# `position` from the season's gws/merged_gw.csv instead.
pd.read_csv('./data/joint/21-22/merged_extras/Aaron Cresswell.csv')['position']

0     DL
1     DL
2     DL
3     DL
4     DL
5     DL
6     DL
7     DL
8     DL
9     DL
10    DL
11    DL
12    DL
13    DL
14    DL
15    DL
16    DL
17    DL
18    DL
19    DL
20    DL
21    DC
22    DL
23    DL
24    DL
25    DC
26    DC
27    DL
28    DL
Name: position, dtype: object

In [58]:
def add_xP(season):
    """Add the ``xP`` and ``position`` columns from merged_gw.csv to each player file.

    Every CSV in ``./data/joint/<season>/merged_extras/`` has its own
    ``position`` column dropped (if present) and is then left-joined with the
    ``xP`` and ``position`` columns of ``./data/20<season>/gws/merged_gw.csv``
    on ``element`` and ``fixture``.

    Parameters
    ----------
    season : str
        Season label such as ``'21-22'``.

    Side effects
    ------------
    Writes the results to ``./data/joint/<season>/merged_extras_xP/`` under
    the same file names and prints start/finish messages.
    """
    print('================> starting season 20'+season)
    extras_dir = './data/joint/' + season + '/merged_extras/'
    new_col_dir = './data/joint/' + season + '/merged_extras_xP/'
    join_keys = ['element', 'fixture']

    # The default makes a missing folder behave like an empty one
    players_paths = next(os.walk(extras_dir), (None, None, []))[2]

    if players_paths:
        # Read the gameweek table once rather than once per player file
        merged = pd.read_csv('./data/20' + season + '/gws/merged_gw.csv')
        # One row per (element, fixture) so the left join cannot multiply
        # player rows if the gameweek table holds repeated keys.
        gw_columns = merged[join_keys + ['xP', 'position']].drop_duplicates(subset=join_keys)
        os.makedirs(new_col_dir, exist_ok=True)

    for path in players_paths:
        player = pd.read_csv(extras_dir + path)

        # The player file's own `position` is replaced by the one from merged_gw
        player = player.drop(columns=['position'], errors='ignore')
        merged_player = pd.merge(player, gw_columns, on=join_keys, how='left')

        merged_player.to_csv(new_col_dir + path, index=False)

    print('<<<<================ finished season 20'+season)



In [59]:
# Add the xP column for each of the three seasons
for season_label in ('21-22', '22-23', '23-24'):
    add_xP(season_label)




In [12]:
# Candidate feature columns (presumably the model inputs; compare the feature
# notes further down).
# NOTE(review): 'expected_assists (xA)' and 'expected_goals (xG)' do not match
# the 'expected_assists' / 'expected_goals' headers shown in the recorded table
# below. Confirm the names before selecting with this list.
columns = [
    'bonus',
    'clean_sheets',
    'expected_assists (xA)',
    'expected_goal_involvements',
    'expected_goals (xG)',
    'expected_goals_conceded',
    'ict_index',
    'minutes',
    'red_cards',
    'total_points',
    'was_home',
    'yellow_cards',
    'shots',
    'npxG',
    'xGChain',
    'xGBuildup',
    'team_h_difficulty',
    'team_a_difficulty',
    'opponent_team',
    'threat',
    'value',
    'key_passes',
]

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,roster_id,xA,assists_y,key_passes,npg,npxG,xGChain,xGBuildup,team_h_difficulty,team_a_difficulty
0,0,0,15,1,1.5,104,0.01,0.01,0.0,0.36,...,605102,0.0,0,0,0,0.0,0.09799,0.09799,2,3
1,0,0,13,0,3.8,104,0.01,0.01,0.0,0.8,...,606892,0.0,0,0,0,0.0,0.161078,0.161078,2,2
2,0,0,19,0,1.2,104,0.02,0.06,0.04,2.02,...,608645,0.0,0,0,0,0.01984,0.340657,0.320817,2,2
3,0,0,6,0,1.3,104,0.0,0.04,0.04,1.43,...,610660,0.0,0,0,0,0.077296,0.111838,0.034542,2,4
4,0,0,6,0,2.3,104,0.0,0.05,0.05,1.81,...,612334,0.0,0,0,0,0.039281,0.039281,0.021787,2,2
5,0,0,0,0,4.7,104,0.01,0.01,0.0,0.22,...,615262,0.0,0,0,0,0.0,0.0,0.0,2,2
6,0,0,0,0,0.8,104,0.0,0.04,0.04,1.43,...,616482,0.0,0,0,0,0.045288,0.0,0.0,2,4
7,0,0,21,1,2.0,104,0.0,0.0,0.0,1.11,...,619575,0.0,0,0,0,0.0,0.363263,0.363263,2,3


```
    Total Points – Bonus Points (tp-bp), Minutes, Yellow Cards, Red Cards, Expected Goals (xG), Expected Assists (xA), Non-penalty Expected Goals (npxG),
    Shots, Expected Goals Against, Expected_goal_involvements,  clean_sheets, ict_index, opponent_team, Expected Goals Buildup (xG Buildup), threat, value,
    Key Passes,


    Games,  Expected Goals Chain (xG Chain),  Non-penalty Expected Goal Difference (npxGD), Non-penalty Expected Goals Against (npxGA), Expected Points (xPts)
```


<!-- ### Merge data -->


In [60]:
def merge_files(season):
    """Concatenate all per-player xP files of a season into one CSV.

    Reads every file in ``./data/joint/<season>/merged_extras_xP/`` (in sorted
    file-name order, so the output is reproducible) and stacks them row-wise.

    Parameters
    ----------
    season : str
        Season label such as ``'21-22'``.

    Raises
    ------
    ValueError
        If the source folder is missing or holds no files.

    Side effects
    ------------
    Prints the output folder and writes
    ``./data/joint/<season>/merged_player_data.csv`` without the index.
    """
    source_dir = './data/joint/' + season + '/merged_extras_xP/'

    # The default makes a missing folder behave like an empty one. Sorting
    # removes the dependence on filesystem listing order.
    paths = sorted(next(os.walk(source_dir), (None, None, []))[2])
    if not paths:
        raise ValueError('no player files found in ' + source_dir)

    files_list = [pd.read_csv(source_dir + path) for path in paths]
    merged_files = pd.concat(files_list)

    # Save the combined DataFrame next to the per-stage folders
    new_col_dir = './data/joint/' + season + '/'
    print(new_col_dir)

    merged_files.to_csv(new_col_dir + 'merged_player_data.csv', index=False)


In [61]:
# Build the combined per-season table for each of the three seasons
for season_label in ('21-22', '22-23', '23-24'):
    merge_files(season_label)

./data/joint/21-22/
./data/joint/22-23/
./data/joint/23-24/
