In [19]:
import pandas as pd
import os
from thefuzz import fuzz, process
import unicodedata

### Util functions


In [20]:
def save_fpl_players(fpl_subfolders, season):
    """Copy every FPL player's ``gw.csv`` into the season's joint ``fpl`` folder.

    Parameters
    ----------
    fpl_subfolders : iterable of str
        Paths of per-player folders, each containing a ``gw.csv`` file. The
        folder's own name is split on ``_`` and its last token is dropped to
        build the output file name, e.g. ``Harry_Kane_357`` -> ``Harry Kane.csv``
        (the dropped token is presumably the player id -- TODO confirm).
    season : str or int
        Season label used in the output path ``./data/joint/<season>/fpl/``.
    """
    player_dir = './data/joint/' + str(season) + '/fpl/'
    # The destination is the same for every player, so create it once.
    os.makedirs(player_dir, exist_ok=True)

    for folder in fpl_subfolders:
        # Use the folder's own name rather than a fixed position in the path,
        # so the function works no matter how deep the data directory sits.
        folder_name = os.path.basename(os.path.normpath(folder))
        clean_player_name = " ".join(folder_name.split('_')[:-1])

        player_df = pd.read_csv(os.path.join(folder, 'gw.csv'))
        # index_label=False: the row index is still written, but without a
        # header cell for it (see the pandas to_csv documentation).
        player_df.to_csv(player_dir + clean_player_name + '.csv', index_label=False)

    print('successfully cleaned 20' + str(season) + ' fpl data')

# Understat Files
def save_under_players(understat_files, season):
    """Copy every Understat player file into the season's joint ``understat`` folder.

    Parameters
    ----------
    understat_files : iterable of str
        Paths of per-player CSV files. The file name is split on ``_`` and its
        last token (which carries the extension) is dropped to build the output
        name, e.g. ``Harry_Kane_647.csv`` -> ``Harry Kane.csv``.
    season : str or int
        Season label used in the output path ``./data/joint/<season>/understat/``.
    """
    player_dir = './data/joint/' + str(season) + '/understat/'
    # The destination is the same for every player, so create it once.
    os.makedirs(player_dir, exist_ok=True)

    for file_ in understat_files:
        # Use the file's own name rather than a fixed position in the path,
        # so the function works no matter how deep the data directory sits.
        file_name = os.path.basename(os.path.normpath(file_))
        clean_player_name = " ".join(file_name.split('_')[:-1])

        player_df = pd.read_csv(file_)
        # index_label=False: the row index is still written, but without a
        # header cell for it (see the pandas to_csv documentation).
        player_df.to_csv(player_dir + clean_player_name + '.csv', index_label=False)

    print('successfully cleaned understat 20' + str(season) + ' data')

def joint_players_info(fpl_player_folder_path, understat_player_folder_path, season):
    """Gather the season's FPL and Understat player data under the `joint` folder.

    Every sub-directory of ``fpl_player_folder_path`` is handed to
    ``save_fpl_players`` and every regular file of
    ``understat_player_folder_path`` to ``save_under_players``.
    """
    # Both listings are taken up front, before anything is written.
    fpl_dirs = []
    for entry in os.scandir(fpl_player_folder_path):
        if entry.is_dir():
            fpl_dirs.append(entry.path)

    understat_csvs = []
    for entry in os.scandir(understat_player_folder_path):
        if entry.is_file():
            understat_csvs.append(entry.path)

    save_fpl_players(fpl_dirs, season)
    save_under_players(understat_csvs, season)

    print('20'+ season +' fpl and understat data now in `joint` folder')

In [21]:
def merge_fpl_understat_data(fpl_player_folder_path, understat_player_folder_path, season):
    """Join each player's FPL gameweek rows with their Understat rows by date.

    Steps:
      1. ``joint_players_info`` copies the raw files into
         ``./data/joint/<season>/fpl/`` and ``.../understat/``.
      2. For every Understat file, ``./data/20<season>/id_dict.csv`` is used to
         look up the player's ``FPL_Name`` from their ``Understat_Name``.
      3. The FPL file whose name is the closest fuzzy match to that ``FPL_Name``
         is loaded, its ``kickoff_time`` column is renamed to ``date`` and cut
         down to the part before ``T``.
      4. Understat rows outside the FPL date range are dropped and the two
         frames are inner-merged on ``date``.
      5. Non-empty results are written to
         ``./data/joint/<season>/merged/<FPL_Name>.csv``.

    Parameters
    ----------
    fpl_player_folder_path, understat_player_folder_path : str
        Source folders, passed straight to ``joint_players_info``.
    season : str
        Season label used to build every path (e.g. ``'21'`` -> ``./data/2021/``).
    """
    joint_players_info(fpl_player_folder_path, understat_player_folder_path, season)

    joint_fpl_data_path = "./data/joint/" + season + "/fpl/"
    joint_understat_path = "./data/joint/" + season + "/understat/"
    merged_dir = './data/joint/' + season + '/merged/'

    # next(..., default) yields an empty file list when a folder is missing.
    understat_files = next(os.walk(joint_understat_path), (None, None, []))[2]
    fpl_files = next(os.walk(joint_fpl_data_path), (None, None, []))[2]
    player_ids = pd.read_csv('./data/20' + season + '/id_dict.csv')

    # splitext strips only the extension; splitting on the first '.' would
    # truncate any player name that itself contains a dot.
    understat_names = [os.path.splitext(file_)[0] for file_ in understat_files]
    fpl_names = [os.path.splitext(file_)[0] for file_ in fpl_files]

    for name in understat_names:
        player_index = player_ids.index[player_ids['Understat_Name'] == name].tolist()
        if not player_index:
            continue

        indexed_fpl_name = player_ids.loc[player_index[0], 'FPL_Name']
        # A missing FPL_Name is read as NaN, which is truthy, so test it explicitly.
        if pd.isna(indexed_fpl_name) or not indexed_fpl_name:
            continue

        # Use fuzzy matching to get the corresponding FPL file name.
        fuzzy_fpl_player_name = process.extractOne(
            indexed_fpl_name, fpl_names, scorer=fuzz.partial_token_sort_ratio
        )
        if fuzzy_fpl_player_name is None:
            # No candidate at all (e.g. the fpl folder is empty).
            continue

        fpl_player_data = pd.read_csv(joint_fpl_data_path + str(fuzzy_fpl_player_name[0]) + '.csv')
        understat_player_data = pd.read_csv(joint_understat_path + name + '.csv')

        # Rename 'kickoff_time' to 'date' and keep only the calendar day:
        # 2021-10-03T13:00:00Z -> 2021-10-03
        fpl_player_data = fpl_player_data.rename(columns={'kickoff_time': 'date'})
        fpl_player_data['date'] = fpl_player_data['date'].str.split('T').str[0]

        fpl_dates_min = pd.to_datetime(fpl_player_data['date'].min())
        fpl_dates_max = pd.to_datetime(fpl_player_data['date'].max())

        # Drop Understat rows outside the (inclusive) range of FPL dates.
        understat_dates = pd.to_datetime(understat_player_data['date'])
        in_range = (understat_dates >= fpl_dates_min) & (understat_dates <= fpl_dates_max)
        understat_filtered = understat_player_data[in_range]

        # Merge FPL rows with Understat rows that share the same date.
        player_data_merged = fpl_player_data.merge(understat_filtered, on="date")

        if player_data_merged.shape[0]:
            os.makedirs(merged_dir, exist_ok=True)
            player_data_merged.to_csv(merged_dir + indexed_fpl_name + '.csv', index_label=False)

    print('sucessfully merged 20 '+ season +' data')

In [22]:
def add_difficulty(season):
    """Add fixture difficulty and gameweek columns to every merged player file.

    Reads each file in ``./data/joint/<season>/merged/``, looks up the row's
    ``fixture`` value in the ``id`` column of ``./data/20<season>/fixtures.csv``
    and writes the player frame, extended with ``team_h_difficulty``,
    ``team_a_difficulty`` and ``event``, to
    ``./data/joint/<season>/merged_extras/``. Rows whose fixture is not found
    get empty values in the three columns.

    Parameters
    ----------
    season : str
        Season label used to build every path.
    """
    print('====> Starting to add difficulty features to 20'+season)
    merged = './data/joint/' + season + '/merged/'
    new_col_dir = './data/joint/' + season + '/merged_extras/'

    # The fallback tuple belongs to next(), not to os.walk(): with it in the
    # right place a missing folder simply gives an empty file list.
    player_names = next(os.walk(merged), (None, None, []))[2]
    fixtures = pd.read_csv('./data/20' + season + '/fixtures.csv')

    def get_fixture_info(row):
        """Return (home difficulty, away difficulty, event) for the row's fixture."""
        fixture = fixtures[fixtures['id'] == row['fixture']]
        if fixture.empty:
            # No matching fixture: leave the three columns empty.
            return pd.Series([None, None, None])

        fixture = fixture.iloc[0]  # first matching fixture row
        return pd.Series([
            fixture['team_h_difficulty'],
            fixture['team_a_difficulty'],
            fixture['event'],
        ])

    for name in player_names:
        player = pd.read_csv(merged + name)
        player[['team_h_difficulty', 'team_a_difficulty', 'event']] = player.apply(get_fixture_info, axis=1)

        # Save the updated DataFrame with the new columns
        os.makedirs(new_col_dir, exist_ok=True)
        player.to_csv(new_col_dir + name, index=False)

    print('****> successfully added difficulty features to 20'+season)

In [23]:
def add_xP(season):
    """Attach ``xP`` and ``position`` from the season's merged gameweek file.

    For every player file in ``./data/joint/<season>/merged_extras/`` the
    existing ``position`` column (if any) is dropped and the frame is
    left-merged, on ``element`` and ``fixture``, with the ``xP`` and
    ``position`` columns of ``./data/20<season>/gws/merged_gw.csv``. Results go
    to ``./data/joint/<season>/merged_extras_xP/``.

    Parameters
    ----------
    season : str
        Season label used to build every path.
    """
    print('================> starting season 20'+season)
    source_dir = './data/joint/'+ season +'/merged_extras/'
    new_col_dir = './data/joint/'+ season +'/merged_extras_xP/'

    # os.walk only takes the path here; the fallback goes to next() so that a
    # missing folder gives an empty list instead of an error.
    players_paths = next(os.walk(source_dir), (None, None, []))[2]

    gw_columns = None
    if players_paths:
        # The gameweek file is the same for every player: read it once.
        merged = pd.read_csv('./data/20'+ season +'/gws/merged_gw.csv')
        gw_columns = merged[['element', 'fixture', 'xP', 'position']]

    for path in players_paths:
        player = pd.read_csv(source_dir + path)

        # `position` is re-added by the merge below; tolerate files without it.
        player = player.drop(columns=['position'], errors='ignore')
        merged_player = pd.merge(player, gw_columns, on=['element', 'fixture'], how='left')

        # Save the updated DataFrame with the new columns
        os.makedirs(new_col_dir, exist_ok=True)
        merged_player.to_csv(new_col_dir + path, index=False)

    print('<<<<================ finished season 20'+season)

In [24]:
# Columns whose recent history is summed by the rolling-window functions below.
_ROLLING_FEATURES = [
    'clean_sheets', 'expected_assists', 'expected_goal_involvements', 'expected_goals',
    'expected_goals_conceded', 'goals_conceded', 'goals_scored', 'ict_index',
    'influence', 'creativity', 'threat', 'minutes', 'own_goals', 'penalties_missed',
    'penalties_saved', 'red_cards', 'yellow_cards', 'saves', 'starts',
    'team_a_score', 'team_h_score', 'total_points', 'goals', 'shots', 'xG', 'xA',
    'assists_y', 'key_passes', 'npg', 'npxG', 'xGChain', 'xGBuildup', 'xP', 'selected',
]


def _rolling_window_sums(player, window, features=_ROLLING_FEATURES):
    """Sum each feature over the `window` gameweeks preceding every row.

    Rows are processed in file order and located in time through their
    ``event`` value. Gameweeks with no row count as zeros, and the row's own
    gameweek is never part of its own sum. When two consecutive rows share an
    ``event``, the later row replaces the earlier one in the history.

    Returns a float DataFrame, indexed like `player`, with one
    ``<feature>_<window>`` column per feature. Features missing from `player`
    contribute 0.
    """
    width = len(features)

    def blank():
        return [0] * width

    # history[-1] is the gameweek of the row being processed; history[:-1]
    # holds the `window` gameweeks before it, oldest first.
    history = [blank() for _ in range(window + 1)]
    last_event = 1
    totals = []

    for _, row in player.iterrows():
        current = [row.get(col, 0) for col in features]
        gap = row['event'] - last_event

        if gap == 0:
            history[-1] = current
        else:
            # A whole-number gap of 1..window slides the history by that many
            # gameweeks, so the last played gameweek stays in view even when
            # the gap equals the window size. Anything else (a longer gap, a
            # step backwards, a missing event) starts from an empty history.
            steps = int(gap) if gap in range(1, window + 1) else window + 1
            history = history[steps:] + [blank() for _ in range(steps - 1)] + [current]
            last_event = row['event']

        totals.append([round(sum(values), 2) for values in zip(*history[:window])])

    return pd.DataFrame(
        totals,
        columns=[f"{col}_{window}" for col in features],
        index=player.index,
        dtype=float,
    )


def _add_rolling_columns(season, source_stage, target_stage, window):
    """Read every player file of `source_stage`, add the rolling sums, save to `target_stage`."""
    source_dir = './data/joint/' + season + '/' + source_stage + '/'
    new_col_dir = f'./data/joint/{season}/{target_stage}/'

    # The fallback belongs to next(): a missing folder gives an empty list.
    players_paths = next(os.walk(source_dir), (None, None, []))[2]
    for path in players_paths:
        player = pd.read_csv(source_dir + path, sep=',', skipinitialspace=True)

        sums = _rolling_window_sums(player, window)
        player[list(sums.columns)] = sums

        # Save the updated DataFrame with the new columns
        os.makedirs(new_col_dir, exist_ok=True)
        player.to_csv(new_col_dir + path, index=False)


def add_rolling_avgs_3(season):
    """Add ``<feature>_3`` columns: per-feature sums over the previous 3 gameweeks.

    Despite the name, the values are sums, not averages. Reads
    ``./data/joint/<season>/merged_extras_xP/`` and writes
    ``./data/joint/<season>/merged_extras_rolled_3/``.
    """
    print('================> starting season 20'+season)
    _add_rolling_columns(season, 'merged_extras_xP', 'merged_extras_rolled_3', window=3)
    print('<<<<================ finished season 20'+season)


def add_rolling_avgs_5(season):
    """Add ``<feature>_5`` columns: per-feature sums over the previous 5 gameweeks.

    Despite the name, the values are sums, not averages. Reads
    ``./data/joint/<season>/merged_extras_rolled_3/`` and writes
    ``./data/joint/<season>/merged_extras_rolled_5/``.
    """
    print('================> starting season 20'+season)
    _add_rolling_columns(season, 'merged_extras_rolled_3', 'merged_extras_rolled_5', window=5)


def add_rolling_avgs(season):
    """Run the 3-gameweek pass and then the 5-gameweek pass for `season`."""
    add_rolling_avgs_3(season)
    add_rolling_avgs_5(season)

    print('<<<<================ Done rolling season 20'+season)

In [25]:
def odds(sns, nxt_gw=0):
    """Add odds-derived columns and ``pts_bps`` to every rolled player file.

    For each file in ``./data/joint/<sns>/merged_extras_rolled_5/`` every row is
    matched, on ``h_team`` and ``a_team``, against ``./data/odds/E0 <sns>.csv``
    (whose ``HomeTeam``/``AwayTeam`` columns are renamed to those names). A
    matched row gets:

    * ``whh``, ``whd``, ``wha``: ``1 / odds`` of the ``WHH``, ``WHD`` and ``WHA``
      columns, rounded to two decimals (not normalised to sum to 1);
    * ``pts_bps``: ``total_points - bonus``.

    Unmatched rows get empty values in all four columns. Results are written to
    ``./data/joint/<sns>/merged_extras_odds/``.

    Parameters
    ----------
    sns : str
        Season label used to build every path.
    nxt_gw : int, optional
        Unused; kept so existing calls keep working.
    """
    print('================> starting sns 20'+sns)

    # Load the odds once to avoid redundant file reads
    data = pd.read_csv('./data/odds/E0 '+ sns +'.csv')
    data = data.rename(columns={'HomeTeam': 'h_team', 'AwayTeam': 'a_team'})

    source_dir = './data/joint/'+ sns +'/merged_extras_rolled_5/'
    new_col_dir = './data/joint/'+ sns +'/merged_extras_odds/'

    # The fallback belongs to next(): a missing folder gives an empty list.
    players_paths = next(os.walk(source_dir), (None, None, []))[2]

    def add_odds(row, data):
        """Return [pts_bps, whh, whd, wha] for one player row."""
        match = data[(data['h_team'] == row['h_team']) & (data['a_team'] == row['a_team'])]
        if match.empty:
            # Report just the fixture that has no odds, not the whole odds table.
            print('no odds found for', row['h_team'], 'vs', row['a_team'])
            return pd.Series([None, None, None, None])

        odds_ = match.iloc[0]  # first matching odds row
        WHH = round(1/odds_['WHH'], 2)
        WHD = round(1/odds_['WHD'], 2)
        WHA = round(1/odds_['WHA'], 2)
        pts_bps = row['total_points'] - row['bonus']
        return pd.Series([pts_bps, WHH, WHD, WHA])

    for path in players_paths:
        rolled = pd.read_csv(source_dir + path, sep=',', skipinitialspace=True)

        rolled[['pts_bps', 'whh', 'whd', 'wha']] = rolled.apply(add_odds, axis=1, data=data)

        # Save the updated DataFrame with the new columns
        os.makedirs(new_col_dir, exist_ok=True)
        rolled.to_csv(new_col_dir + path, index=False)

In [26]:
def merge_files(season):
    """Concatenate every player file of the season into one CSV.

    Reads all files in ``./data/joint/<season>/merged_extras_odds/`` and writes
    their row-wise concatenation to
    ``./data/joint/<season>/merged_player_data.csv``.

    Parameters
    ----------
    season : str
        Season label used to build every path.

    Raises
    ------
    ValueError
        If the source folder is missing or holds no files.
    """
    source_dir = './data/joint/'+ season +'/merged_extras_odds/'

    # The fallback belongs to next(): a missing folder gives an empty list.
    paths = next(os.walk(source_dir), (None, None, []))[2]
    if not paths:
        raise ValueError('no player files found in ' + source_dir)

    files_list = [pd.read_csv(source_dir + path) for path in paths]
    merged_files = pd.concat(files_list)

    # Save the new DataFrame
    new_col_dir = './data/joint/'+ season +'/'

    merged_files.to_csv(new_col_dir + 'merged_player_data.csv', index=False)