In [98]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import os

import time

import scipy.stats as stats

import pulp

import unidecode

Loading the data from the files

In [99]:
folderpath = 'data/'

players_1617_df = pd.read_csv(folderpath+'2016-17/players_raw.csv')
players_1718_df = pd.read_csv(folderpath+'2017-18/players_raw.csv')
players_1819_df = pd.read_csv(folderpath+'2018-19/players_raw.csv')
players_1920_df = pd.read_csv(folderpath+'2019-20/players_raw.csv')
players_2021_df = pd.read_csv(folderpath+'2020-21/players_raw.csv')
players_2122_df = pd.read_csv(folderpath+'2021-22/players_raw.csv')
players_2223_df = pd.read_csv(folderpath+'2022-23/players_raw.csv')
players_2324_df = pd.read_csv(folderpath+'2023-24/players_raw.csv')

gws_1617_df = pd.read_csv(folderpath+'2016-17/gws/merged_gw.csv', encoding='latin')
gws_1718_df = pd.read_csv(folderpath+'2017-18/gws/merged_gw.csv', encoding='latin')
gws_1819_df = pd.read_csv(folderpath+'2018-19/gws/merged_gw.csv', encoding='latin')
gws_1920_df = pd.read_csv(folderpath+'2019-20/gws/merged_gw.csv', encoding='latin')
gws_2021_df = pd.read_csv(folderpath+'2020-21/gws/merged_gw.csv', encoding='latin')
gws_2122_df = pd.read_csv(folderpath+'2021-22/gws/merged_gw.csv', encoding='latin')
gws_2223_df = pd.read_csv(folderpath+'2022-23/gws/merged_gw.csv', encoding='latin')
gws_2324_df = pd.read_csv(folderpath+'2023-24/gws/merged_gw.csv', encoding='latin')

team_codes_df = pd.read_csv(folderpath+'teams.csv')
team_codes_df.columns.values[2:] = team_codes_df.columns[2:].str.replace('team_', '')

## Clean and process the dataframes

We want to add:
- Player Position
- Full Name (names are inconsistent across seasons and between df's)

We will also remove `Danny Ward` as there were two in the 18/19 season, both has 0 points so no harm in removing them.

In [100]:
# remove Danny Wards from 18/19 season
players_1819_df = players_1819_df[((players_1819_df.first_name == "Danny") & (players_1819_df.second_name=="Ward"))==False]
gws_1819_df = gws_1819_df[gws_1819_df.name.str.contains("Danny_Ward")==False]

Here we are adding the seasons onto the df

In [101]:
player_df_list = [players_1617_df, players_1718_df, players_1819_df, players_1920_df, players_2021_df, players_2122_df, players_2223_df, players_2324_df]
gw_df_list = [gws_1617_df, gws_1718_df, gws_1819_df, gws_1920_df, gws_2021_df, gws_2122_df, gws_2223_df, gws_2324_df]

# append season and season index to dfs

seasons = ['1617', '1718', '1819', '1920', '2021', '2122', '2223', '2324']
season_nums = list(range(len(seasons)))

for i in range(len(seasons)):
    player_df_list[i]['season'] = seasons[i]
    gw_df_list[i]['season'] = seasons[i]
    
    player_df_list[i]['season_num'] = season_nums[i]
    gw_df_list[i]['season_num'] = season_nums[i]

# combine dataframes from all seasons into one

players_df = pd.concat(player_df_list)
gws_df = pd.concat(gw_df_list)
players_df.reset_index(inplace=True)
gws_df.reset_index(inplace=True)

Then we apply a function to sort out the full names of the players

In [102]:
def get_full_name_playerdf(first_name, second_name):
    '''Creates full name, cleans up accents and makes processing easier'''

    full_name = first_name +'_' + second_name
    full_name = full_name.replace(" ", "_")
    full_name = full_name.replace("-", "_")
    full_name = unidecode.unidecode(full_name)
    
    return full_name

# Translate player positions into string for easier readability
positions_dict = {
    1: 'Keeper',
    2: 'Defender',
    3: 'Midfielder',
    4: 'Forward'
    
}

# Now apply full name and position changes. We also add the starting cost by a simple subtraction of two existing data columns.
players_df['full_name'] = players_df.apply(lambda x: get_full_name_playerdf(x.first_name, x.second_name), axis=1).str.lower()
players_df['position'] = players_df.element_type.map(positions_dict)
players_df['starting_cost'] = players_df.now_cost - players_df.cost_change_start_fall
players_df['cost_bin'] = players_df.now_cost.apply(lambda x: np.floor(x/10))

gws_df['full_name'] = gws_df.name.str.replace('_\d+','')
gws_df['full_name'] = gws_df['full_name'].str.replace(" ", "_").str.replace("-", "_").str.replace('_\d+','')
gws_df['full_name'] = gws_df['full_name'].apply(lambda x: unidecode.unidecode(x))
gws_df['full_name'] = gws_df['full_name'].str.lower()

This function gives us a df that contains all the GW info, plus info on the players team names and opponents.

In [103]:
def clean_gw_df(player_df, gw_df, team_codes_df):
    '''
    Cleans and merges gameweek data with player information and team codes to return a DataFrame 
    containing player positions, player's team names, and opponent's team names.

    Returns:
    pd.DataFrame: A DataFrame with the original gameweek data enriched with player positions, 
                player's team names, and opponent's team names.
    '''

    pdf = player_df.copy()[['full_name', 'season', 'position', 'player_team_name']]
    gdf = gw_df.copy()
    gdf = gdf.merge(pdf, on=['full_name', 'season'], how='left')
    
    dfs = []
    for s, group in gdf.groupby('season'):

        temp_code_df = team_codes_df[['team', s]]
        temp_code_df = temp_code_df.dropna()
        
        group = group[['opponent_team']]
        group['opponent_team_name'] = group.opponent_team.map(temp_code_df.set_index(s).team)
        dfs.append(group[['opponent_team_name']])
        
    out_df = pd.concat(dfs, axis=0)
    out_df = pd.concat([gdf, out_df], axis=1)
    return out_df

In [104]:
gws_df.opponent_team = gws_df.opponent_team.astype(float)
players_df['player_team_name'] = players_df.team_code.map(team_codes_df.set_index('team_code').team)
gws_df = clean_gw_df(players_df, gws_df, team_codes_df)

gws_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['opponent_team_name'] = group.opponent_team.map(temp_code_df.set_index(s).team)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['opponent_team_name'] = group.opponent_team.map(temp_code_df.set_index(s).team)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['opponent_team_name'] = gro

Unnamed: 0,index,name,assists,attempted_passes,big_chances_created,big_chances_missed,bonus,bps,clean_sheets,clearances_blocks_interceptions,...,xP,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,starts,full_name,position_y,player_team_name,opponent_team_name
0,0,Aaron_Cresswell,0,0.0,0.0,0.0,0,0,0,0.0,...,,,,,,,aaron_cresswell,Defender,West Ham United,Chelsea
1,1,Aaron_Lennon,0,3.0,0.0,0.0,0,6,0,1.0,...,,,,,,,aaron_lennon,Midfielder,Everton,Tottenham Hotspur
2,2,Aaron_Ramsey,0,26.0,0.0,0.0,0,5,0,2.0,...,,,,,,,aaron_ramsey,Midfielder,Arsenal,Liverpool
3,3,Abdoulaye_Doucouré,0,0.0,0.0,0.0,0,0,0,0.0,...,,,,,,,abdoulaye_doucoure,Midfielder,Watford,Southampton
4,4,Abdul Rahman_Baba,0,0.0,0.0,0.0,0,0,0,0.0,...,,,,,,,abdul_rahman_baba,Defender,Chelsea,West Ham United
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196664,29720,Kyle Walker,0,,,,0,15,0,,...,7.5,0.06,0.06,0.0,0.36,1.0,kyle_walker,Defender,Manchester City,West Ham United
196665,29721,Jacob Brown,0,,,,0,0,0,,...,0.0,0.00,0.00,0.0,0.00,0.0,jacob_brown,Forward,,Fulham
196666,29722,Vicente Guaita,0,,,,0,0,0,,...,0.0,0.00,0.00,0.0,0.00,0.0,vicente_guaita,Keeper,Crystal Palace,Aston Villa
196667,29723,Braian Ojeda RodrÃ­guez,0,,,,0,0,0,,...,0.0,0.00,0.00,0.0,0.00,0.0,braian_ojeda_rodraguez,,,Burnley
