In [17]:
import pandas as pd
import os

from sklearn.preprocessing import OneHotEncoder

def preprocess_values(df_this):
    """
    This function handles NaN values and converts object columns to numeric types.
    """

    # handle NaN values
    df_this = df_this.fillna(0)  # Replace NaNs with 0s
    
    # handle categorical values
    categorical_cols = list(df_this.select_dtypes(include=['object']).columns)

    # one-hot encode categorical columns
    myOneHotEncoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    df_encoded = pd.DataFrame(myOneHotEncoder.fit_transform(df_this[categorical_cols]))

    df_encoded.columns = myOneHotEncoder.get_feature_names_out(categorical_cols)
    df_this = df_this.drop(columns=categorical_cols, axis=1)
    df_this = pd.concat([df_encoded, df_this], axis=1)

    return df_this

def get_target_column(df_this, gw):
    """
    Returns the boolean 
    """
    target_col = f'total_points_gw{gw}'
    return df_this[target_col] > 4  # threshold for next gameweek points to be considered as a good performance

def get_df_for_gw(df_this, gw):
    """
    Returns a DataFrame with all data for the gameweek just before the one that is targeted.
    """
    if gw == 1:
        raise ValueError("Gameweek 1 does not have a previous gameweek to reference.")
    if gw > 38:
        raise ValueError("Gameweek must be between 1 and 38.")
    
    # get the columns for the gameweek just before the one that is targeted
    static_cols_to_keep = [col for col in df_this.columns if "_gw" not in col]
    gw_suffixes = [f"_gw{idx}" for idx in range(1, gw)]
    gw_cols_to_keep = [col for col in df_this.columns for suffix in gw_suffixes if col.endswith(suffix) ]

    # Filter the DataFrame to keep only the desired columns
    cols_to_keep = static_cols_to_keep + gw_cols_to_keep
    df_filtered = df_this[cols_to_keep]

    # get target column
    df_target_col = get_target_column(df_this, gw)

    # merge df_filtered with the target column
    df_filtered = df_filtered.merge(df_target_col.rename('target'), left_index=True, right_index=True)

    return df_filtered

def get_df():
    """
    Returns a DataFrame with all data for the 2022–23 Fantasy Premier League season.
    The DataFrame contains aggregated statistics for each player across all gameweeks.
    """

    # read the data
    csv_path = '../../../data/raw/vaastav_2022_23.csv'
    df = pd.read_csv(csv_path)

    # extract team and position information
    df_common = df.groupby('element').agg({'team': 'first',
                                             'position': 'first'
                                             }).reset_index()

    # select features
    cols_to_keep = ['minutes',
                    'goals_scored', 
                    'assists', 
                    'expected_goals', 
                    'expected_assists', 
                    'clean_sheets',
                    'ict_index',
                    'bps', 
                    'bonus', 
                    'total_points',
                    ]

    # pivot table to get unified dataframe
    df_multigw = df.pivot_table(index='element', 
                    columns = 'GW', 
                    values = cols_to_keep, 
                    aggfunc='sum').reset_index()

    # flatten columns with gameweek suffix
    df_multigw.columns = [f"{col}_gw{int(gw)}" if isinstance(gw, (int,float)) else col for col,gw in df_multigw.columns]

    # merge common information with gamew data
    df_multigw = df_common.merge(df_multigw, on='element', how='left')

    # set element as the index
    df_multigw.set_index('element', inplace=True)

    return df_multigw

In [18]:
df = get_df()
print(df.head())


            team position  assists_gw1  assists_gw2  assists_gw3  assists_gw4  \
element                                                                         
1        Arsenal      DEF          0.0          0.0          0.0          0.0   
2         Fulham       GK          0.0          0.0          0.0          0.0   
3        Arsenal      MID          0.0          1.0          1.0          0.0   
4        Arsenal      MID          0.0          0.0          0.0          0.0   
5        Arsenal      DEF          0.0          0.0          0.0          0.0   

         assists_gw5  assists_gw6  assists_gw8  assists_gw9  ...  \
element                                                      ...   
1                0.0          0.0          0.0          0.0  ...   
2                0.0          0.0          0.0          0.0  ...   
3                0.0          0.0          1.0          0.0  ...   
4                0.0          0.0          0.0          0.0  ...   
5                0.0    

In [19]:
# extract just data to predict gw X
gw = 5
df_gw = get_df_for_gw(df, gw)

In [20]:
# value preprocess
df_gw = preprocess_values(df_gw)

In [21]:
df_gw.iloc[0,:]

team_Arsenal        1.0
team_Aston Villa    0.0
team_Bournemouth    0.0
team_Brentford      0.0
team_Brighton       0.0
                   ... 
total_points_gw1    NaN
total_points_gw2    NaN
total_points_gw3    NaN
total_points_gw4    NaN
target              NaN
Name: 0, Length: 65, dtype: object

In [7]:
# extract X and y
y = df_gw.target
X = df_gw.drop(columns=['target'], axis=1)

In [11]:
y.isnull().value_counts()

target
False    778
True       1
Name: count, dtype: int64

In [12]:
df_gw.iloc[0,:]

team_Arsenal        1.0
team_Aston Villa    0.0
team_Bournemouth    0.0
team_Brentford      0.0
team_Brighton       0.0
                   ... 
total_points_gw1    NaN
total_points_gw2    NaN
total_points_gw3    NaN
total_points_gw4    NaN
target              NaN
Name: 0, Length: 65, dtype: object