In [6]:
import pandas as pd
from datetime import datetime

# URL to the player statistics file
players_stats_url = 'https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/main/data/2025-2026/playerstats.csv'

players_url = 'https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/main/data/2025-2026/players.csv'


# Load the data
players_stats_df = pd.read_csv(player_data_url)
players_df = pd.read_csv(player_url)

print("Player stats loaded successfully!")
players_stats_df.head()

print("Player information loaded!")
players_df.head()

Player stats loaded successfully!
Player information loaded!


Unnamed: 0,player_code,player_id,first_name,second_name,web_name,team_code,position
0,500040,662,Cristhian,Mosquera,Mosquera,3,Defender
1,224117,666,Viktor,Gyökeres,Gyökeres,3,Forward
2,221466,72,Marcos,Senesi Barón,Senesi,91,Defender
3,54469,73,Adam,Smith,Smith,91,Defender
4,494521,74,Adrien,Truffert,Truffert,91,Defender


In [7]:
# List of essential columns to select
core_features = [
    'id', 'web_name', 'now_cost', 'selected_by_percent', 'form',
    'minutes', 'total_points', 'bonus', 'bps', 'expected_goals_per_90', 
    'expected_assists_per_90', 'expected_goal_involvements_per_90', 
    'expected_goals_conceded_per_90', 'starts_per_90', 'clean_sheets_per_90', 
    'saves_per_90', 'corners_and_indirect_freekicks_order', 
    'direct_freekicks_order', 'penalties_order'
]

# Create the new DataFrame
selected_df = players_stats_df[core_features].copy()

# Fill missing values for set piece takers
for col in ['corners_and_indirect_freekicks_order', 'direct_freekicks_order', 'penalties_order']:
    selected_df[col] = selected_df[col].fillna(0)

print("Core features selected.")
selected_df.head()

Core features selected.


Unnamed: 0,id,web_name,now_cost,selected_by_percent,form,minutes,total_points,bonus,bps,expected_goals_per_90,expected_assists_per_90,expected_goal_involvements_per_90,expected_goals_conceded_per_90,starts_per_90,clean_sheets_per_90,saves_per_90,corners_and_indirect_freekicks_order,direct_freekicks_order,penalties_order
0,1,Raya,5.5,19.9,10.0,90,10,3,38,0.0,0.0,0.0,1.52,1.0,1.0,7.0,0.0,0.0,0.0
1,16,Saka,10.0,17.0,0.0,1724,127,18,508,0.36,0.41,0.77,0.93,1.04,0.37,0.0,1.0,4.0,1.0
2,33,M.Bizot,4.5,0.3,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,45,Sousa,4.0,0.3,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,24,Nørgaard,5.5,0.5,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Select only the columns we need from players_df to avoid clutter
player_info_df = players_df[['player_id', 'position', 'team_code']]

# Merge the two DataFrames
# left_on='id' tells pandas to use the 'id' column from the left DataFrame (selected_df)
# right_on='player_id' tells it to use the 'player_id' column from the right DataFrame (player_info_df)
merged_df = pd.merge(selected_df, player_info_df, left_on='id', right_on='player_id')

print("Successfully merged player stats with player information.")
merged_df.head()

Successfully merged player stats with player information.


Unnamed: 0,id,web_name,now_cost,selected_by_percent,form,minutes,total_points,bonus,bps,expected_goals_per_90,...,expected_goals_conceded_per_90,starts_per_90,clean_sheets_per_90,saves_per_90,corners_and_indirect_freekicks_order,direct_freekicks_order,penalties_order,player_id,position,team_code
0,1,Raya,5.5,19.9,10.0,90,10,3,38,0.0,...,1.52,1.0,1.0,7.0,0.0,0.0,0.0,1,Goalkeeper,3
1,16,Saka,10.0,17.0,0.0,1724,127,18,508,0.36,...,0.93,1.04,0.37,0.0,1.0,4.0,1.0,16,Midfielder,3
2,33,M.Bizot,4.5,0.3,0.0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33,Goalkeeper,7
3,45,Sousa,4.0,0.3,0.0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45,Defender,7
4,24,Nørgaard,5.5,0.5,0.0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24,Midfielder,3


In [4]:
import pandas as pd

# List to hold the DataFrame for each completed gameweek
all_gws_data = []

# --- Configuration ---
SEASON_CURRENT = "2025-2026"
COMPLETED_GAMEWEEKS = [1] 

print(f"--- Processing Season: {SEASON_CURRENT} ---")

# --- Load the master helper files ---
# players.csv links player 'id' to their 'team_code'
players_url = f'https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/main/data/{SEASON_CURRENT}/players.csv'
players_df = pd.read_csv(players_url)

# teams.csv links team 'code' to the team 'id' used in fixtures
teams_url = f'https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/main/data/{SEASON_CURRENT}/teams.csv'
teams_df = pd.read_csv(teams_url)


# --- Loop through completed gameweeks ---
for gw in COMPLETED_GAMEWEEKS:
    try:
        # Load the three essential files for the gameweek
        pre_stats_url = f'https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/main/data/{SEASON_CURRENT}/By%20Gameweek/GW{gw-1}/playerstats.csv'
        fixtures_url = f'https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/main/data/{SEASON_CURRENT}/By%20Gameweek/GW{gw}/fixtures.csv'
        post_stats_url = f'https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/main/data/{SEASON_CURRENT}/By%20Gameweek/GW{gw}/playerstats.csv'

        pre_stats_df = pd.read_csv(pre_stats_url)
        fixtures_df = pd.read_csv(fixtures_url)
        post_stats_df = pd.read_csv(post_stats_url)

        # --- Perform the 3-Way Merge ---
        # 1. Merge pre_stats with players_df to get team_code
        merged_df = pd.merge(pre_stats_df, players_df[['player_id', 'team_code']], left_on='id', right_on='player_id')
        
        # 2. Merge the result with teams_df to get the final team_id
        merged_df = pd.merge(merged_df, teams_df[['code', 'id']], left_on='team_code', right_on='code')
        merged_df.rename(columns={'id_y': 'team_id'}, inplace=True)

        # 3. Get the results (our target variable) and merge them in
        results_df = post_stats_df[['id', 'event_points']]
        gw_df = pd.merge(merged_df, results_df, on='id')

        # --- Add Opponent ELO ---
        def get_opponent_elo(team_id, fixtures):
            match = fixtures[(fixtures['home_team'] == team_id) | (fixtures['away_team'] == team_id)]
            if not match.empty:
                return match.iloc[0]['away_team_elo'] if match.iloc[0]['home_team'] == team_id else match.iloc[0]['home_team_elo']
            return None
        
        gw_df['opponent_elo'] = gw_df['team_id'].apply(lambda x: get_opponent_elo(x, fixtures_df))
        
        gw_df['gameweek'] = gw
        all_gws_data.append(gw_df)
        print(f"Successfully processed GW{gw}")

    except Exception as e:
        print(f"Could not process GW{gw}. Error: {e}")

# --- Final Step: Combine everything ---
if all_gws_data:
    master_training_df = pd.concat(all_gws_data, ignore_index=True)
    print("\n-----------------------------------------")
    print("Master training DataFrame created successfully!")
    print(f"Shape of the final DataFrame: {master_training_df.shape}")
    print(master_training_df[['web_name', 'team_id', 'opponent_elo', 'event_points']].head())
else:
    print("\nNo data was processed. The master DataFrame is empty.")

--- Processing Season: 2025-2026 ---
Could not process GW1. Error: 'id'

No data was processed. The master DataFrame is empty.
