In [1]:
import pandas as pd
import os

# Folder holding the raw CSV exports, relative to the current working directory.
raw_path = os.path.join("..", "data", "raw")

# Read the three raw tables in one pass; the file order matches the target names.
df_game, df_player, df_stats = (
    pd.read_csv(os.path.join(raw_path, csv_name))
    for csv_name in ("games.csv", "players.csv", "stats.csv")
)

In [2]:
# List the column labels of the games table.
df_game.columns

Index(['GameId', 'Year', 'Round', 'Date', 'MaxTemp', 'MinTemp', 'Rainfall',
       'Venue', 'StartTime', 'Attendance', 'HomeTeam', 'HomeTeamScoreQT',
       'HomeTeamScoreHT', 'HomeTeamScore3QT', 'HomeTeamScoreFT',
       'HomeTeamScore', 'AwayTeam', 'AwayTeamScoreQT', 'AwayTeamScoreHT',
       'AwayTeamScore3QT', 'AwayTeamScoreFT', 'AwayTeamScore'],
      dtype='object')

In [7]:
# Preview the first rows of the stats table. Only the last expression in a
# cell is rendered, so a single head() call is all this cell needs.
df_stats.head()

Unnamed: 0,GameId,Year,Round,Team,PlayerId,PlayerName,GameNumber,Disposals,Kicks,Marks,...,BrownlowVotes,ContestedPossessions,UncontestedPossessions,ContestedMarks,MarksInside50,OnePercenters,Bounces,GoalAssists,%Played,Subs
0,2012R0105,2012,Round 1,Adelaide,2011675768,Ian Callinan,4,18,14,5,...,0,9,9,0,3,0,0,0,84,-
1,2012R0105,2012,Round 1,Adelaide,2008681760,Patrick Dangerfield,65,25,13,3,...,0,14,11,0,1,1,1,0,92,-
2,2012R0105,2012,Round 1,Adelaide,2000686938,Michael Doughty,208,17,5,4,...,0,8,8,0,0,2,0,0,81,-
3,2012R0105,2012,Round 1,Adelaide,2006687579,Richard Douglas,96,19,14,6,...,0,6,16,0,1,5,0,2,83,-
4,2012R0105,2012,Round 1,Adelaide,2010728130,Ricky Henderson,31,8,5,1,...,0,2,5,0,0,1,0,1,36,Off


In [4]:
# List the column labels of the players table.
df_player.columns

Index(['PlayerId', 'PlayerName', 'Height', 'Weight', 'Dob', 'Position'], dtype='object')

## Data pre-processing on df_player

In [5]:
def players_cast_types(players_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the players table with each known column cast to a usable dtype.

    Columns that are absent are skipped, so partial tables are accepted.

    Args:
        players_df: Raw players table. Recognised columns are ``PlayerId``,
            ``PlayerName``, ``Height``, ``Weight``, ``Position`` and ``Dob``.

    Returns:
        A new DataFrame (the input is not modified) in which:

        * ``PlayerId`` is nullable ``Int64``; unparseable ids become ``<NA>``.
        * ``PlayerName`` / ``Position`` are whitespace-stripped text; missing
          entries stay missing instead of becoming the literal text ``"nan"``.
        * ``Height`` / ``Weight`` are numeric; unparseable values become NaN.
        * ``Dob`` is datetime (unparseable values become NaT) and a
          ``BirthYear`` column holding its calendar year is added.
    """
    df = players_df.copy()

    if "PlayerId" in df.columns:
        # Int64 (capital I) is the nullable integer dtype, so coerced failures
        # do not force the ids to float.
        df["PlayerId"] = pd.to_numeric(df["PlayerId"], errors="coerce").astype("Int64")

    for text_col in ("PlayerName", "Position"):
        if text_col in df.columns:
            raw = df[text_col]
            # astype(str) stringifies NaN/None, so mask those cells back to
            # missing after stripping.
            df[text_col] = raw.astype(str).str.strip().where(raw.notna())

    for numeric_col in ("Height", "Weight"):
        if numeric_col in df.columns:
            df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

    if "Dob" in df.columns:
        df["Dob"] = pd.to_datetime(df["Dob"], errors="coerce")
        df["BirthYear"] = df["Dob"].dt.year

    return df


def clean_player_position(players_typed: pd.DataFrame) -> pd.DataFrame:
    """Add a ``PositionClean`` column that buckets free-text positions.

    The ``Position`` text is lower-cased and stripped, then matched by
    substring against ``ruck``, ``mid``, ``for`` and ``def`` in that order;
    the first hit decides the bucket. Empty text, the placeholder spellings
    ``nan`` / ``none`` / ``null``, and anything unmatched map to ``"Other"``.

    Args:
        players_typed: Players table; ``Position`` is optional.

    Returns:
        A copy of the input with ``PositionClean`` added. When ``Position``
        is absent every row gets ``"Other"``.
    """
    result = players_typed.copy()

    if "Position" in result.columns:
        # (keyword, bucket) pairs, checked top to bottom; the first match wins.
        keyword_buckets = (
            ("ruck", "Ruck"),
            ("mid", "Midfield"),
            ("for", "Forward"),
            ("def", "Defender"),
        )

        def bucket_for(label: str) -> str:
            label = label.strip()
            for keyword, bucket in keyword_buckets:
                if keyword in label:
                    return bucket
            # Covers the empty string as well as unrecognised text.
            return "Other"

        normalised = result["Position"].astype(str).str.strip().str.lower()
        # Blank out the stringified forms of missing values.
        normalised = normalised.replace({"nan": "", "none": "", "null": ""})
        result["PositionClean"] = normalised.map(bucket_for)
    else:
        result["PositionClean"] = "Other"

    return result

def add_players_derived_features(players_clean: pd.DataFrame) -> pd.DataFrame:
    """Add a ``BMI`` column computed from ``Height`` and ``Weight``.

    ``Height`` is divided by 100 before squaring, i.e. it is treated as
    centimetres; ``Weight`` is presumably kilograms (confirm against the
    data source). Rows whose height is missing, zero or negative get a NaN
    BMI rather than an infinite or meaningless value.

    Args:
        players_clean: Players table with numeric ``Height`` and ``Weight``.

    Returns:
        A copy of the input. ``BMI`` is added only when both ``Height`` and
        ``Weight`` are present; otherwise the copy is returned unchanged.
    """
    df = players_clean.copy()

    if "Height" in df.columns and "Weight" in df.columns:
        # Non-positive heights cannot be real measurements; mask them so the
        # division below yields NaN instead of inf.
        height_m = df["Height"].where(df["Height"] > 0) / 100.0
        df["BMI"] = df["Weight"] / (height_m ** 2)

    return df


# Run the three preparation steps as one chain so `df_player` is rebound only
# after every step has succeeded. Rebinding after each step meant an accidental
# re-run of this cell replaced PositionClean with "Other" (Position is already
# dropped, so clean_player_position takes its fallback branch) before the final
# drop raised KeyError. Re-run the CSV load cell before re-running this one.
df_player = (
    df_player
    .pipe(players_cast_types)
    .pipe(clean_player_position)
    .pipe(add_players_derived_features)
    # BirthYear and PositionClean were derived from these two columns.
    .drop(columns=["Dob", "Position"])
)

df_player.head()

Unnamed: 0,PlayerId,PlayerName,Height,Weight,BirthYear,PositionClean,BMI
0,2020654979,Jake Aarts,177,75,1994,Forward,23.939481
1,2018655703,Ryan Abbott,200,100,1991,Ruck,25.0
2,2002652211,Gary Ablett,182,87,1984,Forward,26.264944
3,2014651814,Blake Acres,191,90,1995,Midfield,24.670376
4,2025654137,Jed Adams,196,91,2004,Defender,23.688047


## Data merge

In [6]:
# Attach player attributes, then per-game context, to every stats row.
# validate="many_to_one" makes pandas raise MergeError if PlayerId (players)
# or GameId (games) is duplicated on the right-hand side, instead of silently
# multiplying stats rows in the left join.
# Both frames carry PlayerName, so pandas suffixes them PlayerName_x (stats)
# and PlayerName_y (players).
df_final = df_stats.merge(
    df_player,
    on="PlayerId",
    how="left",
    validate="many_to_one",
)
df_final = df_final.merge(
    df_game[["GameId", "AwayTeam", "MaxTemp", "MinTemp", "Rainfall"]],
    on="GameId",
    how="left",
    validate="many_to_one",
)

df_final.head()

Unnamed: 0,GameId,Year,Round,Team,PlayerId,PlayerName_x,GameNumber,Disposals,Kicks,Marks,...,PlayerName_y,Height,Weight,BirthYear,PositionClean,BMI,AwayTeam,MaxTemp,MinTemp,Rainfall
0,2012R0105,2012,Round 1,Adelaide,2011675768,Ian Callinan,4,18,14,5,...,Ian Callinan,171,70,1982,Forward,23.93899,Adelaide,28.2,19.7,0.0
1,2012R0105,2012,Round 1,Adelaide,2008681760,Patrick Dangerfield,65,25,13,3,...,Patrick Dangerfield,189,92,1990,Midfield,25.755158,Adelaide,28.2,19.7,0.0
2,2012R0105,2012,Round 1,Adelaide,2000686938,Michael Doughty,208,17,5,4,...,Michael Doughty,177,81,1979,Defender,25.854639,Adelaide,28.2,19.7,0.0
3,2012R0105,2012,Round 1,Adelaide,2006687579,Richard Douglas,96,19,14,6,...,Richard Douglas,181,79,1987,Midfield,24.114038,Adelaide,28.2,19.7,0.0
4,2012R0105,2012,Round 1,Adelaide,2010728130,Ricky Henderson,31,8,5,1,...,Ricky Henderson,188,89,1988,Midfield,25.181077,Adelaide,28.2,19.7,0.0


In [9]:
# List the column labels of the merged stats/player/game frame.
df_final.columns

Index(['GameId', 'Year', 'Round', 'Team', 'PlayerId', 'PlayerName_x',
       'GameNumber', 'Disposals', 'Kicks', 'Marks', 'Handballs', 'Goals',
       'Behinds', 'HitOuts', 'Tackles', 'Rebounds', 'Inside50s', 'Clearances',
       'Clangers', 'Frees', 'FreesAgainst', 'BrownlowVotes',
       'ContestedPossessions', 'UncontestedPossessions', 'ContestedMarks',
       'MarksInside50', 'OnePercenters', 'Bounces', 'GoalAssists', '%Played',
       'Subs', 'PlayerName_y', 'Height', 'Weight', 'BirthYear',
       'PositionClean', 'BMI', 'AwayTeam', 'MaxTemp', 'MinTemp', 'Rainfall'],
      dtype='object')