In [19]:
# Imports & readDataSubset()
import pandas as pd
import numpy as np
import pyarrow.dataset as pads
import pyarrow.csv as pacsv
from SMT_Data_Starter_2025 import readDataSubset
import matplotlib.pyplot as plt
from IPython.display import HTML
import great_tables as gt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Just get the subsets out of the way
game_info_subset = readDataSubset('game_info')
game_events_subset = readDataSubset('game_events')
player_position_subset = readDataSubset('player_pos')
ball_position_subset = readDataSubset('ball_pos')

In [20]:
"""Contests.ipynb
Filter for all contests at 1st base
    (Ground balls fielded by infielders thrown to 1st)
Get all throws in sequence (contest_throws.csv)
    - Order in sequence
    - Acquire-Throw-Arrival timestamps
    - Infer throw target (intent)
Get contest results: (contest_results.csv)
    (Safe/Out/Advance) 

1. Get all ground balls
    Using game_events and ball_pos 
2. Get all bbe first acquired by infielders (i.e. fielded ground balls)
    Using game_events 
3. Filter all infielder throws to play_id's (ground balls first acquired by infielders)
4. Infer throw target (intent)
    (It's overwhelmingly unlikely that target isn't a base)
    (It's overwhelmingly unlikely that throw goes closer to another target; miss by >20°)
5. Filter all play_id's for ending with throw to 1st base (this may be after another throw)
    Using ball_pos 
    (Ignore bobbles; assume all throws are made)
    (Eliminate contests where runner is returning to base)
    (This probably doesn't happen because infielders acquired the ball)
6. Get order in sequence
7. Get results (Safe/Out/Advance)
    (Blocked include player interference?)
    Using player_pos
"""
pass

In [21]:
"""
Get all ground balls
"""
events = game_events_subset.to_table().to_pandas()
ball_pos = ball_position_subset.to_table().to_pandas()

from Helpers import is_in_infield

# Get hits and first bounce per play
hits = events[events["event_code"] == 4][["game_str", "play_id", "timestamp"]].rename(columns={"timestamp": "hit_time"})
bounces = events[events["event_code"] == 16][["game_str", "play_id", "timestamp"]].rename(columns={"timestamp": "bounce_time"})
first_bounce = bounces.sort_values("bounce_time").drop_duplicates(subset=["game_str", "play_id"], keep="first")

# Merge and ensure bounce is after hit
merged = hits.merge(first_bounce, on=["game_str", "play_id"], how="inner")
merged = merged[merged["bounce_time"] > merged["hit_time"]]

# Get ball position at first bounce
bounce_with_pos = merged.merge(
    ball_pos[["game_str", "play_id", "timestamp", "ball_position_x", "ball_position_y"]],
    left_on=["game_str", "play_id", "bounce_time"],
    right_on=["game_str", "play_id", "timestamp"],
    how="left")

# Filter to only bounces that landed in the infield
bounce_with_pos["infield_bounce"] = bounce_with_pos.apply(
    lambda row: is_in_infield(row["ball_position_x"], row["ball_position_y"]), axis=1)
bounce_with_pos = bounce_with_pos[bounce_with_pos["infield_bounce"]]

bounce_with_pos.head(3)

Unnamed: 0,game_str,play_id,hit_time,bounce_time,timestamp,ball_position_x,ball_position_y,infield_bounce
1,y1_d069_ACN_QEA,19,632253,632453,632453.0,20.21193,31.1913,True
3,y1_d069_ACN_QEA,24,824403,825053,825053.0,55.6212,60.432,True
4,y1_d069_ACN_QEA,46,1728503,1728603,1728603.0,2.9499,15.8268,True


In [22]:
"""
Get all bbe first acquired by infielders (i.e. fielded ground balls)
"""
# Get first acquisition per play
acquisitions = events[events["event_code"] == 2][["game_str", "play_id", "timestamp", "player_position"]]
first_acq = acquisitions.sort_values("timestamp").drop_duplicates(subset=["game_str", "play_id"], keep="first")

# Only keep plays where the first acquisition was by an infielder / pitcher / catcher
infield_positions = [1, 2, 3, 4, 5, 6]
first_acq["fielder_is_inf"] = first_acq["player_position"].isin(infield_positions)
first_acq = first_acq[first_acq["fielder_is_inf"]]

# Merge final valid plays
valid_bounces = bounce_with_pos.merge(
    first_acq[["game_str", "play_id"]],
    on=["game_str", "play_id"],
    how="inner"
).drop_duplicates(subset=["game_str", "play_id"])

groundballs_inf = valid_bounces[['game_str','play_id']]
groundballs_inf.head(3)

Unnamed: 0,game_str,play_id
0,y1_d069_ACN_QEA,19
1,y1_d069_ACN_QEA,46
2,y1_d069_ACN_QEA,49


In [23]:
"""
Filter all infielder throws to play_id's (ground balls first acquired by infielders)
"""
# All throws on play_id's (ground balls first acquired by infielders)
inf_thr = game_events_subset.to_table(columns=['game_str','play_id','at_bat','play_per_game','timestamp','player_position','event_code']).to_pandas()
inf_thr = pd.merge(groundballs_inf,inf_thr,how="left",on=['game_str','play_id'])
# Get when acquired
inf_thr['acq_ts'] = inf_thr.groupby(['game_str','play_id','player_position'])['timestamp'].shift(1)
# Get when arrived: catch or deflection
mask = inf_thr['event_code'].isin([2, 9, 10])
inf_thr['isin_2910'] = np.where(mask, inf_thr['timestamp'], np.nan)
inf_thr['arr_ts'] = inf_thr.groupby(['game_str','play_id'])['isin_2910'].bfill()
# Filter to throws
inf_thr = inf_thr[((inf_thr['event_code']==3))]

inf_thr = inf_thr.astype({'acq_ts': 'Int64', 'arr_ts': 'Int64'})
inf_thr = inf_thr[['game_str','play_id','timestamp','acq_ts','arr_ts']]
inf_thr = inf_thr.rename(columns={'timestamp':'thr_ts'})
inf_thr.head(3) # Notice there are some duplicate play_id's — double plays

Unnamed: 0,game_str,play_id,thr_ts,acq_ts,arr_ts
7,y1_d069_ACN_QEA,19,639703,638753,640703
16,y1_d069_ACN_QEA,46,1731653,1730253,1732503
25,y1_d069_ACN_QEA,49,1793003,1791503,1794153


In [None]:
"""
Infer throw target (intent)
1. Take ball position at throw_timestamp, 200ms after, and at arrival
2. Determine (logged) distance change to each base after 200ms to see which base got closer most
3. If 1st & 2nd closest are similar, use arrival location to decide

The reason we use two mechanisms is because
What if the person clearly threw to 1st/2nd/3rd but overthrew?
Exception is when you throw past a base but in that case the closer base is
Wayyy to close to overthrow & way closer than the farther base
y2_d072_DYE_YJD
66
"""
# We can see that all throws are at least 200ms
    # s = (inf_thr['arr_ts'].to_numpy(dtype='float64') -
    #     inf_thr['thr_ts'].to_numpy(dtype='float64'))
    # s = s[~np.isnan(s)]
    # plt.hist(s, bins='auto', alpha=0.7, edgecolor='black')
    # plt.xlim(0,1000)
    # plt.show()

## 1. Take ball position at throw_timestamp, 200ms after, and at arrival
ball_tts = ball_position_subset.to_table(columns=['game_str','play_id','timestamp','ball_position_x','ball_position_y','ball_position_z']).to_pandas()
# Filter to play_id
ball_tts = pd.merge(inf_thr,ball_tts,how="left",on=['game_str','play_id']) 
# Ball position 200ms after
ball_tts['aft200_x'] = ball_tts.groupby(['game_str','play_id'])['ball_position_x'].shift(-4)
ball_tts['aft200_y'] = ball_tts.groupby(['game_str','play_id'])['ball_position_y'].shift(-4)
ball_tts['aft200_z'] = ball_tts.groupby(['game_str','play_id'])['ball_position_z'].shift(-4)
# Filter to throws
temp = ball_tts[ball_tts['thr_ts']==ball_tts['timestamp']]
# Ball position at arrival
arr_pos = ball_tts[ball_tts['arr_ts']==ball_tts['timestamp']][['game_str','play_id','arr_ts','ball_position_x','ball_position_y']]
arr_pos = arr_pos.rename(columns={"ball_position_x":"arr_x","ball_position_y":"arr_y"
                                  # ,"ball_position_z":"arr_z" We don't consider z position because the ball may be caught
                                  })
ball_tts = pd.merge(temp,arr_pos,how="left",on=['game_str','play_id','arr_ts'])

## 2. Determine distance change to each base after 200ms to see which base got closer most 
# Determine distance to each base at throw_ts
ball_tts['dist1'] = np.log(np.sqrt((ball_tts['ball_position_x'] - 63.64)**2 + (ball_tts['ball_position_y'] - 63.64)**2 + (ball_tts['ball_position_z'] - 0)**2))
ball_tts['dist2'] = np.log(np.sqrt((ball_tts['ball_position_x'] - 0)**2 + (ball_tts['ball_position_y'] - 127.279)**2 + (ball_tts['ball_position_z'] - 0)**2))
ball_tts['dist3'] = np.log(np.sqrt((ball_tts['ball_position_x'] - (-63.64))**2 + (ball_tts['ball_position_y'] - 63.64)**2 + (ball_tts['ball_position_z'] - 0)**2))
ball_tts['disth'] = np.log(np.sqrt((ball_tts['ball_position_x'] - 0)**2 + (ball_tts['ball_position_y'] - 0)**2 + (ball_tts['ball_position_z'] - 0)**2))
# Distance after 200ms
ball_tts['dist1_aft200'] = np.log(np.sqrt((ball_tts['aft200_x'] - 63.64)**2 + (ball_tts['aft200_y'] - 63.64)**2 + (ball_tts['aft200_z'] - 0)**2))
ball_tts['dist2_aft200'] = np.log(np.sqrt((ball_tts['aft200_x'] - 0)**2 + (ball_tts['aft200_y'] - 127.279)**2 + (ball_tts['aft200_z'] - 0)**2))
ball_tts['dist3_aft200'] = np.log(np.sqrt((ball_tts['aft200_x'] - (-63.64))**2 + (ball_tts['aft200_y'] - 63.64)**2 + (ball_tts['aft200_z'] - 0)**2))
ball_tts['disth_aft200'] = np.log(np.sqrt((ball_tts['aft200_x'] - 0)**2 + (ball_tts['aft200_y'] - 0)**2 + (ball_tts['aft200_z'] - 0)**2))
# Distance change after 200ms
ball_tts['dist1_ch'] = ball_tts['dist1_aft200'] - ball_tts['dist1']
ball_tts['dist2_ch'] = ball_tts['dist2_aft200'] - ball_tts['dist2']
ball_tts['dist3_ch'] = ball_tts['dist3_aft200'] - ball_tts['dist3']
ball_tts['disth_ch'] = ball_tts['disth_aft200'] - ball_tts['disth']
# Determine which base the ball got closer most
cols = ['dist1_ch','dist2_ch','dist3_ch','disth_ch']
ball_tts['closermost']      = ball_tts[cols].idxmin(axis=1)
ball_tts['closermost_dist'] = ball_tts[cols].min(axis=1)
ball_tts['closernext']      = ball_tts[cols].apply(lambda row: row.nsmallest(2).index[-1], axis=1)
ball_tts['closernext_dist'] = ball_tts[cols].apply(lambda row: row.nsmallest(2).iloc[-1], axis=1)

## 3. If 1st & 2nd closest are similar, use arrival location to decide
# Distance to each base at arrival_ts
ball_tts['dist1_arr'] = np.sqrt((ball_tts['arr_x'] - 63.64)**2 + (ball_tts['arr_y'] - 63.64)**2)
ball_tts['dist2_arr'] = np.sqrt((ball_tts['arr_x'] - 0)**2 + (ball_tts['arr_y'] - 127.279)**2)
ball_tts['dist3_arr'] = np.sqrt((ball_tts['arr_x'] - (-63.64))**2 + (ball_tts['arr_y'] - 63.64)**2)
ball_tts['disth_arr'] = np.sqrt((ball_tts['arr_x'] - 0)**2 + (ball_tts['arr_y'] - 0)**2)
ball_tts['closest'] = ball_tts[['dist1_arr','dist2_arr','dist3_arr','disth_arr']].idxmin(axis=1)
# Rename
ball_tts['closermost'] = ball_tts['closermost'].str.replace(r'_ch$','',regex=True)
ball_tts['closernext'] = ball_tts['closernext'].str.replace(r'_ch$','',regex=True)
ball_tts['closest'] = ball_tts['closest'].str.replace(r'_arr$','',regex=True)
# Decide if similar
RATIO = 3
cond = ((ball_tts['closermost_dist'] < ball_tts['closernext_dist'] * RATIO)
        & (ball_tts['closest'] == 'disth'))
ball_tts['target_base'] = np.where(cond, ball_tts['closermost'], ball_tts['closest'])
ball_tts['target_base'] = ball_tts['target_base'].fillna(ball_tts['closest'])
# Rename
replace_map = {'dist1':'first','dist2':'second','dist3':'third','disth':'home'}
ball_tts['target_base'] = ball_tts['target_base'].replace(replace_map)

ball_tts = ball_tts[['game_str','play_id','thr_ts','acq_ts','arr_ts','target_base']]
ball_tts.head(10).T

  ball_tts['closest'] = ball_tts[['dist1_arr','dist2_arr','dist3_arr','disth_arr']].idxmin(axis=1)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
game_str,y1_d069_ACN_QEA,y1_d069_ACN_QEA,y1_d069_ACN_QEA,y1_d069_ACN_QEA,y1_d069_ACN_QEA,y1_d069_ACN_QEA,y1_d069_ACN_QEA,y1_d069_ACN_QEA,y1_d069_ACN_QEA,y1_d069_ACN_QEA
play_id,19,46,49,67,84,109,118,129,146,154
thr_ts,639703,1731653,1793003,2502353,3236794,4276194,4678044,5164144,5798994,6124194
acq_ts,638753,1730253,1791503,2501353,3235594,4275644,4677394,5163744,5797694,6123094
arr_ts,640703,1732503,1794153,2503603,3237544,4276744,4678794,5164744,5799594,6125044
target_base,second,first,first,first,first,third,first,first,first,first


In [25]:
"""
Filter all play_id's for ending with throw to 1st base (this may be after another throw)
   (Ignore bobbles; assume all throws are made)
   (Eliminate contests where runner is returning to base)
   (This probably doesn't happen because infielders acquired the ball)
"""
# Only keep play_id's that end with throw to first
filter_throw_to_first = ball_tts.groupby(by=['game_str','play_id']).last().reset_index()
filter_throw_to_first = filter_throw_to_first[filter_throw_to_first['target_base']=='first'][['game_str','play_id']]

# Filter
contests_thr = pd.merge(ball_tts, filter_throw_to_first, on=['game_str','play_id'])
contests_thr.iloc[18:23].T

Unnamed: 0,18,19,20,21,22
game_str,y1_d070_ACN_QEA,y1_d070_ACN_QEA,y1_d070_ACN_QEA,y1_d070_ACN_QEA,y1_d070_ACN_QEA
play_id,160,176,214,227,227
thr_ts,5291687,5891887,7004137,7480487,7481587
acq_ts,5290537,5890487,7003187,7479887,7481087
arr_ts,5292837,5892787,7004987,7481087,7482587
target_base,first,first,first,second,first


In [26]:
"""
Get order in sequence
"""
contests_thr = contests_thr.sort_values(['game_str','play_id'])
contests_thr['nth_throw'] = contests_thr.groupby(['game_str','play_id'])['thr_ts'].rank('min').astype('Int64')
contests_thr.head(2)

Unnamed: 0,game_str,play_id,thr_ts,acq_ts,arr_ts,target_base,nth_throw
111,y1_d001_CGA_QEA,21,562499,561799,563149,second,1
112,y1_d001_CGA_QEA,21,563799,563149,564699,first,2


In [27]:
"""
Get results (Safe/Out/Advance)
    1. Get the furthest base that the batter (tried to) reached in the play_id
    2. See if a runner exists on that base or the base(s) before
    3. Determine if runner advanced (if safe)

# Since it's a ground ball, there can't be the previous runner remaining on first base
# Unless the batter was tagged out first
# Which results in reached_base == 0

# We can see that here
mask = rnr_exists['player_position'].apply(
    lambda x: isinstance(x, (list, tuple, set)) and 11 in x
) & (rnr_exists['reached_base'] == 0)
result = rnr_exists[mask].head(20)
result

"""
from Helpers import to_list

## 1. Get the furthest base that the runner (tried to) reached in the play_id
# Filter play_id's to only contests
reachbs = player_position_subset.to_table(
   filter=pads.field('player_position') == 10,
   columns=['game_str','play_id','timestamp','player_position','field_x','field_y']).to_pandas()
reachbs = pd.merge(reachbs,filter_throw_to_first,how="right",on=['game_str','play_id'])
# Helper function
# (Reached base x if existed within RADIUS)
RADIUS = 5
def reached_base(x,y) -> int:
    if pd.isna(x) or pd.isna(y):
        return False
    first = (np.sqrt((x-63.64)**2 + (y-63.64)**2) < RADIUS)
    second = (np.sqrt((x)**2 + (y-127.279)**2) < RADIUS)
    third = (np.sqrt((x-(-63.64))**2 + (y-63.64)**2) < RADIUS)
    return 3 if third else (2 if second else (1 if first else 0))
# (This assumes no inside-the-park-homerun on an infield ground ball)
reachbs['reached_base'] = reachbs.apply(
    lambda row: reached_base(row['field_x'],row['field_y']), axis=1)
reachbs = reachbs.groupby(by=['game_str','play_id']).agg({'reached_base':"max"}).reset_index()

## 2. See if a runner exists on that base or the base(s) before
rnr_exists = player_position_subset.to_table(
    columns=['game_str','play_id','player_position'],
    filter=(pads.field('player_position').isin([10, 11, 12, 13]))
    ).to_pandas().groupby(['game_str','play_id'])['player_position'].unique().reset_index()
rnr_exists['play_id'] = rnr_exists['play_id'] - 1
# Join above
rnr_exists = pd.merge(rnr_exists, reachbs, how="right", left_on=['game_str','play_id'], right_on=['game_str','play_id'])
# Make sure player_position is a list
rnr_exists['player_position'] = rnr_exists['player_position'].apply(to_list)
# Helper function
# (If runner exists on reached_base or before in next play_id)
def safe(row):
    pos_raw = row['player_position']
    if isinstance(pos_raw, (list, tuple, set, np.ndarray)):
        positions = set(pos_raw)
    else:
        positions = set()
    rb = row['reached_base']
    if rb == 0: return False
    elif rb == 1: return 11 in positions
    elif rb == 2: return 11 in positions or 12 in positions
    elif rb == 3: return 11 in positions or 12 in positions or 13 in positions
    else:
        raise ValueError(f"Unexpected reached_base value: {rb} (row: {row})")
rnr_exists['safe'] = rnr_exists.apply(safe, axis=1)

## 3. Determine if runner advanced (if safe)
rnr_exists['advanced'] = rnr_exists.apply(
    lambda row: row['safe'] and (11 not in row['player_position']), axis=1)

# Cleanup
rnr_exists = rnr_exists[['game_str','play_id','safe','advanced']]
print(rnr_exists.groupby('safe').size())
print("NaN      ", rnr_exists['safe'].isna().sum())
pct_true1 = rnr_exists['safe'].mean() * 100
pct_true2 = rnr_exists['advanced'].mean() * 100
print(f"{pct_true1:.2f}% of fielded ground balls are safe")
print(f"{pct_true2:.2f}% of fielded ground balls result in runner advanced to next bases")

# Display advances
rnr_exists[(rnr_exists['safe']==True) & (rnr_exists['advanced']==True)].head(50)

safe
False    3374
True      408
dtype: int64
NaN       0
10.79% of fielded ground balls are safe
0.34% of fielded ground balls result in runner advanced to next bases


Unnamed: 0,game_str,play_id,safe,advanced
540,y1_d033_AVV_YJD,68,True,True
623,y1_d038_HMN_RZQ,18,True,True
849,y1_d047_VHS_YJD,271,True,True
943,y1_d050.5_STK_QEA,159,True,True
1019,y1_d051_UPU_YJD,203,True,True
1044,y1_d054_APZ_QEA,6,True,True
1286,y1_d064_IAQ_YJD,145,True,True
1407,y1_d067_OWH_QEA,251,True,True
2449,y2_d013_RQJ_YJD,183,True,True
2853,y2_d041_XLB_RZQ,129,True,True


In [28]:
# Export
contests_thr.to_csv('../CSV/contest_throws.csv', index=False)
rnr_exists.to_csv('../CSV/contest_results.csv',index=False)