In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

In [2]:
# Load the pitch-by-pitch event log into a DataFrame (adjust the path as needed).
events_filename = './data/event.csv'  # or provide a full path

if not os.path.exists(events_filename):
    # Fall back to a CSV in the working directory. The names are sorted so the
    # "first" file is deterministic (os.listdir returns entries in arbitrary order).
    csv_files = sorted(f for f in os.listdir('.') if f.lower().endswith('.csv'))
    if not csv_files:
        raise FileNotFoundError(f"{events_filename!r} not found and no CSV files in the current directory.")
    print(f"No file named {events_filename!r} found. Using first CSV in cwd: {csv_files[0]}")
    # Re-point the path that is actually read below; previously the fallback was
    # stored in an unused variable and read_csv still failed on the missing path.
    events_filename = csv_files[0]

events = pd.read_csv(events_filename)
print(f"Loaded {len(events)} rows and {len(events.columns)} columns from {events_filename}")
events.head()

Loaded 107738 rows and 14 columns from ./data/event.csv


  events = pd.read_csv(events_filename)


Unnamed: 0,type,inPlay,isStrike,isBall,pitcherName,catcherName,batterName,pitchCode,pitchType,velocity,coordX,coordY,runners,PAId
0,PITCH,False,False,True,徐若熙,蔣少宏,陳晨威,B,FF,145,71.11,-91.43,[],
1,PITCH,True,True,False,徐若熙,蔣少宏,陳晨威,H,FF,147,54.86,42.67,"[{'type': 'PA', 'runnerName': '陳晨威', 'isOut': ...",
2,PITCH,False,True,False,徐若熙,蔣少宏,林立,F,FF,145,54.86,22.35,[],
3,PITCH,False,True,False,徐若熙,蔣少宏,林立,SW,SL,135,-2.03,-56.89,[],
4,PITCH,False,False,True,徐若熙,蔣少宏,林立,B,SL,139,-34.54,-144.25,[],


In [3]:
# # read a CSV into a dataframe (adjust filename as needed)
# pa_filename = './data/pa.csv'  # or provide a full path

# if not os.path.exists(pa_filename):
#     csv_files = [f for f in os.listdir('.') if f.lower().endswith('.csv')]
#     if not csv_files:
#         raise FileNotFoundError(f"{pa_filename!r} not found and no CSV files in the current directory.")
#     filename = csv_files[0]
#     print(f"No file named 'data.csv' found. Using first CSV in cwd: {filename}")



# pas = pd.read_csv(pa_filename, usecols=['inning', 'batterName', 'pitcherName', 'catcherName', 'paRound', 'pitchCodes', 'result', 'hardness', 'bases'])
# print(f"Loaded {len(pas)} rows and {len(pas.columns)} columns from {pa_filename}")
# pas.head()


In [4]:
# Load the plate-appearance table (with per-PA stats) into a DataFrame.
pa_filename = './data/pa_with_stats.csv'  # or provide a full path

if not os.path.exists(pa_filename):
    # Fall back to a CSV in the working directory. The names are sorted so the
    # "first" file is deterministic (os.listdir returns entries in arbitrary order).
    csv_files = sorted(f for f in os.listdir('.') if f.lower().endswith('.csv'))
    if not csv_files:
        raise FileNotFoundError(f"{pa_filename!r} not found and no CSV files in the current directory.")
    print(f"No file named {pa_filename!r} found. Using first CSV in cwd: {csv_files[0]}")
    # Re-point the path that is actually read below; previously the fallback was
    # stored in an unused variable and read_csv still failed on the missing path.
    pa_filename = csv_files[0]

# Only the columns used downstream are read; read_csv raises ValueError if the
# file (e.g. a fallback CSV) does not contain all of them.
pas = pd.read_csv(pa_filename, usecols=['batterName', 'pitcherName', 'catcherName', 'bases', 'pitchCodes', 'result', 'hardness', 'AVG', 'OPS', 'whiff'])
print(f"Loaded {len(pas)} rows and {len(pas.columns)} columns from {pa_filename}")
pas.head()


Loaded 27600 rows and 10 columns from ./data/pa_with_stats.csv


Unnamed: 0,batterName,pitcherName,catcherName,bases,pitchCodes,result,hardness,AVG,OPS,whiff
0,陳晨威,徐若熙,蔣少宏,0,"['B', 'H']",GO,M,0.0,0.0,0.0
1,林立,徐若熙,蔣少宏,0,"['F', 'SW', 'B', 'B', 'S']",SO,,0.0,0.0,0.0
2,梁家榮,徐若熙,蔣少宏,0,"['S', 'F', 'B', 'H']",2B,H,0.0,0.0,0.0
3,廖健富,徐若熙,蔣少宏,2,"['B', 'H']",1B,H,0.0,0.0,0.0
4,朱育賢,徐若熙,蔣少宏,0,"['S', 'SW', 'H']",FO,M,0.0,0.0,0.0


# Filter out non-pitching events

In [5]:
# Keep only the rows whose event type is 'PITCH'; all other event types are dropped.
events = events.loc[events['type'].eq('PITCH')]

# Pair PA and Events

In [6]:
# Pair plate appearances (PAs) from `pas` with the sequences of pitch rows in `events`.
# Algorithm:
#  - Walk through `events` in index order, accumulating pitches into the current PA.
#  - End a PA when row['inPlay'] is truthy OR row['runners'] contains a dict with type == 'PA'.
#  - Also force-end the current PA if the batter or pitcher changes between consecutive rows.
#  - Build the pa_events_df dataframe, assign a sequential pa_seq per (batterName, pitcherName),
#    and then align with `pas` on the same (batterName, pitcherName, pa_seq).
#
# Note: This cell relies on variables defined in earlier cells (`events`, `pas`);
#       `pd` comes from the import cell at the top of the notebook.

def _row_has_pa_marker(runners, batter_name):
    # runners may be a list of dicts or something else; be defensive
    if not runners:
        return False
    try:
        for r in runners:
            if isinstance(r, dict) and r.get('type') == 'PA':
                # optionally check runnerName matches batter_name if present
                rn = r.get('runnerName')
                if rn is None or rn == batter_name:
                    return True
    except Exception:
        return False
    return False

# Walk the pitch rows in index order and group consecutive pitches into plate
# appearances (PAs). The grouping rules are unchanged; the PA-group construction
# and the end-of-PA check, previously copy-pasted in several places, now live
# in small helpers.
events_seq = events.sort_index()


def _start_pa(idx, row):
    """Open a new PA group seeded with the pitch stored in `row` at index `idx`."""
    return {
        'batterName': row['batterName'],
        'pitcherName': row['pitcherName'],
        'start_idx': idx,
        'events_idx': [idx],
        'pitchCodes': [row.get('pitchCode')],
        'pitchTypes': [row.get('pitchType')],
        'velocities': [row.get('velocity')],
        'coordXs': [row.get('coordX')],
        'coordYs': [row.get('coordY')],
        'inPlay_flags': [row.get('inPlay')],
        'last_row': row,
    }


def _append_pitch(pa, idx, row):
    """Add the pitch stored in `row` to the open PA group `pa` (mutates `pa`)."""
    pa['events_idx'].append(idx)
    pa['pitchCodes'].append(row.get('pitchCode'))
    pa['pitchTypes'].append(row.get('pitchType'))
    pa['velocities'].append(row.get('velocity'))
    pa['coordXs'].append(row.get('coordX'))
    pa['coordYs'].append(row.get('coordY'))
    pa['inPlay_flags'].append(row.get('inPlay'))
    pa['last_row'] = row


def _pitch_ends_pa(row):
    """Truthy when the pitch is in play or its runners carry a PA marker for the batter."""
    return row.get('inPlay') or _row_has_pa_marker(row.get('runners'), row['batterName'])


pa_events = []
current = None

for idx, row in events_seq.iterrows():
    # A batter or pitcher change closes the open PA at its last recorded pitch,
    # even though no explicit end marker was seen.
    if current is not None and (
        row['batterName'] != current['batterName']
        or row['pitcherName'] != current['pitcherName']
    ):
        current['end_idx'] = current['events_idx'][-1]
        pa_events.append(current)
        current = None

    if current is None:
        current = _start_pa(idx, row)
    else:
        _append_pitch(current, idx, row)

    # The pitch just recorded may itself end the PA (including a one-pitch PA).
    if _pitch_ends_pa(row):
        current['end_idx'] = idx
        pa_events.append(current)
        current = None

# finalize a PA that was still open when the events ran out
if current is not None:
    current['end_idx'] = current['events_idx'][-1]
    pa_events.append(current)
    current = None

# Flatten every PA group into one record. Each group also carries start_idx,
# end_idx, events_idx and inPlay_flags; those are deliberately not exported.
pa_records = []
for pa_group in pa_events:
    pitch_coords = list(zip(pa_group['coordXs'], pa_group['coordYs']))
    pa_records.append({
        'batterName': pa_group['batterName'],
        'pitcherName': pa_group['pitcherName'],
        'pitchCodes_events': pa_group['pitchCodes'],
        'pitchTypes_events': pa_group['pitchTypes'],
        'velocities_events': pa_group['velocities'],
        'coords_events': pitch_coords,
        'n_pitches': len(pa_group['events_idx']),
    })
pa_events_df = pd.DataFrame(pa_records)

# Number the PAs 1, 2, 3, ... within each (batter, pitcher) pair, in row order,
# so they can be matched against the PAs in `pas`.
matchup_keys = ['batterName', 'pitcherName']
pa_events_df['pa_seq'] = pa_events_df.groupby(matchup_keys).cumcount() + 1

# Number the PAs in `pas` the same way as the event groups: 1, 2, 3, ... within
# each (batter, pitcher) pair, in index order.
pas_seq = pas.copy()
pas_seq = pas_seq.sort_index()  # keep original order
pas_seq['pa_seq'] = pas_seq.groupby(['batterName', 'pitcherName']).cumcount() + 1

# Left merge keeps every PA; a PA without a matching event group gets NaN in
# all *_events columns and in n_pitches.
paired = pd.merge(pas_seq, pa_events_df,
                  on=['batterName', 'pitcherName', 'pa_seq'],
                  how='left',
                  suffixes=('_pa', '_events'))

# Diagnostics. The pairing is purely positional within each (batter, pitcher)
# pair, so a PA count that differs between the two sources leaves unmatched
# rows; report them instead of letting them pass silently.
n_unmatched = int(paired['n_pitches'].isna().sum())
print(f"Built {len(pa_events_df)} PA event groups from {len(events_seq)} events")
print(f"pas has {len(pas_seq)} rows; merged result has {len(paired)} rows")
print(f"{n_unmatched} PAs have no matching event group (event columns are NaN)")
display(paired.head(50))
print(paired.keys())

Built 27444 PA event groups from 103620 events
pas has 27600 rows; merged result has 27600 rows


Unnamed: 0,batterName,pitcherName,catcherName,bases,pitchCodes,result,hardness,AVG,OPS,whiff,pa_seq,pitchCodes_events,pitchTypes_events,velocities_events,coords_events,n_pitches
0,陳晨威,徐若熙,蔣少宏,0,"['B', 'H']",GO,M,0.0,0.0,0.0,1,"[B, H]","[FF, FF]","[145, 147]","[(71.11, -91.43), (54.86, 42.67)]",2.0
1,林立,徐若熙,蔣少宏,0,"['F', 'SW', 'B', 'B', 'S']",SO,,0.0,0.0,0.0,1,"[F, SW, B, B, S]","[FF, SL, SL, FF, FF]","[145, 135, 139, 149, 150]","[(54.86, 22.35), (-2.03, -56.89), (-34.54, -14...",5.0
2,梁家榮,徐若熙,蔣少宏,0,"['S', 'F', 'B', 'H']",2B,H,0.0,0.0,0.0,1,"[S, F, B, H]","[CU, CU, CH, CH]","[121, 124, 132, 133]","[(-12.19, -26.41), (-60.95, -58.92), (95.49, 2...",4.0
3,廖健富,徐若熙,蔣少宏,2,"['B', 'H']",1B,H,0.0,0.0,0.0,1,"[B, H]","[FF, FF]","[146, 144]","[(89.4, 93.46), (22.35, 6.1)]",2.0
4,朱育賢,徐若熙,蔣少宏,0,"['S', 'SW', 'H']",FO,M,0.0,0.0,0.0,1,"[S, SW, H]","[FF, CU, FF]","[149, 124, 148]","[(-14.22, 4.06), (-10.16, -75.17), (2.03, -4.06)]",3.0
5,林承飛,徐若熙,蔣少宏,0,"['SW', 'SW', 'B', 'F', 'F', 'S']",SO,,0.0,0.0,0.0,1,"[SW, SW, B, F, F, S]","[FF, FF, FF, FF, SL, CU]","[150, 150, 153, 149, 134, 122]","[(-4.06, 67.05), (16.25, -16.25), (-83.3, 18.2...",6.0
6,邱丹,徐若熙,蔣少宏,0,['H'],GO,H,0.0,0.0,0.0,1,[H],[CH],[133],"[(22.35, 40.63)]",1.0
7,嚴宏鈞,徐若熙,蔣少宏,0,"['B', 'B', 'S', 'H']",GO,S,0.0,0.0,0.0,1,"[B, B, S, H]","[FF, SL, FF, FF]","[147, 136, 148, 147]","[(53.84, 90.44), (-75.37, -88.29), (-25.84, -1...",4.0
8,林禹叡,徐若熙,蔣少宏,0,"['B', 'B', 'F', 'S', 'SW']",SO,,0.0,0.0,0.0,1,"[B, B, F, S, SW]","[SL, FF, FF, FF, CU]","[134, 148, 148, 146, 122]","[(4.31, 157.2), (75.37, 68.91), (6.46, 34.45),...",5.0
9,陳晨威,徐若熙,蔣少宏,0,"['B', 'S', 'B', 'F', 'SW']",SO,,0.0,0.0,0.0,2,"[B, S, B, F, SW]","[CU, FF, FF, FF, CH]","[120, 148, 150, 151, 133]","[(-86.14, -58.14), (-8.61, -38.76), (-23.69, 1...",5.0


Index(['batterName', 'pitcherName', 'catcherName', 'bases', 'pitchCodes',
       'result', 'hardness', 'AVG', 'OPS', 'whiff', 'pa_seq',
       'pitchCodes_events', 'pitchTypes_events', 'velocities_events',
       'coords_events', 'n_pitches'],
      dtype='object')


# Pitchers and batters with at least 50 PAs

In [7]:
# Minimum number of plate appearances a batter needs in `paired` to be kept.
MIN_BATTER_PA = 50

batter_names = paired['batterName'].unique()
print(f"Found {len(batter_names)} unique batters")

# Count PAs per batter in a single pass instead of re-scanning `paired` once
# per name. `.get(..., 0)` covers a missing (NaN) name, which value_counts
# leaves out and which the per-name equality scan also counted as zero.
batter_pa_counts = paired['batterName'].value_counts()
batter_names_filtered = [
    batter for batter in batter_names
    if batter_pa_counts.get(batter, 0) >= MIN_BATTER_PA
]

print(f'Filtered to {len(batter_names_filtered)} batters with at least {MIN_BATTER_PA} PA each')

Found 173 unique batters
Filtered to 113 batters with at least 50 PA each


In [8]:
# Minimum number of plate appearances a pitcher needs in `paired` to be kept.
MIN_PITCHER_PA = 50

pitcher_names = paired['pitcherName'].unique()
print(f"Found {len(pitcher_names)} unique pitchers")

# Count PAs per pitcher in a single pass instead of re-scanning `paired` once
# per name. `.get(..., 0)` covers a missing (NaN) name, which value_counts
# leaves out and which the per-name equality scan also counted as zero.
pitcher_pa_counts = paired['pitcherName'].value_counts()
pitcher_names_filtered = [
    pitcher for pitcher in pitcher_names
    if pitcher_pa_counts.get(pitcher, 0) >= MIN_PITCHER_PA
]

print(f'Filtered to {len(pitcher_names_filtered)} pitchers with at least {MIN_PITCHER_PA} PA each')

Found 168 unique pitchers
Filtered to 124 pitchers with at least 50 PA each


In [9]:
# Keep a PA only when both its batter and its pitcher passed the PA-count filters.
# The counts were taken on the full `paired` frame, so a kept player may have
# fewer PAs left in `paired_filtered` than the threshold.
batter_kept = paired['batterName'].isin(batter_names_filtered)
pitcher_kept = paired['pitcherName'].isin(pitcher_names_filtered)
paired_filtered = paired[batter_kept & pitcher_kept]
print(f"Filtered PA count: {len(paired_filtered)}")

# events_filtered = events[events['batterName'].isin(batter_names_filtered) & events['pitcherName'].isin(pitcher_names_filtered)]
# print(f"Filtered Event count: {len(events_filtered)}")

Filtered PA count: 26009


# Write csv

In [10]:
# Write the filtered pairing to disk, creating the target directory if needed.
# 'utf-8-sig' writes a BOM at the start of the file.
out_path = './data/paired_filtered.csv'
out_dir = os.path.dirname(out_path) or '.'
os.makedirs(out_dir, exist_ok=True)
paired_filtered.to_csv(out_path, index=False, encoding='utf-8-sig')
n_rows, n_cols = paired_filtered.shape
print(f"Wrote {n_rows} rows x {n_cols} cols to {out_path!r}")

Wrote 26009 rows x 16 cols to './data/paired_filtered.csv'
