In [1]:

from pybaseball import standings, batting_stats, pitching_stats
import pandas as pd

# 1. 팀 순위 데이터
print("=== 팀 순위 수집  ===")
standings_list = []

for year in range(2018, 2026):
    standings_data = standings(year)
    
    for division in standings_data:
        division['Year'] = year
        standings_list.append(division)

team_standings = pd.concat(standings_list, ignore_index=True)
team_standings.to_csv('team_standings.csv', index=False)
print(f"팀 순위: {len(team_standings)}개 저장")


# 2. 타자 성적 데이터
print("\n=== 타자 성적 수집  ===")
batting_list = []

for year in range(2018, 2026):
    batting = batting_stats(year)
    batting['Year'] = year
    batting_list.append(batting)

all_batting = pd.concat(batting_list, ignore_index=True)
all_batting.to_csv('batting_stats.csv', index=False)
print(f"타자: {len(all_batting)}개 저장")


# 3. 투수 성적 데이터
print("\n=== 투수 성적 수집  ===")
pitching_list = []

for year in range(2018, 2026):
    pitching = pitching_stats(year)
    pitching['Year'] = year
    pitching_list.append(pitching)

all_pitching = pd.concat(pitching_list, ignore_index=True)
all_pitching.to_csv('pitching_stats.csv', index=False)
print(f"투수: {len(all_pitching)}개 저장")

print("\n모든 데이터 수집 완료")
print(f"   - team_standings.csv")
print(f"   - batting_stats.csv")
print(f"   - pitching_stats.csv")



=== 팀 순위 수집  ===
팀 순위: 240개 저장

=== 타자 성적 수집  ===
타자: 1088개 저장

=== 투수 성적 수집  ===
투수: 397개 저장

모든 데이터 수집 완료
   - team_standings.csv
   - batting_stats.csv
   - pitching_stats.csv


In [5]:
pitching = pd.read_csv('./data/pitching_stats.csv')
batting = pd.read_csv('./data/batting_stats.csv')

In [6]:
print(pitching.shape)
print(pitching.columns)



(397, 394)
Index(['IDfg', 'Season', 'Name', 'Team', 'Age', 'W', 'L', 'WAR', 'ERA', 'G',
       ...
       'Stf+ FS', 'Loc+ FS', 'Pit+ FS', 'Stuff+', 'Location+', 'Pitching+',
       'Stf+ FO', 'Loc+ FO', 'Pit+ FO', 'Year'],
      dtype='str', length=394)


In [7]:
print(batting.shape)
print(batting.columns)


(1088, 321)
Index(['IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B',
       ...
       'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'xBA', 'xSLG',
       'xwOBA', 'L-WAR', 'Year'],
      dtype='str', length=321)


In [None]:
import pandas as pd

def prepare_fangraphs_stats(path: str, year: int) -> pd.DataFrame:
    df = pd.read_csv(path)

    # 시즌 필터: Season 컬럼 우선, 없으면 Year 사용
    if "Season" in df.columns:
        df = df[df["Season"] == year].copy()
    elif "Year" in df.columns:
        df = df[df["Year"] == year].copy()
    else:
        raise ValueError(f"{path} 에 Season/Year 컬럼이 없습니다. 컬럼: {list(df.columns)[:20]} ...")

    # 이름 컬럼: Name -> name_key
    if "Name" not in df.columns:
        raise ValueError(f"{path} 에 Name 컬럼이 없습니다. 컬럼: {list(df.columns)[:20]} ...")

    df["name_key"] = df["Name"].astype(str).str.lower().str.strip()
    return df


In [None]:
import json
import re
import requests
import pandas as pd
from typing import List

# -----------------------------
# 1) 연도별 URL 분기
# -----------------------------
from typing import List

def build_usatoday_salary_urls(year: int) -> List[str]:
    urls = []

    # 2021은 연도 슬러그 없이 기본 경로
    if year == 2021:
        urls.append("https://databases.usatoday.com/mlb-salaries/")
        urls.append("https://databases.usatoday.com/mlb-salaries/page/1/")
        # 혹시 모르니 연도형도 fallback로 시도
        urls.append("https://databases.usatoday.com/mlb-salaries-2021/")
        return list(dict.fromkeys(urls))

    # 2023+ : major-league-baseball-salaries-{year}
    if year >= 2023:
        urls.append(f"https://databases.usatoday.com/major-league-baseball-salaries-{year}/")
        # fallback
        urls.append(f"https://databases.usatoday.com/mlb-salaries-{year}/")
        return list(dict.fromkeys(urls))

    # 2022 및 그 외(<=2022): mlb-salaries-{year}
    urls.append(f"https://databases.usatoday.com/mlb-salaries-{year}/")
    # fallback
    urls.append(f"https://databases.usatoday.com/major-league-baseball-salaries-{year}/")
    return list(dict.fromkeys(urls))



# -----------------------------
# 2) salary fetcher
# -----------------------------
def fetch_salary_data_usatoday(year: int, max_pages: int = 200) -> pd.DataFrame:
    headers = {"User-Agent": "Mozilla/5.0"}
    html = None
    used_url = None

    # 1) 페이지 HTML 확보
    for url in build_usatoday_salary_urls(year):
        r = requests.get(url, headers=headers, timeout=30)
        if r.status_code == 200:
            html = r.text
            used_url = url
            break

    if html is None:
        raise RuntimeError(f"{year} salary page not found")

    print(f"[OK] salary page: {used_url}")

    # 2) sitedata 추출
    m = re.search(r"var\s+sitedata\s*=\s*(\{.*?\});", html, flags=re.S)
    if not m:
        raise RuntimeError("sitedata not found")

    sitedata = json.loads(m.group(1))

    ajax_url = sitedata["ajax_url"]
    nonce = sitedata["ajax_nonce"]
    page_id = sitedata["pageID"]
    blog_id = sitedata.get("blogID", "")

    def encode_form(d: dict) -> str:
        from urllib.parse import quote_plus
        return "&".join(f"{k}={quote_plus(str(v))}" for k, v in d.items())

    # 3) pagination
    rows = []
    fields = None
    page = 1

    for _ in range(max_pages):
        payload = {
            "action": "cspFetchTable",
            "security": nonce,
            "pageID": page_id,
            "blogID": blog_id,
            "sortBy": "PK_ID",
            "sortOrder": "asc",
            "page": page,
            "searches": json.dumps({}),
            "heads": "true",
        }

        r = requests.post(
            ajax_url,
            headers={
                **headers,
                "Content-Type": "application/x-www-form-urlencoded",
                "Referer": used_url,
            },
            data=encode_form(payload),
            timeout=30,
        )
        r.raise_for_status()
        data = r.json()

        if not data.get("success"):
            break

        result = data["data"].get("Result", [])
        if not result:
            break

        rows.extend(result)

        if fields is None:
            fields = data["data"].get("Fields", [])

        page += 1

    df = pd.DataFrame(rows)

    # 컬럼 라벨 정리
    if fields:
        label_map = {f["Name"]: f.get("Label", f["Name"]) for f in fields}
        df = df.rename(columns=label_map)

    df["season"] = year
    return df


In [42]:
year = 2021
page_url = f"https://databases.usatoday.com/major-league-baseball-salaries-{year}/"
salary_df = fetch_salary_data_usatoday(year)

print(salary_df.shape)
print(salary_df.head())
print(salary_df.columns)

out_path = f"./data/mlb_salary_{year}.csv"
salary_df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("saved:", out_path)


[OK] salary page: https://databases.usatoday.com/mlb-salaries/
(898, 11)
   PK_ID              Player  Season     Team Position    Salary        Years  \
0      1         Ahmed, Nick    2021  Arizona       SS   8125000  4 (2020-23)   
1      2  Bumgarner, Madison    2021  Arizona       SP  17882892  5 (2020-24)   
2      3   Cabrera, Asdrubal    2021  Arizona       2B   1750000                
3      4       Calhoun, Kole    2021  Arizona       OF   8000000  2 (2020-21)   
4      5      Clarke, Taylor    2021  Arizona       SP    601800                

   Total value  Average annual  \
0   32500000.0         8125000   
1   85000000.0        17000000   
2          NaN         1750000   
3   16000000.0         8000000   
4          NaN          601800   

                                              Links  season  
0  [anchor link="https://google.com/" title="test"]    2021  
1  [anchor link="https://google.com/" title="test"]    2021  
2  [anchor link="https://google.com/" title="test

In [10]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd

DATA_DIR = "./data"
YEARS = [2021, 2022, 2023, 2024, 2025]

# -----------------------------
# 0) Name normalize (강화버전)
# -----------------------------
def normalize_name(name: str) -> str:
    if pd.isna(name):
        return None
    s = str(name).strip()

    # "Last, First" -> "First Last"
    if "," in s:
        last, first = s.split(",", 1)
        s = f"{first.strip()} {last.strip()}"

    # 악센트 제거 (José -> Jose)
    s = unicodedata.normalize("NFKD", s)
    s = "".join(c for c in s if not unicodedata.combining(c))

    s = s.lower().strip()

    # 점 제거 (a.j. -> aj)
    s = s.replace(".", "")

    # 특수문자 정리 (하이픈/어포스트로피는 유지해도 되지만 일단 공백만 정리)
    s = re.sub(r"\s+", " ", s)
    return s


# -----------------------------
# 1) Salary clean (연도별 CSV)
#    - 다양한 컬럼명(USA Today) 통일
# -----------------------------
def load_and_clean_salary(year: int) -> pd.DataFrame:
    path = os.path.join(DATA_DIR, f"mlb_salary_{year}.csv")
    sal = pd.read_csv(path)

    # USA Today raw 컬럼 케이스 대응
    rename_map = {
        "Player": "player",
        "Team": "team",
        "Position": "position",
        "Salary": "salary",
        "Average Annual": "aav",
        "Total value": "total_value",
        "Total Value": "total_value",
        "Years": "years",
        "PK_ID": "pk_id",
    }
    sal = sal.rename(columns={k: v for k, v in rename_map.items() if k in sal.columns})

    # 필수 컬럼 체크
    need = ["player", "team", "position", "salary"]
    missing = [c for c in need if c not in sal.columns]
    if missing:
        raise ValueError(f"[salary {year}] missing columns: {missing} / got={list(sal.columns)[:20]}")

    # 숫자형 변환
    for c in ["salary", "aav", "total_value"]:
        if c in sal.columns:
            sal[c] = pd.to_numeric(sal[c], errors="coerce")

    sal["season"] = year
    sal["name_key"] = sal["player"].apply(normalize_name)

    keep = ["season","player","team","position","name_key","salary"]
    for c in ["aav","total_value","years"]:
        if c in sal.columns:
            keep.append(c)

    sal = sal[keep].copy()
    sal = sal.dropna(subset=["name_key"])
    sal["position"] = sal["position"].astype(str).str.strip().str.upper()

    return sal


# -----------------------------
# 2) Stats load + clean (batting / pitching)
#    - 파일은 여러 연도 합쳐져 있음 -> year로 필터
#    - 컬럼명 통일: Season/Year 혼재 대비
# -----------------------------
def load_stats_all():
    bat = pd.read_csv(os.path.join(DATA_DIR, "batting_stats.csv"))
    pit = pd.read_csv(os.path.join(DATA_DIR, "pitching_stats.csv"))

    # 시즌 컬럼 통일 (Season 또는 Year)
    def ensure_season(df: pd.DataFrame) -> pd.DataFrame:
        if "season" not in df.columns:
            if "Season" in df.columns:
                df = df.rename(columns={"Season": "season"})
            elif "Year" in df.columns:
                df = df.rename(columns={"Year": "season"})
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
        return df

    bat = ensure_season(bat)
    pit = ensure_season(pit)

    # 선수명 컬럼 통일 (Name)
    if "name_key" not in bat.columns:
        if "Name" in bat.columns:
            bat["name_key"] = bat["Name"].apply(normalize_name)
        elif "player" in bat.columns:
            bat["name_key"] = bat["player"].apply(normalize_name)
        else:
            raise ValueError("batting_stats: can't find Name/player column")

    if "name_key" not in pit.columns:
        if "Name" in pit.columns:
            pit["name_key"] = pit["Name"].apply(normalize_name)
        elif "player" in pit.columns:
            pit["name_key"] = pit["player"].apply(normalize_name)
        else:
            raise ValueError("pitching_stats: can't find Name/player column")

    # 중복 key 처리: (season, name_key) 기준으로 중복이 있으면 최신/첫 행만 남기거나 평균내기
    # 일단 "첫 행" 유지 (필요하면 mean 집계로 바꿀 수 있음)
    bat = bat.dropna(subset=["season","name_key"]).drop_duplicates(subset=["season","name_key"])
    pit = pit.dropna(subset=["season","name_key"]).drop_duplicates(subset=["season","name_key"])

    return bat, pit


# -----------------------------
# 3) 투/타 분리 규칙 (salary의 position 기준)
# -----------------------------
BATTER_POS = {"C","1B","2B","3B","SS","OF","DH","INF"}  # INF는 일단 타자로
PITCHER_POS = {"SP","RP","P","RHP","LHP"}              # RHP/LHP 포함

def split_salary_by_position(sal: pd.DataFrame):
    pos = sal["position"].fillna("").astype(str).str.upper().str.strip()

    is_pitcher = pos.isin(PITCHER_POS) | pos.str.contains("P")
    # 예외: DH/SP 같은 혼합 표기는 투수로 보내지 말고 따로 빼거나 규칙 추가
    # 여기선 "P 들어가면 투수"가 강해서 DH/SP 등은 투수로 갈 수 있음 -> 아래 예외처리
    mix_mask = pos.str.contains("/")  # DH/SP, DH/RHP 같은 것들
    # 혼합 표기는 우선 제외(별도 확인)
    mixed = sal[mix_mask].copy()

    pitchers = sal[is_pitcher & (~mix_mask)].copy()
    batters  = sal[(~is_pitcher) & (~mix_mask)].copy()

    # INF는 타자쪽으로 강제
    batters = pd.concat([batters, sal[pos.eq("INF") & (~mix_mask)]], ignore_index=True)

    return batters, pitchers, mixed


# -----------------------------
# 4) 연도별 merge + 저장 + 전체 합본 저장
# -----------------------------
def merge_salary_stats_by_year(years=YEARS):
    batting_all, pitching_all = load_stats_all()

    out_bat_yearly = []
    out_pit_yearly = []
    out_mixed_yearly = []

    for y in years:
        sal = load_and_clean_salary(y)
        sal_bat, sal_pit, sal_mix = split_salary_by_position(sal)

        bat_y = batting_all[batting_all["season"].astype("Int64") == y].copy()
        pit_y = pitching_all[pitching_all["season"].astype("Int64") == y].copy()

        # merge (inner)
        merged_bat = sal_bat.merge(bat_y, on=["season","name_key"], how="inner", suffixes=("_salary","_bat"))
        merged_pit = sal_pit.merge(pit_y, on=["season","name_key"], how="inner", suffixes=("_salary","_pit"))

        # match rate 로깅
        bat_rate = merged_bat["name_key"].nunique() / max(1, sal_bat["name_key"].nunique())
        pit_rate = merged_pit["name_key"].nunique() / max(1, sal_pit["name_key"].nunique())

        print(f"[{y}] salary_bat={sal_bat.shape} merged_bat={merged_bat.shape} match={bat_rate:.3f}")
        print(f"[{y}] salary_pit={sal_pit.shape} merged_pit={merged_pit.shape} match={pit_rate:.3f}")
        if len(sal_mix):
            print(f"[{y}] mixed_position_rows={sal_mix.shape[0]} (saved separately)")

        # 저장(연도별)
        path_bat = os.path.join(DATA_DIR, f"mlb_batter_salary_stats_{y}.csv")
        path_pit = os.path.join(DATA_DIR, f"mlb_pitcher_salary_stats_{y}.csv")
        merged_bat.to_csv(path_bat, index=False, encoding="utf-8-sig")
        merged_pit.to_csv(path_pit, index=False, encoding="utf-8-sig")

        out_bat_yearly.append(merged_bat)
        out_pit_yearly.append(merged_pit)
        out_mixed_yearly.append(sal_mix.assign(season=y))

    # 전체 합본
    bat_all = pd.concat(out_bat_yearly, ignore_index=True) if out_bat_yearly else pd.DataFrame()
    pit_all = pd.concat(out_pit_yearly, ignore_index=True) if out_pit_yearly else pd.DataFrame()
    mix_all = pd.concat(out_mixed_yearly, ignore_index=True) if out_mixed_yearly else pd.DataFrame()

    bat_all.to_csv(os.path.join(DATA_DIR, "mlb_batter_salary_stats_2021_2025.csv"), index=False, encoding="utf-8-sig")
    pit_all.to_csv(os.path.join(DATA_DIR, "mlb_pitcher_salary_stats_2021_2025.csv"), index=False, encoding="utf-8-sig")
    mix_all.to_csv(os.path.join(DATA_DIR, "mlb_salary_mixed_positions_2021_2025.csv"), index=False, encoding="utf-8-sig")

    print("[DONE] batter_all:", bat_all.shape, "pitcher_all:", pit_all.shape, "mixed:", mix_all.shape)

    return bat_all, pit_all, mix_all


# 실행
bat_all, pit_all, mix_all = merge_salary_stats_by_year()


  bat["name_key"] = bat["Name"].apply(normalize_name)
  pit["name_key"] = pit["Name"].apply(normalize_name)


[2021] salary_bat=(429, 8) merged_bat=(126, 328) match=0.304
[2021] salary_pit=(483, 8) merged_pit=(39, 401) match=0.081
[2021] mixed_position_rows=1 (saved separately)
[2022] salary_bat=(437, 9) merged_bat=(119, 329) match=0.276
[2022] salary_pit=(542, 9) merged_pit=(43, 402) match=0.079
[2022] mixed_position_rows=1 (saved separately)
[2023] salary_bat=(431, 9) merged_bat=(118, 329) match=0.274
[2023] salary_pit=(513, 9) merged_pit=(43, 402) match=0.084
[2024] salary_bat=(470, 9) merged_bat=(127, 329) match=0.291
[2024] salary_pit=(525, 9) merged_pit=(58, 402) match=0.111
[2024] mixed_position_rows=1 (saved separately)
[2025] salary_bat=(463, 9) merged_bat=(139, 329) match=0.318
[2025] salary_pit=(521, 9) merged_pit=(51, 402) match=0.098
[2025] mixed_position_rows=1 (saved separately)
[DONE] batter_all: (629, 329) pitcher_all: (234, 402) mixed: (4, 9)


In [69]:
df = pd.read_csv("./data/mlb_salary_mixed_positions_2021_2025.csv")

In [70]:
df.isna().sum().sort_values(ascending=False)

aav            1
season         0
player         0
team           0
position       0
name_key       0
salary         0
total_value    0
years          0
dtype: int64

In [71]:
df.columns

Index(['season', 'player', 'team', 'position', 'name_key', 'salary',
       'total_value', 'years', 'aav'],
      dtype='str')

In [11]:
import re
import glob
import numpy as np
import pandas as pd

DATA_DIR = "./data"

# -----------------------------
# 0) Helpers: name key normalize
# -----------------------------
def normalize_name_key(s: str) -> str:
    """이름 매칭용 key: 소문자 + 특수문자/점 제거 + 공백 정리"""
    if pd.isna(s):
        return np.nan
    s = str(s).strip().lower()
    s = s.replace("\u00a0", " ")          # NBSP
    s = re.sub(r"\.", "", s)             # 점 제거 (A.J. -> AJ)
    s = re.sub(r"[^a-z\s\-']", " ", s)   # 알파벳/공백/하이픈/어포스트로피만
    s = re.sub(r"\s+", " ", s).strip()
    return s

def normalize_salary_player_to_key(player: str) -> str:
    """
    USA Today salary는 'Last, First' 형태가 많음.
    -> 'first last'로 바꾼 뒤 normalize_name_key 적용
    """
    if pd.isna(player):
        return np.nan
    player = str(player).strip()
    if "," in player:
        last, first = player.split(",", 1)
        player = f"{first.strip()} {last.strip()}"
    return normalize_name_key(player)

# -----------------------------
# 1) Salary clean (usatoday csv)
# -----------------------------
def clean_salary_df(raw: pd.DataFrame, year: int) -> pd.DataFrame:
    df = raw.copy()

    # 컬럼 후보들(연도별/수집별로 조금 다를 수 있어서)
    colmap = {
        "Player": "player",
        "Team": "team",
        "Position": "position",
        "Salary": "salary",
        "Average Annual": "aav",
        "Total value": "total_value",
        "Total Value": "total_value",
        "Years": "years",
    }
    df = df.rename(columns={k: v for k, v in colmap.items() if k in df.columns})

    must_have = ["player", "team", "position", "salary"]
    missing = [c for c in must_have if c not in df.columns]
    if missing:
        raise ValueError(f"[salary {year}] missing columns: {missing} / columns={df.columns.tolist()[:30]}")

    df["season"] = int(year)

    # 숫자 변환
    for c in ["salary", "aav", "total_value"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # name_key
    df["name_key"] = df["player"].apply(normalize_salary_player_to_key)

    keep = ["season", "player", "team", "position", "name_key", "salary"]
    for c in ["aav", "total_value", "years"]:
        if c in df.columns:
            keep.append(c)

    df = df[keep].dropna(subset=["name_key"])
    df = df.drop_duplicates(subset=["season", "name_key", "team", "position"], keep="first")
    return df

# -----------------------------
# 2) Stats load & prepare
# -----------------------------
def _pick_season_col(df: pd.DataFrame) -> str:
    """스탯 df에서 시즌 컬럼 자동 탐색 (Season or Year)"""
    for c in ["Season", "season", "Year", "year"]:
        if c in df.columns:
            return c
    raise ValueError(f"season column not found in stats df. columns={df.columns.tolist()[:30]}")

def prepare_stats_df(stats_raw: pd.DataFrame, kind: str) -> pd.DataFrame:
    """
    kind: 'batting' or 'pitching'
    - Name -> player
    - 시즌 컬럼 정규화 -> season(int)
    - name_key 생성
    """
    df = stats_raw.copy()

    # FanGraphs 스타일: Name, Season/Year
    if "Name" in df.columns and "player" not in df.columns:
        df = df.rename(columns={"Name": "player"})
    if "player" not in df.columns:
        raise ValueError(f"[{kind}] player column not found. columns={df.columns.tolist()[:30]}")

    season_col = _pick_season_col(df)
    df["season"] = pd.to_numeric(df[season_col], errors="coerce").astype("Int64")

    df["name_key"] = df["player"].apply(normalize_name_key)

    # season, name_key 기준으로 중복 제거 (팀이 여러개면 남길 기준은 상황에 따라 다름)
    # 여기서는 같은 시즌 동일 선수 중복이 있으면 첫 행 유지
    df = df.dropna(subset=["season", "name_key"]).copy()
    df["season"] = df["season"].astype(int)
    df = df.drop_duplicates(subset=["season", "name_key"], keep="first")

    return df

# -----------------------------
# 3) Position split
# -----------------------------
BATTER_POS = {"C","1B","2B","3B","SS","OF","DH","LF","CF","RF","INF","UTIL"}  # salary에 이런 케이스도 있음
PITCHER_POS = {"P","SP","RP","LHP","RHP"}

def classify_role_from_position(pos: str) -> str:
    if pd.isna(pos):
        return "unknown"
    pos = str(pos).strip().upper()
    # 혼합 표기(DH/SP 등) 처리
    if any(p in pos for p in ["SP","RP","P","LHP","RHP"]):
        # 투수 표기가 하나라도 들어있으면 투수로
        return "pitcher"
    if any(p in pos for p in ["C","1B","2B","3B","SS","OF","DH","LF","CF","RF","INF","UTIL"]):
        return "batter"
    return "unknown"

# -----------------------------
# 4) Merge functions
# -----------------------------
def merge_salary_with_stats(salary_df: pd.DataFrame, batting_df: pd.DataFrame, pitching_df: pd.DataFrame):
    sal = salary_df.copy()
    sal["role"] = sal["position"].apply(classify_role_from_position)

    bat_sal = sal[sal["role"]=="batter"].copy()
    pit_sal = sal[sal["role"]=="pitcher"].copy()

    merged_bat = bat_sal.merge(
        batting_df,
        on=["season", "name_key"],
        how="inner",
        suffixes=("_salary", "_bat")
    )
    merged_pit = pit_sal.merge(
        pitching_df,
        on=["season", "name_key"],
        how="inner",
        suffixes=("_salary", "_pit")
    )

    # 칼럼 충돌 정리: stats 쪽 player 컬럼이 남아있으면 Salary player를 우선
    if "player_salary" in merged_bat.columns:
        merged_bat["player"] = merged_bat["player_salary"]
    if "player_salary" in merged_pit.columns:
        merged_pit["player"] = merged_pit["player_salary"]

    return merged_bat, merged_pit

# -----------------------------
# 5) Batch pipeline (2021-2025)
# -----------------------------
def run_batch_merge(
    years=range(2021, 2026),
    salary_pattern=f"{DATA_DIR}/mlb_salary_{{year}}.csv",
    batting_path=f"{DATA_DIR}/batting_stats.csv",
    pitching_path=f"{DATA_DIR}/pitching_stats.csv",
    out_prefix=f"{DATA_DIR}/mlb"
):
    # stats load once
    batting_raw = pd.read_csv(batting_path)
    pitching_raw = pd.read_csv(pitching_path)

    batting_df = prepare_stats_df(batting_raw, kind="batting")
    pitching_df = prepare_stats_df(pitching_raw, kind="pitching")

    all_salary_clean = []
    all_bat_merged = []
    all_pit_merged = []

    for y in years:
        sal_path = salary_pattern.format(year=y)
        raw = pd.read_csv(sal_path)
        sal_clean = clean_salary_df(raw, year=y)

        merged_bat, merged_pit = merge_salary_with_stats(sal_clean, batting_df, pitching_df)

        # match rate (대략적인 진단용)
        bat_rate = merged_bat["name_key"].nunique() / max(1, sal_clean[sal_clean["position"].apply(classify_role_from_position)=="batter"]["name_key"].nunique())
        pit_rate = merged_pit["name_key"].nunique() / max(1, sal_clean[sal_clean["position"].apply(classify_role_from_position)=="pitcher"]["name_key"].nunique())

        # save per-year
        out_salary_clean = f"{out_prefix}_salary_clean_{y}.csv"
        out_bat = f"{out_prefix}_batter_salary_stats_{y}.csv"
        out_pit = f"{out_prefix}_pitcher_salary_stats_{y}.csv"

        sal_clean.to_csv(out_salary_clean, index=False, encoding="utf-8-sig")
        merged_bat.to_csv(out_bat, index=False, encoding="utf-8-sig")
        merged_pit.to_csv(out_pit, index=False, encoding="utf-8-sig")

        print(f"[{y}] salary_clean={sal_clean.shape} "
              f"bat_merged={merged_bat.shape} bat_match≈{bat_rate:.3f} "
              f"pit_merged={merged_pit.shape} pit_match≈{pit_rate:.3f} "
              f"saved: {out_bat}, {out_pit}")

        all_salary_clean.append(sal_clean)
        all_bat_merged.append(merged_bat)
        all_pit_merged.append(merged_pit)

    salary_all = pd.concat(all_salary_clean, ignore_index=True)
    bat_all = pd.concat(all_bat_merged, ignore_index=True)
    pit_all = pd.concat(all_pit_merged, ignore_index=True)

    out_salary_all = f"{out_prefix}_salary_clean_{years[0]}_{years[-1]}.csv"
    out_bat_all = f"{out_prefix}_batter_salary_stats_{years[0]}_{years[-1]}.csv"
    out_pit_all = f"{out_prefix}_pitcher_salary_stats_{years[0]}_{years[-1]}.csv"

    salary_all.to_csv(out_salary_all, index=False, encoding="utf-8-sig")
    bat_all.to_csv(out_bat_all, index=False, encoding="utf-8-sig")
    pit_all.to_csv(out_pit_all, index=False, encoding="utf-8-sig")

    print(f"[DONE] salary_all={salary_all.shape} bat_all={bat_all.shape} pit_all={pit_all.shape}")
    print(f"saved: {out_salary_all}\n       {out_bat_all}\n       {out_pit_all}")

    return salary_all, bat_all, pit_all

# 실행
salary_all, batter_all, pitcher_all = run_batch_merge(years=list(range(2021, 2026)))


[2021] salary_clean=(898, 8) bat_merged=(126, 331) bat_match≈0.306 pit_merged=(39, 404) pit_match≈0.081 saved: ./data/mlb_batter_salary_stats_2021.csv, ./data/mlb_pitcher_salary_stats_2021.csv
[2022] salary_clean=(971, 9) bat_merged=(118, 332) bat_match≈0.276 pit_merged=(44, 405) pit_match≈0.081 saved: ./data/mlb_batter_salary_stats_2022.csv, ./data/mlb_pitcher_salary_stats_2022.csv
[2023] salary_clean=(944, 9) bat_merged=(121, 332) bat_match≈0.281 pit_merged=(43, 405) pit_match≈0.084 saved: ./data/mlb_batter_salary_stats_2023.csv, ./data/mlb_pitcher_salary_stats_2023.csv
[2024] salary_clean=(952, 9) bat_merged=(124, 332) bat_match≈0.291 pit_merged=(58, 405) pit_match≈0.111 saved: ./data/mlb_batter_salary_stats_2024.csv, ./data/mlb_pitcher_salary_stats_2024.csv
[2025] salary_clean=(954, 9) bat_merged=(137, 332) bat_match≈0.319 pit_merged=(51, 405) pit_match≈0.098 saved: ./data/mlb_batter_salary_stats_2025.csv, ./data/mlb_pitcher_salary_stats_2025.csv
[DONE] salary_all=(4719, 9) bat_all

In [None]:
bat = pd.read_csv("./data/mlb_batter_salary_stats_2021_2025.csv")
pit = pd.read_csv("./data/mlb_pitcher_salary_stats_2021_2025.csv")

In [None]:
cols = bat.columns.to_list()
print(cols)


Index(['season', 'player_salary', 'team', 'position', 'name_key', 'salary',
       'total_value', 'years', 'role', 'IDfg',
       ...
       'Events', 'CStr%', 'CSW%', 'xBA', 'xSLG', 'xwOBA', 'L-WAR', 'Year',
       'player', 'aav'],
      dtype='str', length=332)

In [75]:
pit.columns

Index(['season', 'player_salary', 'team', 'position', 'name_key', 'salary',
       'total_value', 'years', 'role', 'IDfg',
       ...
       'Pit+ FS', 'Stuff+', 'Location+', 'Pitching+', 'Stf+ FO', 'Loc+ FO',
       'Pit+ FO', 'Year', 'player', 'aav'],
      dtype='str', length=405)