In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the rest of the session so cell output
# stays uncluttered (this hides deprecation and pandas warnings as well).
import warnings
warnings.filterwarnings(action='ignore')

# Apply seaborn's default theme first; the rcParams overrides further down are
# set after it, in the same order as before.
sns.set()

# Optional Korean font overrides, currently disabled.
#plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'

# Figure defaults: 12x6 figure size, base font size 14, and
# 'axes.unicode_minus' switched off.
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 14,
    'axes.unicode_minus': False,
})

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Input files, read from the current working directory.
pitcher_stats_csv = '2124_투수_스탯_데이터합본(칼럼추가.ver).csv'
salary_csv = '2021_2025_선수연봉_등급(순위).csv'
pitcher_awards_csv = '2020-2025년 투수 수상.csv'

# Load pitcher stats, player salaries, and pitcher awards, in that order.
df_pit = pd.read_csv(pitcher_stats_csv)
df_sal = pd.read_csv(salary_csv)
df_award = pd.read_csv(pitcher_awards_csv)

In [3]:
# Keep only award rows whose season ('연도') is not 2020.
# NOTE(review): presumably because the stats table covers 2021-2024 — confirm.
df_award = df_award.loc[df_award['연도'].ne(2020)]

In [4]:
# Award record added by hand: Hart (NC), 2024 Golden Glove for pitchers.
# NOTE(review): presumably this entry is missing from the awards CSV — confirm.
new_row = {
    '연도': 2024,
    '선수이름': '하트',
    '소속팀': 'NC',
    '생년월일': '1992-03-07',
    '수상카테고리': '수비부문',
    '수상내역': '골든글러브_투수',
}

# Wrap the record in a one-row frame and stack it under the existing awards,
# renumbering the index from zero. Re-running this cell appends the row again.
extra_award = pd.DataFrame([new_row])
df_award = pd.concat([df_award, extra_award], ignore_index=True)

In [5]:
# Step 1: collect every (season, player name) pair present in the award table.
# A set is used so the membership checks that follow are hash lookups.
award_set = set(
    df_award[['연도', '선수이름']].itertuples(index=False, name=None)
)

In [6]:
# Step 2: flag each pitcher-season whose (season, player name) pair is in
# award_set: 1 = pair found, 0 = not found.
# Membership is tested on plain tuples zipped from the two key columns instead
# of DataFrame.apply(axis=1), which builds a Series object for every row. The
# resulting 0/1 values are the same, and an empty df_pit now simply yields an
# empty column.
# NOTE(review): the key uses the player name only, so two different players who
# share a name in the same season would both be flagged. df_award also carries
# '소속팀' and '생년월일' fields that could disambiguate — confirm whether needed.
df_pit['수상여부'] = [
    int(key in award_set)
    for key in zip(df_pit['연도'], df_pit['선수'])
]

In [7]:
# Spot-check the new flag on one player: show every row for '미란다'.
df_pit.loc[df_pit['선수'].eq('미란다')]

Unnamed: 0,선수,포지션,WAR,G,GS,GR,GF,CG,SHO,W,...,rRA9pf,FIP,WHIP,팀,pid,연도,선발,중간계투,마무리,수상여부
31,미란다,투수,7.49,28,28,0,1,1,1,14,...,2.73,2.42,1.14,두산 베어스,14751,2021,1,0,0,1
358,미란다,투수,-0.14,3,3,0,0,0,0,0,...,9.21,8.86,2.61,두산 베어스,14751,2022,0,1,0,0


In [8]:
# Step 1: on a copy of the salary table, add a helper key '연도_이전' equal to
# the salary year minus one, so that a season's stats line up with the salary
# recorded for the following year.
df_sal_ = df_sal.copy()
df_sal_['연도_이전'] = df_sal_['연도'].sub(1)

# Step 2: inner-join stats to salaries on player id and
# (stats year == salary year - 1), carrying over only the salary columns listed
# here. Rows without a match on the other side are dropped by the inner join.
salary_cols = [
    'pid',
    '연도_이전',
    '연봉 총수령액(만원)',
    '연봉등급(만원)',
    '팀연봉순위',
    '전체연봉순위',
    '연봉등급(순위)',
]
df_merge = (
    df_pit  # 2021-2024 stats
    .merge(
        df_sal_[salary_cols],
        how='inner',
        left_on=['pid', '연도'],
        right_on=['pid', '연도_이전'],
    )
    # The helper key is only needed for the join itself.
    .drop(columns='연도_이전')
)
df_merge

Unnamed: 0,선수,포지션,WAR,G,GS,GR,GF,CG,SHO,W,...,연도,선발,중간계투,마무리,수상여부,연봉 총수령액(만원),연봉등급(만원),팀연봉순위,전체연봉순위,연봉등급(순위)
0,임기영,투수,3.17,28,28,0,0,0,0,8,...,2021,1,0,0,0,14000,B,14,173,C
1,이의리,투수,2.76,19,19,0,0,0,0,4,...,2021,1,0,0,0,39000,B,8,85,C
2,정해영,투수,2.69,64,0,64,58,0,0,5,...,2021,0,0,1,0,17000,B,11,157,C
3,장현식,투수,2.56,69,0,69,5,0,0,1,...,2021,0,1,0,0,20000,B,10,134,C
4,윤중현,투수,1.38,30,13,17,6,0,0,5,...,2021,1,0,0,0,6500,C,29,313,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951,김도빈,투수,-0.11,1,1,0,0,0,0,0,...,2024,0,1,0,0,3100,D,57,516,D
952,이충호,투수,-0.12,3,0,3,0,0,0,0,...,2024,0,1,0,0,3400,D,49,453,D
953,장지수,투수,-0.14,13,0,13,7,0,0,0,...,2024,0,1,0,0,3400,D,49,453,D
954,윤대경,투수,-0.26,7,0,7,1,0,0,0,...,2024,0,1,0,0,7000,C,28,264,C


In [9]:
# Discard every row that has a missing value in any column, then show the result.
df_merge = df_merge.dropna(axis=0, how='any')
df_merge

Unnamed: 0,선수,포지션,WAR,G,GS,GR,GF,CG,SHO,W,...,연도,선발,중간계투,마무리,수상여부,연봉 총수령액(만원),연봉등급(만원),팀연봉순위,전체연봉순위,연봉등급(순위)
0,임기영,투수,3.17,28,28,0,0,0,0,8,...,2021,1,0,0,0,14000,B,14,173,C
1,이의리,투수,2.76,19,19,0,0,0,0,4,...,2021,1,0,0,0,39000,B,8,85,C
2,정해영,투수,2.69,64,0,64,58,0,0,5,...,2021,0,0,1,0,17000,B,11,157,C
3,장현식,투수,2.56,69,0,69,5,0,0,1,...,2021,0,1,0,0,20000,B,10,134,C
4,윤중현,투수,1.38,30,13,17,6,0,0,5,...,2021,1,0,0,0,6500,C,29,313,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951,김도빈,투수,-0.11,1,1,0,0,0,0,0,...,2024,0,1,0,0,3100,D,57,516,D
952,이충호,투수,-0.12,3,0,3,0,0,0,0,...,2024,0,1,0,0,3400,D,49,453,D
953,장지수,투수,-0.14,13,0,13,7,0,0,0,...,2024,0,1,0,0,3400,D,49,453,D
954,윤대경,투수,-0.26,7,0,7,1,0,0,0,...,2024,0,1,0,0,7000,C,28,264,C


In [10]:
# Write the merged stats-and-salary table to CSV, leaving out the row index.
output_csv = '투수_연봉_학습데이터(ver.순위칼럼추가).csv'
df_merge.to_csv(output_csv, index=False)