In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default theme first; the matplotlib overrides below
# are set afterwards so they are the values that end up in effect.
sns.set()

# Global matplotlib defaults.
# To render Korean text, enable one of the font settings below.
# plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 14,
    'axes.unicode_minus': False,
})

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import f1_score

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 학습 모델 저장을 위한 라이브러리
import pickle

### 프로젝트 세팅

In [3]:
# File name under which the trained model will be saved.
best_model_path = 'model/best_model_pitcher_data_4TH.dat'

# Number of cross-validation folds.
cv_count = 10

# Shuffled K-fold splitter with a fixed seed, using cv_count folds.
kfold = KFold(random_state=1, shuffle=True, n_splits=cv_count)

# Containers for evaluation results (add more metrics here if needed)
# and for the names of the models that were trained.
f1_macro_list, f1_micro_list, model_name_list = [], [], []

### 데이터 준비

In [5]:
# Load the pitcher stat table and the salary-grade table, in that order.
df_pit, df_sal = (
    pd.read_csv(csv_name)
    for csv_name in ('투수 스탯 데이터.csv', '선수 연봉 구간 데이터(4구간).csv')
)

In [61]:
# Render both raw tables, pitcher stats first, then salaries.
display(df_pit, df_sal)

Unnamed: 0,2023_선수,2023_G,2023_GS,2023_GR,2023_GF,2023_CG,2023_SHO,2023_W,2023_L,2023_S,...,2024_BB,2024_HP,2024_SO,2024_BK,2024_WP,2024_ERA,2024_FIP,2024_WHIP,2024_WAR,2024_팀
0,페냐,32,32,0,0,0,0,11,11,0,...,20,0,29,0,1,6.27,5.83,1.63,0.26,한화 이글스
1,문동주,23,23,0,0,0,0,8,8,0,...,38,4,96,1,8,5.17,4.71,1.67,1.77,한화 이글스
2,산체스,24,24,0,0,0,0,7,8,0,...,21,10,56,1,2,4.22,3.82,1.52,1.48,한화 이글스
3,주현상,55,0,55,9,0,0,2,2,0,...,8,1,64,0,1,2.65,3.85,0.84,3.13,한화 이글스
4,이태양,50,12,38,6,0,0,3,3,0,...,2,0,2,0,0,11.57,9.43,2.04,-0.50,한화 이글스
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,김동규,3,1,2,0,0,0,0,1,0,...,2,0,0,0,0,54.00,22.80,9.00,-0.07,키움 히어로즈
200,김동혁,35,2,33,11,0,0,1,7,0,...,10,2,9,0,0,6.85,8.59,1.66,-0.72,키움 히어로즈
201,하영민,57,0,57,7,0,0,3,1,0,...,58,5,101,0,7,4.37,4.45,1.50,3.25,키움 히어로즈
202,양현,54,0,54,12,0,0,0,5,0,...,5,1,10,0,0,7.62,7.37,2.00,0.02,삼성 라이온즈


Unnamed: 0,선수,연봉(만원),WAR,WAR당 연봉,팀,pid,연도,연봉등급
0,류현진,200000.0,2.77,72322,한화 이글스,10590,2025,A
1,폰세,130000.0,5.86,19116,한화 이글스,16313,2025,A
2,플로리얼,97500.0,1.96,50070,한화 이글스,16312,2025,A
3,엄상백,90000.0,-0.11,-783140,한화 이글스,11318,2025,A
4,와이스,84000.0,3.42,24584,한화 이글스,16153,2025,A
...,...,...,...,...,...,...,...,...
841,김주훈,3000.0,-,-,키움 히어로즈,16129,2025,D
842,서유신,3000.0,-0.24,-12551,키움 히어로즈,15483,2025,D
843,박성빈,3000.0,-,-,키움 히어로즈,15479,2025,D
844,이우석,1500.0,-,-,키움 히어로즈,11360,2025,D


In [67]:
# Compare the player ids of the two tables; one count per output line:
#   1) ids present in both tables
#   2) ids with stats but no salary record
#   3) ids with a salary record but no stats
pit_ids = set(df_pit['pid'])
sal_ids = set(df_sal['pid'])
print(
    len(pit_ids & sal_ids),
    len(pit_ids - sal_ids),
    len(sal_ids - pit_ids),
    sep='\n',
)

164
35
682


In [75]:
# Show the names of the pitchers that have stats but no salary record.
# lost_pids is computed here because no visible earlier cell defines it;
# without this line the cell raises NameError on a fresh kernel.
lost_pids = set(df_pit['pid']) - set(df_sal['pid'])

# Boolean row filter on pid, then keep only the id and the two name columns.
df_pit.loc[df_pit['pid'].isin(lost_pids), ['pid', '2023_선수', '2024_선수']]

Unnamed: 0,pid,2023_선수,2024_선수
0,15146.0,페냐,페냐
2,15643.0,산체스,산체스
6,14621.0,한승주,한승주
10,10156.0,정우람,정우람
17,14619.0,남지민,남지민
21,13934.0,켈리,켈리
26,14588.0,김윤식,김윤식
32,14135.0,이상영,이상영
34,11170.0,윤호솔,윤호솔
36,11387.0,채지선,채원후


In [65]:
# Merge the salary grade onto the pitcher stats (inner join on pid).
# The salary lookup is first reduced to one row per pid: a pid repeated in
# df_sal would silently duplicate that pitcher's stat row in the result, and
# duplicated rows can later land on both sides of a train/test split.
# NOTE(review): keep='first' follows file order — confirm that is the grade
# you want if a pid really does appear more than once in df_sal.
# NOTE(review): the recorded outputs show 167 merged rows for 164 shared
# pids, so df_pit itself should also be checked for repeated pids.
salary_grades = df_sal[['pid', '연봉등급']].drop_duplicates(subset='pid', keep='first')
df_merged = pd.merge(
    df_pit,
    salary_grades,
    on='pid',
    how='inner',
    validate='many_to_one',  # right side must be unique per pid
)
df_merged

Unnamed: 0,2023_선수,2023_G,2023_GS,2023_GR,2023_GF,2023_CG,2023_SHO,2023_W,2023_L,2023_S,...,2024_HP,2024_SO,2024_BK,2024_WP,2024_ERA,2024_FIP,2024_WHIP,2024_WAR,2024_팀,연봉등급
0,문동주,23,23,0,0,0,0,8,8,0,...,4,96,1,8,5.17,4.71,1.67,1.77,한화 이글스,B
1,주현상,55,0,55,9,0,0,2,2,0,...,1,64,0,1,2.65,3.85,0.84,3.13,한화 이글스,B
2,이태양,50,12,38,6,0,0,3,3,0,...,0,2,0,0,11.57,9.43,2.04,-0.50,한화 이글스,B
3,윤대경,47,0,47,10,0,0,5,1,0,...,1,7,0,0,10.57,6.30,2.09,-0.26,한화 이글스,C
4,김기중,37,6,31,6,0,0,1,3,0,...,6,39,0,2,6.56,5.89,1.86,0.33,한화 이글스,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,이종민,11,2,9,4,0,0,0,1,0,...,5,21,1,0,7.63,7.22,1.79,-0.12,키움 히어로즈,D
164,김동규,3,1,2,0,0,0,0,1,0,...,0,0,0,0,54.00,22.80,9.00,-0.07,키움 히어로즈,D
165,하영민,57,0,57,7,0,0,3,1,0,...,5,101,0,7,4.37,4.45,1.50,3.25,키움 히어로즈,B
166,양현,54,0,54,12,0,0,0,5,0,...,1,10,0,0,7.62,7.37,2.00,0.02,삼성 라이온즈,C


In [33]:
# Convert the salary-grade labels into integer class codes.
# The fitted encoder is kept so the codes can be mapped back to labels.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df_merged.loc[:, '연봉등급'])

In [37]:
# Print every column name of the pitcher stat table as a plain list.
print(list(df_pit.columns))

['2023_선수', '2023_G', '2023_GS', '2023_GR', '2023_GF', '2023_CG', '2023_SHO', '2023_W', '2023_L', '2023_S', '2023_HD', '2023_IP', '2023_ER', '2023_R', '2023_TBF', '2023_H', '2023_2B', '2023_3B', '2023_HR', '2023_BB', '2023_HP', '2023_SO', '2023_BK', '2023_WP', '2023_ERA', '2023_FIP', '2023_WHIP', '2023_WAR', '2023_팀', 'pid', '2024_선수', '2024_G', '2024_GS', '2024_GR', '2024_GF', '2024_CG', '2024_SHO', '2024_W', '2024_L', '2024_S', '2024_HD', '2024_IP', '2024_ER', '2024_R', '2024_TBF', '2024_H', '2024_2B', '2024_3B', '2024_HR', '2024_BB', '2024_HP', '2024_SO', '2024_BK', '2024_WP', '2024_ERA', '2024_FIP', '2024_WHIP', '2024_WAR', '2024_팀']


In [43]:
# Feature matrix: everything except the id, name, team and target columns.
X = df_merged.drop(
    columns=['pid', '2023_선수', '2024_선수', '2023_팀', '2024_팀', '연봉등급']
)

# Fit a standard scaler on the feature matrix.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# that appears later in the notebook, and X itself is left unscaled here —
# confirm this is intended.
scaler1 = StandardScaler()
scaler1.fit(X)

In [49]:
# Keep the player id and the 2023 player-name column aside.
pids, names = df_merged['pid'], df_merged['2023_선수']

In [51]:
# Target column (unencoded salary-grade labels).
y = df_merged['연봉등급']

# Independent copy of the unencoded target, kept as a backup.
y_raw = y.copy()

In [53]:
# Two-column side table (player id, player name) sharing df_merged's index.
player_info = pd.concat([pids.rename('pid'), names.rename('선수명')], axis=1)

In [None]:
# Hold out 20% of the rows as a test set, stratified on the encoded grade.
# player_info is split alongside X and y so every row in either partition
# can still be traced back to its player.
X_train, X_test, y_train, y_test, info_train, info_test = train_test_split(
    X,
    y_encoded,
    player_info,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)