In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence library warnings so they do not clutter the notebook output
import platform
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default theme first: it rewrites matplotlib rcParams, so the
# overrides below have to come after this call
sns.set()

# Base figure settings.
# Choose a Korean-capable font for the current OS instead of hard-coding the
# Windows font. 'NanumGothic' is an assumption for other systems (e.g. Linux);
# if it is not installed matplotlib falls back to its default font.
korean_font_by_os = {'Windows': 'Malgun Gothic', 'Darwin': 'AppleGothic'}
plt.rcParams['font.family'] = korean_font_by_os.get(platform.system(), 'NanumGothic')
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
# Draw minus signs with the ASCII hyphen; the Unicode minus glyph is missing
# from many Korean fonts
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 학습 모델 저장을 위한 라이브러리
import pickle

### 프로젝트 세팅

In [3]:
# File that holds the pickled, already-trained model
best_model_path = 'model/best_model_pitcher_data.dat'
# CSV file the prediction results are written to
prediction_path = 'model/pitcher_prediction.csv'

### 저장한 모델 객체 등을 복원한다.

In [5]:
# Restore the pickled bundle stored at best_model_path.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that come from a trusted source.
with open(best_model_path, 'rb') as fp:
    loaded = pickle.load(fp)

# Unpack the bundle: 'model' is required (KeyError if missing), while
# 'scaler' is optional and becomes None when the key is absent
model, scaler = loaded['model'], loaded.get('scaler')

# Show both restored objects
display(model, scaler)

### 예측할 데이터를 준비한다.

In [18]:
# Read the data to predict on and remove the salary column ('연봉(만원)')
# in a single chain
df_pit = (
    pd.read_csv('merged_pitchers_fixed.csv')
    .drop(columns=['연봉(만원)'])
)

df_pit

Unnamed: 0,pid,Name_x,포지션,팀,ERA,FIP,WHIP,SO_x,BB_x,HR_x,...,W,L,S,HD,pit_G,CG,SHO,GR,GF,WAR
0,10058.0,양현종,투수,KIA 타이거즈,3.840,4.165,1.295,131.0,44.5,17.0,...,20.0,16.0,0.0,0.0,58.0,3.0,0.0,0.0,3.0,4.265
1,10075.0,진해수,투수,LG 트윈스,4.930,5.680,1.785,17.0,10.0,3.5,...,2.0,1.0,0.0,7.0,73.0,0.0,0.0,73.0,6.0,0.005
2,10124.0,고효준,투수,SSG 랜더스,6.340,5.140,1.760,46.5,31.0,3.5,...,6.0,2.0,0.0,18.0,99.0,0.0,0.0,99.0,9.0,0.365
3,10126.0,김광현,투수,SSG 랜더스,4.230,4.645,1.415,136.5,71.5,17.5,...,21.0,18.0,0.0,0.0,61.0,0.0,0.0,0.0,0.0,4.195
4,10131.0,박종훈,투수,SSG 랜더스,6.565,6.535,1.685,45.0,40.5,7.5,...,3.0,10.0,0.0,0.0,28.0,0.0,0.0,3.0,1.0,0.340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,16153.0,와이스,투수,한화 이글스,3.730,3.650,1.160,98.0,29.0,8.0,...,5.0,5.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,3.160
373,16155.0,발라조빅,투수,두산 베어스,4.260,3.220,1.440,69.0,28.0,3.0,...,2.0,6.0,0.0,1.0,12.0,0.0,0.0,1.0,0.0,1.350
374,16160.0,에르난데스,투수,LG 트윈스,4.020,3.730,1.210,55.0,16.0,5.0,...,3.0,2.0,1.0,1.0,11.0,0.0,0.0,2.0,1.0,1.330
375,16161.0,라우어,투수,KIA 타이거즈,4.930,3.900,1.360,37.0,12.0,3.0,...,2.0,2.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.880


In [29]:
# Preprocess the prediction data: keep only the feature columns by removing
# the id, name, position, team and WAR columns
X_pit = df_pit.drop(
    columns=['pid', 'Name_x', '포지션', '팀', 'WAR'],
)

In [31]:
# Standardize the input features with the scaler restored from the model bundle.
# The scaler is read with loaded.get('scaler') and is therefore None when the
# bundle was saved without one; calling .transform on None would raise
# AttributeError, so in that case the raw feature values are passed through.
# NOTE(review): pass-through assumes a bundle without a scaler means the model
# was trained on unscaled features — confirm against the training notebook.
if scaler is not None:
    X_pit_scaled = scaler.transform(X_pit)
else:
    X_pit_scaled = X_pit.to_numpy()
X_pit_scaled

array([[-0.38153111, -0.03686994, -0.19907055, ..., -0.08266627,
        -0.86120428, -0.41202922],
       [-0.25743576, -0.03386496, -0.08263245, ..., -0.08266627,
         0.95711906, -0.20277961],
       [-0.09690876, -0.03493604, -0.08857317, ..., -0.08266627,
         1.60474107,  0.00647001],
       ...,
       [-0.3610383 , -0.03773275, -0.219269  , ..., -0.08266627,
        -0.8113872 , -0.55152897],
       [-0.25743576, -0.03739556, -0.18362468, ..., -0.08266627,
        -0.86120428, -0.62127884],
       [-0.2426354 , -0.03697903, -0.17887211, ..., -0.08266627,
        -0.86120428, -0.62127884]])

In [33]:
# Store the scaled input data in test_X, the name the prediction cell reads.
test_X = X_pit_scaled

### 예측하고 저장한다.

In [36]:
# Run the prediction with the restored model; the trailing expression
# displays the predicted class labels.
y_pred = model.predict(test_X)
y_pred

array([1, 2, 3, 0, 0, 3, 1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 2, 2, 3, 2, 2, 2,
       2, 0, 1, 2, 2, 2, 2, 1, 2, 3, 3, 3, 4, 2, 1, 1, 4, 3, 2, 2, 2, 0,
       2, 4, 3, 2, 3, 2, 3, 4, 1, 4, 3, 1, 2, 2, 2, 3, 3, 3, 3, 2, 3, 4,
       3, 4, 3, 1, 3, 3, 2, 0, 0, 3, 3, 2, 2, 4, 2, 1, 2, 4, 4, 2, 1, 4,
       2, 3, 2, 2, 2, 3, 1, 4, 1, 4, 4, 3, 2, 3, 3, 2, 4, 4, 4, 4, 4, 4,
       4, 4, 3, 3, 4, 2, 2, 2, 4, 2, 4, 2, 4, 2, 3, 3, 4, 4, 1, 4, 4, 2,
       4, 4, 2, 4, 2, 4, 4, 4, 3, 3, 3, 4, 2, 4, 2, 3, 2, 2, 2, 1, 3, 4,
       4, 4, 3, 3, 4, 4, 2, 3, 4, 3, 4, 2, 4, 3, 3, 4, 4, 3, 4, 3, 3, 4,
       4, 4, 4, 4, 1, 0, 1, 0, 4, 2, 3, 4, 1, 3, 4, 4, 4, 2, 3, 1, 3, 3,
       4, 4, 2, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 2, 2, 2, 4, 2, 4,
       2, 3, 4, 4, 4, 0, 4, 4, 2, 4, 0, 4, 4, 4, 4, 3, 3, 4, 4, 2, 4, 4,
       3, 3, 4, 4, 4, 3, 4, 4, 4, 4, 2, 3, 4, 4, 4, 3, 4, 2, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 1, 1, 1, 2, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 3, 2, 4,
       4, 4, 4, 4, 0, 1, 1, 2, 1, 2, 3, 4, 3, 4, 4,

In [46]:
# Store the predictions in the loaded DataFrame as the '연봉 등급'
# (salary grade) column. This modifies df_pit in place.
df_pit['연봉 등급'] = y_pred

In [48]:
# Decode the numeric class labels predicted by the model into letter grades
label_map_rev = {
    0: 'A',
    1: 'B',
    2: 'C',
    3: 'D',
    4: 'E'
}

# Build the grade column from y_pred rather than from the column itself so the
# cell can be re-run safely: mapping the already-decoded letters a second time
# finds no matching keys and would turn every grade into NaN.
grade_labels = pd.Series(y_pred, index=df_pit.index).map(label_map_rev)

# Series.map yields NaN for any label that is missing from the mapping; fail
# loudly instead of carrying blank grades into the saved file.
if grade_labels.isna().any():
    unmapped = np.setdiff1d(y_pred, list(label_map_rev)).tolist()
    raise ValueError(f'y_pred contains labels missing from label_map_rev: {unmapped}')

# Write the letter grades into the salary-grade column
df_pit['연봉 등급'] = grade_labels

In [52]:
# Preview the first rows to check the decoded '연봉 등급' (salary grade) column
df_pit.head()

Unnamed: 0,pid,Name_x,포지션,팀,ERA,FIP,WHIP,SO_x,BB_x,HR_x,...,L,S,HD,pit_G,CG,SHO,GR,GF,WAR,연봉 등급
0,10058.0,양현종,투수,KIA 타이거즈,3.84,4.165,1.295,131.0,44.5,17.0,...,16.0,0.0,0.0,58.0,3.0,0.0,0.0,3.0,4.265,B
1,10075.0,진해수,투수,LG 트윈스,4.93,5.68,1.785,17.0,10.0,3.5,...,1.0,0.0,7.0,73.0,0.0,0.0,73.0,6.0,0.005,C
2,10124.0,고효준,투수,SSG 랜더스,6.34,5.14,1.76,46.5,31.0,3.5,...,2.0,0.0,18.0,99.0,0.0,0.0,99.0,9.0,0.365,D
3,10126.0,김광현,투수,SSG 랜더스,4.23,4.645,1.415,136.5,71.5,17.5,...,18.0,0.0,0.0,61.0,0.0,0.0,0.0,0.0,4.195,A
4,10131.0,박종훈,투수,SSG 랜더스,6.565,6.535,1.685,45.0,40.5,7.5,...,10.0,0.0,0.0,28.0,0.0,0.0,3.0,1.0,0.34,A


In [54]:
# Rename the player-name column 'Name_x' to '선수' (player)
df_pit = df_pit.rename({'Name_x': '선수'}, axis='columns')

In [58]:
# Keep only player, team, id and predicted grade; copy so the result is
# independent of the wider frame
df_pit = df_pit.loc[:, ['선수', '팀', 'pid', '연봉 등급']].copy()

In [60]:
# Preview the first rows of the reduced table before it is saved
df_pit.head()

Unnamed: 0,선수,팀,pid,연봉 등급
0,양현종,KIA 타이거즈,10058.0,B
1,진해수,LG 트윈스,10075.0,C
2,고효준,SSG 랜더스,10124.0,D
3,김광현,SSG 랜더스,10126.0,A
4,박종훈,SSG 랜더스,10131.0,A


In [62]:
# Write the result table to prediction_path without the index column.
# 'utf-8-sig' prepends a UTF-8 BOM — presumably so spreadsheet tools detect
# the encoding of the Korean text; confirm with the file's consumers.
df_pit.to_csv(prediction_path, index=False, encoding='utf-8-sig')
# Prints a "save complete" confirmation
print('저장완료')

저장완료
