In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import f1_score

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 회귀
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle
import joblib

In [2]:
df = pd.read_csv('baseball_player_data.csv')
#df

In [3]:
# 연봉(만원)이 0인 행을 제거
df = df[df['연봉(만원)'] != 0]
df

Unnamed: 0,Name_x,포지션,fid_G,fid_GS,fid_IP,TC,PO,Ass,E,F%,...,비율.1,비율.2,비율.3,비율.4,비율.5,WAR▼_y,Name_y,연봉(만원),WAR,WAR당 연봉
0,김선빈,1루수,1,0,1.0,2,2,0,0,100.0,...,0.000,0.000,0.000,0.000,0.0,0.00,김선빈,60000,2.11,28379
1,안치홍,1루수,37,34,293.0,268,240,24,4,98.5,...,0.000,0.000,0.000,0.000,0.0,0.00,안치홍,50000,1.97,25319
2,최정,1루수,1,0,1.0,0,0,0,0,0.0,...,0.000,0.000,0.000,0.000,0.0,0.00,최정,100000,4.68,21383
3,이원석,1루수,12,12,87.0,91,85,4,2,97.8,...,0.000,0.000,0.000,0.000,0.0,0.00,이원석,40000,-0.47,-84662
4,최주환,1루수,122,116,1027.2,972,906,60,6,99.4,...,0.325,0.390,0.715,0.111,86.1,-0.03,최주환,65000,0.35,184816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,박민준,포수,4,1,10.0,10,10,0,0,100.0,...,0.000,0.000,0.000,-0.190,-144.6,-0.14,박민준,3000,-0.13,-22413
936,서동욱,포수,32,5,95.0,86,75,10,1,98.8,...,0.148,0.154,0.302,-0.052,-37.8,-0.45,서동욱,3300,-0.48,-6922
937,류현준,포수,1,0,3.0,4,3,1,0,100.0,...,0.667,1.000,1.667,0.487,380.3,0.12,류현준,3000,0.15,19631
938,백두산,포수,0,0,0.0,0,0,0,0,0.0,...,0.000,0.000,0.000,-0.190,-127.9,-0.02,백두산,3000,-0.02,-150583


### 프로젝트 셋팅

In [5]:
# 학습이 완료된 모델을 저장할 파일 이름
best_model_path = 'model/best_model_regression.dat'

# 교차검증 횟수
cv_count = 10

# 교차 검증
kfold = KFold(n_splits=cv_count, shuffle=True, random_state=1)

# 평가 결과를 담을 리스트
mse_score_list = []

# 학습 모델 이름
model_name_list = []

### 데이터 준비

In [7]:
df = df.drop(['Name_x', 'Name_y'], axis=1)

# object 타입(문자형) 컬럼만 뽑기
obj_cols = df.select_dtypes(include=['object']).columns

# 각각의 object 컬럼에 대해 LabelEncoder 적용
le = LabelEncoder()
for col in obj_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [8]:
# 입력과 결과로 나눈다.
X = df.drop('연봉(만원)', axis=1)
y = df['연봉(만원)']

# 전처리
scaler1 = StandardScaler()
scaler1.fit(X)
X2 = scaler1.transform(X)

# 학습할 데이터를 변수에 담아준다.
train_X = X2
train_y = y

### 기본 모델 사용하기

In [10]:
# GradientBoost
gb_basic_model = GradientBoostingRegressor()

# 교차 검증을 수행한다
r1 = cross_val_score(gb_basic_model, train_X, train_y, scoring='neg_mean_squared_error', cv=kfold)

# 평가 결과를 담아준다.
mse_score_list.append(abs(r1.mean()))

# 학습 모델 이름을 담아준다.
model_name_list.append("GradientBoost Basic")

print(mse_score_list)
print(model_name_list)

[418468135.55954295]
['GradientBoost Basic']


In [11]:
# LGBM
lgbm_basic_model = LGBMRegressor(verbose=-1)

# 교차 검증을 수행한다
r1 = cross_val_score(lgbm_basic_model, train_X, train_y, scoring='neg_mean_squared_error', cv=kfold)

# 평가 결과를 담아준다.
mse_score_list.append(abs(r1.mean()))

# 학습 모델 이름을 담아준다.
model_name_list.append("LGBM Basic")

print(mse_score_list)
print(model_name_list)

[418468135.55954295, 408972677.95849323]
['GradientBoost Basic', 'LGBM Basic']


In [12]:
# XGBoost
xgb_basic_model = XGBRegressor(verbose=-1, slient=True)

# 교차 검증을 수행한다
r1 = cross_val_score(xgb_basic_model, train_X, train_y, scoring='neg_mean_squared_error', cv=kfold)

# 평가 결과를 담아준다.
mse_score_list.append(abs(r1.mean()))

# 학습 모델 이름을 담아준다.
model_name_list.append("XGBoost Basic")

print(mse_score_list)
print(model_name_list)

[418468135.55954295, 408972677.95849323, 463018843.74470484]
['GradientBoost Basic', 'LGBM Basic', 'XGBoost Basic']


In [13]:
# Voting을 구성하기 위한 모델 목록
model10 = GradientBoostingRegressor()
model11 = LGBMRegressor(verbose=-1)
model12 = XGBRegressor(verbose=-1, slient=True)

hard_voting_basic_model_list = [
    ('model10', model10),
    ('model11', model11),
    ('model12', model12),
]

hard_voting_basic_model = VotingRegressor(estimators=hard_voting_basic_model_list)

# 교차 검증을 수행한다
r1 = cross_val_score(hard_voting_basic_model, train_X, train_y, scoring='neg_mean_squared_error', cv=kfold)

# 평가 결과를 담아준다.
mse_score_list.append(abs(r1.mean()))

# 학습 모델 이름을 담아준다.
model_name_list.append("HardVoting Basic")

print(mse_score_list)
print(model_name_list)

[418468135.55954295, 408972677.95849323, 463018843.74470484, 396815149.0210506]
['GradientBoost Basic', 'LGBM Basic', 'XGBoost Basic', 'HardVoting Basic']


In [14]:
d1 = {
    'mse' : mse_score_list
}

test_df = pd.DataFrame(d1, index=model_name_list)
test_df

Unnamed: 0,mse
GradientBoost Basic,418468100.0
LGBM Basic,408972700.0
XGBoost Basic,463018800.0
HardVoting Basic,396815100.0


In [15]:
10/0

ZeroDivisionError: division by zero

### 하이퍼 파라미터 튜닝

In [None]:
# 튜닝할 하이퍼 파라미터 후보 값
params = {
    'n_estimators' : [50, 100, 150, 200, 250, 300],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1, 0, 10, 100, 1000, 10000]
}

# 사용할 모델 객체를 생성한다.
gb_tuning_model = GradientBoostingRegressor()

# 최적의 하이퍼 파라미터를 찾는다.
gb_grid_clf = GridSearchCV(gb_tuning_model, param_grid=params, scoring='neg_mean_squared_error', cv=kfold)
gb_grid_clf.fit(train_X, train_y)

# 평가 결과를 담아준다.
mse_score_list.append(abs(gb_grid_clf.best_score_))

# 학습 모델 이름을 담아준다.
model_name_list.append("GradientBoost Tuning")

In [None]:
# 튜닝할 하이퍼 파라미터 후보 값
params = {
    'n_estimators' : [50, 100, 150, 200, 250, 300],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1, 0, 10, 100, 1000, 10000]
}

# 사용할 모델 객체를 생성한다.
lgbm_tuning_model = LGBMRegressor()

# 최적의 하이퍼 파라미터를 찾는다.
lgbm_grid_clf = GridSearchCV(lgbm_tuning_model, param_grid=params, scoring='neg_mean_squared_error', cv=kfold)
lgbm_grid_clf.fit(train_X, train_y)

# 평가 결과를 담아준다.
mse_score_list.append(abs(lgbm_grid_clf.best_score_))

# 학습 모델 이름을 담아준다.
model_name_list.append("LGBM Tuning")

print(mse_score_list)
print(model_name_list)

In [None]:
# 튜닝할 하이퍼 파라미터 후보 값
params = {
    'booster' : ['gbtree', 'gblinear'],
    'n_estimators' : [50, 100, 150, 200, 250, 300],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1, 0, 10, 100, 1000, 10000]
}
# 사용할 모델 객체를 생성한다.
xgb_tuning_model = XGBRegressor()

# 최적의 하이퍼 파라미터를 찾는다.
xgb_grid_clf = GridSearchCV(xgb_tuning_model, param_grid=params, scoring='neg_mean_squared_error', cv=kfold)
xgb_grid_clf.fit(train_X, train_y)

# 평가 결과를 담아준다.
mse_score_list.append(abs(xgb_grid_clf.best_score_))

# 학습 모델 이름을 담아준다.
model_name_list.append("XGB Tuning")

print(mse_score_list)
print(model_name_list)

In [None]:
# Voting을 구성하기 위한 모델 목록
model10 = GradientBoostingRegressor()
model11 = LGBMRegressor(verbose=-1)
model12 = XGBRegressor(verbose=-1, slient=True)

# 튜닝 과정에서 찾은 최적의 하이퍼 파라미터를 셋팅해준다.
model10.set_params(**gb_grid_clf.best_params_)
model11.set_params(**lgbm_grid_clf.best_params_)
model12.set_params(**xgb_grid_clf.best_params_)


hard_voting_tuning_model_list = [
    ('model10', model10),
    ('model11', model11),
    ('model12', model12),
]

hard_voting_tuning_model = VotingRegressor(estimators=hard_voting_tuning_model_list)

# 교차 검증을 수행한다
r1 = cross_val_score(hard_voting_tuning_model, train_X, train_y, scoring='neg_mean_squared_error', cv=kfold)

# 평가 결과를 담아준다.
mse_score_list.append(abs(r1.mean()))

# 학습 모델 이름을 담아준다.
model_name_list.append("HardVoting Tuning")

In [None]:
d1 = {
    'mse' : mse_score_list
}
result_df = pd.DataFrame(d1, index=model_name_list)
result_df.sort_values(by='mse', inplace=True)
result_df

In [None]:
# 최종 모델을 생성하고 전체 데이터를 학습 시킨다.
best_model = XGBClassifier()
best_model.set_params(**xgboost_grid_clf.best_params_)
best_model.fit(train_X, train_y)
best_model

In [None]:
# 학습 모델 등을 저장한다.
with open(best_model_path, 'wb') as fp :
    pickle.dump(best_model, fp)
    pickle.dump(encoder1, fp)
    pickle.dump(scaler1, fp)

print('저장완료')

In [None]:
# 저장 확인
with open(best_model_path, 'rb') as fp :
    test_model = pickle.load(fp)
    test_encoder = pickle.load(fp)
    test_scaler1 = pickle.load(fp)

In [None]:
r1 = test_model.predict(train_X)
r2 = f1_score(train_y, r1)
r2