In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import shutil

# 경고 뜨지 않게..
import warnings
warnings.filterwarnings('ignore')

# Plot settings.
# Choose the Korean-capable font by operating system instead of toggling a
# commented-out line by hand: 'AppleGothic' on macOS, and the original
# 'Malgun Gothic' (a Windows font) everywhere else.
import platform

korean_font = 'AppleGothic' if platform.system() == 'Darwin' else 'Malgun Gothic'
plt.rcParams['font.family'] = korean_font
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = 20, 10
# Draw negative tick labels with an ASCII hyphen rather than the Unicode minus
# sign (presumably because the Korean fonts above lack that glyph).
plt.rcParams['axes.unicode_minus'] = False

# 랜덤 모듈 
import random

# 학습 모델 저장 및 복원
import pickle

# 딥러닝 라이브러리
import tensorflow as tf
# 신경망 모델을 관리하는 객체
from tensorflow.keras.models import Sequential
# 선형 회귀 레이어
from tensorflow.keras.layers import Dense
# 활성화 함수를 정의하는 객체
from tensorflow.keras.layers import Activation
# CNN : 커널을 통해 합성곱을 구하는 것. 이미지의 특징이 두드러 지게 한다.
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Conv1D
# MaxPooling : 커널내에서 가장 큰 값을 추출하는 방식으로 불필요한 부분을 제거한다.
from tensorflow.keras.layers import MaxPool2D
from tensorflow.keras.layers import MaxPool1D
# Flatten : 다차원의 이미지 데이터를 선형 회귀 은닉층으로 전달하기 전에
# 1차원으로 변환하는 것
from tensorflow.keras.layers import Flatten
# Dropout : 이미지나 영상, 음파 등의 데이터는 오랫동안 학습을 진행 시켜야 한다.
# 하지만 너무 빨리 과적합이 발생되면 조기 중단 때문에 학습 횟수가 줄어들 수 있다.
# 이에 은닉층의 노드를 일부 사용하지 않음으로써 과적합이 빨리 오는 것을 예방하고
# 오랫동안 학습이 진행될 수 있도록 한다.
from tensorflow.keras.layers import Dropout
# Embedding : 단어의 수를 조정한다.
from tensorflow.keras.layers import Embedding

# LSTM
from tensorflow.keras.layers import LSTM



# 원핫 인코딩을 수행하는 함수
from tensorflow.keras.utils import to_categorical

# 저장된 학습모델을 복원한다.
from tensorflow.keras.models import load_model

# 모델을 자동 저장한다.
from tensorflow.keras.callbacks import ModelCheckpoint
# 성능이 더이상 좋아지지 않을 경우 중단 시킨다.
from tensorflow.keras.callbacks import EarlyStopping

# 문장을 잘라준다.
from tensorflow.keras.preprocessing.text import Tokenizer
# 모든 문장 데이터의 단어 데이터의 수를 동일한 수로 맞춰준다.
from tensorflow.keras.preprocessing.sequence import pad_sequences
# 문자열을 가지고 단어 사전을 만들어준다.
from tensorflow.keras.preprocessing.text import text_to_word_sequence



# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 표준화
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# 문자열 -> 숫자
from sklearn.preprocessing import LabelEncoder

# 전체데이터를 학습용과 검증으로 나눈다.
from sklearn.model_selection import train_test_split

# Fix the random seed.
# Shuffling data, initialising weights and similar steps rely on randomness,
# so the same seed is applied to NumPy, Python's random module and TensorFlow.
random_seed = 1

for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    seed_fn(random_seed)

# Make this project use only as much GPU memory as it needs.
# Fetch the GPUs visible to TensorFlow and enable memory growth on each one.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # With no GPU present the list is empty and the loop body never runs.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the failure and carry on instead of stopping the notebook.
    print(err)

### 데이터를 불러온다.

In [2]:
# Read the 2017 pitcher and batter stat tables from ./data.
picher = pd.read_csv('./data/picher_stats_2017.csv')
batter = pd.read_csv('./data/batter_stats_2017.csv')

# Show both tables, pitcher first.
display(picher, batter)

Unnamed: 0,선수명,팀명,승,패,세,홀드,블론,경기,선발,이닝,...,홈런/9,BABIP,LOB%,ERA,RA9-WAR,FIP,kFIP,WAR,연봉(2018),연봉(2017)
0,켈리,SK,16,7,0,0,0,30,30,190.0,...,0.76,0.342,73.7,3.60,6.91,3.69,3.44,6.62,140000,85000
1,소사,LG,11,11,1,0,0,30,29,185.1,...,0.53,0.319,67.1,3.88,6.80,3.52,3.41,6.08,120000,50000
2,양현종,KIA,20,6,0,0,0,31,31,193.1,...,0.79,0.332,72.1,3.44,6.54,3.94,3.82,5.64,230000,150000
3,차우찬,LG,10,7,0,0,0,28,28,175.2,...,1.02,0.298,75.0,3.43,6.11,4.20,4.03,4.63,100000,100000
4,레일리,롯데,13,7,0,0,0,30,30,187.1,...,0.91,0.323,74.1,3.80,6.13,4.36,4.31,4.38,111000,85000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,장민재,한화,2,5,0,0,2,33,5,62.2,...,1.58,0.355,56.9,7.76,-1.21,6.21,6.48,-0.47,7100,8100
148,정용운,KIA,3,2,0,0,0,25,11,59.1,...,1.06,0.263,65.4,5.92,0.39,6.41,6.77,-0.49,7500,3100
149,노경은,롯데,0,2,0,0,0,9,2,14.2,...,2.45,0.382,52.8,11.66,-0.83,8.03,8.29,-0.61,10000,16000
150,김승현,삼성,0,3,0,1,0,41,0,43.2,...,1.44,0.341,73.9,5.77,-0.40,6.87,6.95,-0.70,4000,2900


Unnamed: 0,선수명,팀명,경기,타석,타수,안타,홈런,득점,타점,볼넷,...,도루,BABIP,타율,출루율,장타율,OPS,wOBA,WAR,연봉(2018),연봉(2017)
0,최정,SK,130,527,430,136,46,89,113,70,...,1,0.316,0.316,0.427,0.684,1.111,0.442,7.30,120000,120000
1,최형우,KIA,142,629,514,176,26,98,120,96,...,0,0.362,0.342,0.450,0.576,1.026,0.430,7.20,150000,150000
2,박건우,두산,131,543,483,177,20,91,78,41,...,20,0.39,0.366,0.424,0.582,1.006,0.424,7.04,37000,19500
3,나성범,NC,125,561,498,173,24,103,99,48,...,17,0.413,0.347,0.415,0.584,0.999,0.416,5.64,43000,35000
4,손아섭,롯데,144,667,576,193,20,113,80,83,...,25,0.374,0.335,0.420,0.514,0.934,0.398,5.60,150000,65000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,김민식,KIA,135,391,351,78,4,39,40,26,...,3,0.252,0.222,0.284,0.293,0.577,0.261,-1.07,15000,6000
187,장민석,한화,88,327,299,83,1,38,21,15,...,8,0.34,0.278,0.311,0.318,0.629,0.283,-1.08,11000,10000
188,채은성,LG,114,372,333,89,2,28,35,17,...,5,0.327,0.267,0.323,0.339,0.662,0.298,-1.12,11000,16000
189,김주형,KIA,57,116,106,18,0,11,10,8,...,0,0.191,0.170,0.233,0.217,0.450,0.210,-1.20,7500,11000


In [3]:
# Inspect column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
# NOTE(review): the saved output of this cell lists batter's BABIP column as
# object dtype, so it presumably holds a non-numeric entry - verify before use.
picher.info()
print()  # blank line between the two reports
batter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 22 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   선수명       152 non-null    object 
 1   팀명        152 non-null    object 
 2   승         152 non-null    int64  
 3   패         152 non-null    int64  
 4   세         152 non-null    int64  
 5   홀드        152 non-null    int64  
 6   블론        152 non-null    int64  
 7   경기        152 non-null    int64  
 8   선발        152 non-null    int64  
 9   이닝        152 non-null    float64
 10  삼진/9      152 non-null    float64
 11  볼넷/9      152 non-null    float64
 12  홈런/9      152 non-null    float64
 13  BABIP     152 non-null    float64
 14  LOB%      152 non-null    float64
 15  ERA       152 non-null    float64
 16  RA9-WAR   152 non-null    float64
 17  FIP       152 non-null    float64
 18  kFIP      152 non-null    float64
 19  WAR       152 non-null    float64
 20  연봉(2018)  152 non-null    int64 

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   선수명       191 non-null    object 
 1   팀명        191 non-null    object 
 2   경기        191 non-null    int64  
 3   타석        191 non-null    int64  
 4   타수        191 non-null    int64  
 5   안타        191 non-null    int64  
 6   홈런        191 non-null    int64  
 7   득점        191 non-null    int64  
 8   타점        191 non-null    int64  
 9   볼넷        191 non-null    int64  
 10  삼진        191 non-null    int64  
 11  도루        191 non-null    int64  
 12  BABIP     191 non-null    object 
 13  타율        191 non-null    float64
 14  출루율       191 non-null    float64
 15  장타율       191 non-null    float64
 16  OPS       191 non-null    float64
 17  wOBA      191 non-null    float64
 18  WAR       191 non-null    float64
 19  연봉(2018)  191 non-null    int64  
 20  연봉(2017)  191 non-null    int64 

None

In [4]:
# Missing values: number of NaN entries per column, for each table in turn.
for frame in (picher, batter):
    display(frame.isna().sum())

선수명         0
팀명          0
승           0
패           0
세           0
홀드          0
블론          0
경기          0
선발          0
이닝          0
삼진/9        0
볼넷/9        0
홈런/9        0
BABIP       0
LOB%        0
ERA         0
RA9-WAR     0
FIP         0
kFIP        0
WAR         0
연봉(2018)    0
연봉(2017)    0
dtype: int64

선수명         0
팀명          0
경기          0
타석          0
타수          0
안타          0
홈런          0
득점          0
타점          0
볼넷          0
삼진          0
도루          0
BABIP       0
타율          0
출루율         0
장타율         0
OPS         0
wOBA        0
WAR         0
연봉(2018)    0
연봉(2017)    0
dtype: int64

### 연봉처리

In [5]:
# Pull the two salary columns (2018 and 2017) out of the pitcher table.
salary_cols = ['연봉(2018)', '연봉(2017)']
a1 = picher[salary_cols]
a1

Unnamed: 0,연봉(2018),연봉(2017)
0,140000,85000
1,120000,50000
2,230000,150000
3,100000,100000
4,111000,85000
...,...,...
147,7100,8100
148,7500,3100
149,10000,16000
150,4000,2900


In [6]:
# Salary change per pitcher: 2018 salary minus 2017 salary.
a2 = a1['연봉(2018)'].sub(a1['연봉(2017)'])
a2

0      55000
1      70000
2      80000
3          0
4      26000
       ...  
147    -1000
148     4400
149    -6000
150     1100
151     1000
Length: 152, dtype: int64

In [7]:
# Append the salary change to the pitcher table as the '연봉증감액' column.
picher = picher.assign(**{'연봉증감액': a2})
picher

Unnamed: 0,선수명,팀명,승,패,세,홀드,블론,경기,선발,이닝,...,BABIP,LOB%,ERA,RA9-WAR,FIP,kFIP,WAR,연봉(2018),연봉(2017),연봉증감액
0,켈리,SK,16,7,0,0,0,30,30,190.0,...,0.342,73.7,3.60,6.91,3.69,3.44,6.62,140000,85000,55000
1,소사,LG,11,11,1,0,0,30,29,185.1,...,0.319,67.1,3.88,6.80,3.52,3.41,6.08,120000,50000,70000
2,양현종,KIA,20,6,0,0,0,31,31,193.1,...,0.332,72.1,3.44,6.54,3.94,3.82,5.64,230000,150000,80000
3,차우찬,LG,10,7,0,0,0,28,28,175.2,...,0.298,75.0,3.43,6.11,4.20,4.03,4.63,100000,100000,0
4,레일리,롯데,13,7,0,0,0,30,30,187.1,...,0.323,74.1,3.80,6.13,4.36,4.31,4.38,111000,85000,26000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,장민재,한화,2,5,0,0,2,33,5,62.2,...,0.355,56.9,7.76,-1.21,6.21,6.48,-0.47,7100,8100,-1000
148,정용운,KIA,3,2,0,0,0,25,11,59.1,...,0.263,65.4,5.92,0.39,6.41,6.77,-0.49,7500,3100,4400
149,노경은,롯데,0,2,0,0,0,9,2,14.2,...,0.382,52.8,11.66,-0.83,8.03,8.29,-0.61,10000,16000,-6000
150,김승현,삼성,0,3,0,1,0,41,0,43.2,...,0.341,73.9,5.77,-0.40,6.87,6.95,-0.70,4000,2900,1100


In [8]:
# Salary change per batter: 2018 salary minus 2017 salary.
a1 = batter['연봉(2018)'].sub(batter['연봉(2017)'])
a1

0          0
1          0
2      17500
3       8000
4      85000
       ...  
186     9000
187     1000
188    -5000
189    -3500
190      500
Length: 191, dtype: int64

In [9]:
# Append the salary change to the batter table as the '연봉증감액' column.
batter = batter.assign(**{'연봉증감액': a1})
batter

Unnamed: 0,선수명,팀명,경기,타석,타수,안타,홈런,득점,타점,볼넷,...,BABIP,타율,출루율,장타율,OPS,wOBA,WAR,연봉(2018),연봉(2017),연봉증감액
0,최정,SK,130,527,430,136,46,89,113,70,...,0.316,0.316,0.427,0.684,1.111,0.442,7.30,120000,120000,0
1,최형우,KIA,142,629,514,176,26,98,120,96,...,0.362,0.342,0.450,0.576,1.026,0.430,7.20,150000,150000,0
2,박건우,두산,131,543,483,177,20,91,78,41,...,0.39,0.366,0.424,0.582,1.006,0.424,7.04,37000,19500,17500
3,나성범,NC,125,561,498,173,24,103,99,48,...,0.413,0.347,0.415,0.584,0.999,0.416,5.64,43000,35000,8000
4,손아섭,롯데,144,667,576,193,20,113,80,83,...,0.374,0.335,0.420,0.514,0.934,0.398,5.60,150000,65000,85000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,김민식,KIA,135,391,351,78,4,39,40,26,...,0.252,0.222,0.284,0.293,0.577,0.261,-1.07,15000,6000,9000
187,장민석,한화,88,327,299,83,1,38,21,15,...,0.34,0.278,0.311,0.318,0.629,0.283,-1.08,11000,10000,1000
188,채은성,LG,114,372,333,89,2,28,35,17,...,0.327,0.267,0.323,0.339,0.662,0.298,-1.12,11000,16000,-5000
189,김주형,KIA,57,116,106,18,0,11,10,8,...,0.191,0.170,0.233,0.217,0.450,0.210,-1.20,7500,11000,-3500


In [10]:
# Remove the 2018 salary column from both tables, modifying them in place.
for frame in (picher, batter):
    frame.drop(columns='연봉(2018)', inplace=True)

# Show the resulting tables, pitcher first.
display(picher, batter)

Unnamed: 0,선수명,팀명,승,패,세,홀드,블론,경기,선발,이닝,...,홈런/9,BABIP,LOB%,ERA,RA9-WAR,FIP,kFIP,WAR,연봉(2017),연봉증감액
0,켈리,SK,16,7,0,0,0,30,30,190.0,...,0.76,0.342,73.7,3.60,6.91,3.69,3.44,6.62,85000,55000
1,소사,LG,11,11,1,0,0,30,29,185.1,...,0.53,0.319,67.1,3.88,6.80,3.52,3.41,6.08,50000,70000
2,양현종,KIA,20,6,0,0,0,31,31,193.1,...,0.79,0.332,72.1,3.44,6.54,3.94,3.82,5.64,150000,80000
3,차우찬,LG,10,7,0,0,0,28,28,175.2,...,1.02,0.298,75.0,3.43,6.11,4.20,4.03,4.63,100000,0
4,레일리,롯데,13,7,0,0,0,30,30,187.1,...,0.91,0.323,74.1,3.80,6.13,4.36,4.31,4.38,85000,26000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,장민재,한화,2,5,0,0,2,33,5,62.2,...,1.58,0.355,56.9,7.76,-1.21,6.21,6.48,-0.47,8100,-1000
148,정용운,KIA,3,2,0,0,0,25,11,59.1,...,1.06,0.263,65.4,5.92,0.39,6.41,6.77,-0.49,3100,4400
149,노경은,롯데,0,2,0,0,0,9,2,14.2,...,2.45,0.382,52.8,11.66,-0.83,8.03,8.29,-0.61,16000,-6000
150,김승현,삼성,0,3,0,1,0,41,0,43.2,...,1.44,0.341,73.9,5.77,-0.40,6.87,6.95,-0.70,2900,1100


Unnamed: 0,선수명,팀명,경기,타석,타수,안타,홈런,득점,타점,볼넷,...,도루,BABIP,타율,출루율,장타율,OPS,wOBA,WAR,연봉(2017),연봉증감액
0,최정,SK,130,527,430,136,46,89,113,70,...,1,0.316,0.316,0.427,0.684,1.111,0.442,7.30,120000,0
1,최형우,KIA,142,629,514,176,26,98,120,96,...,0,0.362,0.342,0.450,0.576,1.026,0.430,7.20,150000,0
2,박건우,두산,131,543,483,177,20,91,78,41,...,20,0.39,0.366,0.424,0.582,1.006,0.424,7.04,19500,17500
3,나성범,NC,125,561,498,173,24,103,99,48,...,17,0.413,0.347,0.415,0.584,0.999,0.416,5.64,35000,8000
4,손아섭,롯데,144,667,576,193,20,113,80,83,...,25,0.374,0.335,0.420,0.514,0.934,0.398,5.60,65000,85000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,김민식,KIA,135,391,351,78,4,39,40,26,...,3,0.252,0.222,0.284,0.293,0.577,0.261,-1.07,6000,9000
187,장민석,한화,88,327,299,83,1,38,21,15,...,8,0.34,0.278,0.311,0.318,0.629,0.283,-1.08,10000,1000
188,채은성,LG,114,372,333,89,2,28,35,17,...,5,0.327,0.267,0.323,0.339,0.662,0.298,-1.12,16000,-5000
189,김주형,KIA,57,116,106,18,0,11,10,8,...,0,0.191,0.170,0.233,0.217,0.450,0.210,-1.20,11000,-3500


In [11]:
# Save both processed tables as CSV files under ./data.
# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file;
# index=False leaves the row index out of the file.
save_options = dict(encoding='utf-8-sig', index=False)
picher.to_csv('./data/pitcher1.csv', **save_options)
batter.to_csv('./data/batter1.csv', **save_options)