In [1]:
# Core data-analysis and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress all warnings for the rest of the session
# NOTE(review): this blanket filter also hides pandas warnings (e.g. chained-assignment
# or deprecation warnings); consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default plot styling
sns.set()

# Matplotlib defaults
# Optional font-family overrides, left disabled.
# NOTE(review): presumably meant for rendering Korean labels; enable whichever font
# is installed on the machine running the notebook.
#plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
# Turn off the Unicode minus glyph for negative tick labels
plt.rcParams['axes.unicode_minus'] = False

# scikit-learn preprocessing and evaluation utilities
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

### 데이터 준비

In [3]:
# Load the raw per-season pitching exports (2023 first, then 2024).
pit_data_2023, pit_data_2024 = map(
    pd.read_csv,
    ('2023_투구_statiz.csv', '2024_투구_statiz.csv'),
)

In [4]:
# Remove the row labelled 0 from each season's frame.
# NOTE(review): presumably row 0 is not a player record (e.g. a repeated header line);
# confirm against the raw CSV files.
pit_data_2023 = pit_data_2023.drop(index=0)
pit_data_2024 = pit_data_2024.drop(index=0)

In [5]:
# Columns to remove from both season frames.
cols_to_drop = ['Rank', 'Team', 'Sort▼', 'RA9', 'rRA9', 'rRA9pf', 'rRA', 'ROE', 'IB']

# Drop the columns by rebinding the result rather than mutating with inplace=True:
# each frame has an explicit lineage, and the `columns=` keyword states the axis
# by name instead of relying on axis=1.
pit_data_2023 = pit_data_2023.drop(columns=cols_to_drop)
pit_data_2024 = pit_data_2024.drop(columns=cols_to_drop)

In [6]:
# Rename columns: 'WAR▼' -> 'WAR' and 'Name' -> '선수'.
# The mapping is defined once so both seasons are guaranteed to use the same names,
# and the result is rebound instead of mutating the frames with inplace=True.
column_renames = {'WAR▼': 'WAR', 'Name': '선수'}
pit_data_2023 = pit_data_2023.rename(columns=column_renames)
pit_data_2024 = pit_data_2024.rename(columns=column_renames)

In [7]:
# Preview the first rows of each season (2023, then 2024).
display(pit_data_2023.head(), pit_data_2024.head())

Unnamed: 0,선수,G,GS,GR,GF,CG,SHO,W,L,S,...,HP,SO,BK,WP,ERA,FIP,WHIP,WAR,팀,pid
1,페냐,32,32,0,0,0,0,11,11,0,...,18,147,4,16,3.6,4.06,1.17,4.28,한화 이글스,15146.0
2,문동주,23,23,0,0,0,0,8,8,0,...,4,95,1,5,3.72,3.65,1.31,3.08,한화 이글스,15013.0
3,산체스,24,24,0,0,0,0,7,8,0,...,6,99,0,3,3.79,3.92,1.3,2.59,한화 이글스,15643.0
4,주현상,55,0,55,9,0,0,2,2,0,...,3,45,0,0,1.96,3.25,0.84,2.44,한화 이글스,11415.0
5,이태양,50,12,38,6,0,0,3,3,0,...,3,72,0,1,3.23,3.6,1.21,1.78,한화 이글스,10609.0


Unnamed: 0,선수,G,GS,GR,GF,CG,SHO,W,L,S,...,HP,SO,BK,WP,ERA,FIP,WHIP,WAR,팀,pid
1,류현진,28,28,0,0,0,0,10,8,0,...,3,135,0,1,3.87,3.67,1.36,4.44,한화 이글스,10590.0
2,주현상,65,0,65,51,0,0,8,4,23,...,1,64,0,1,2.65,3.85,0.84,3.13,한화 이글스,11415.0
3,와이스,16,16,0,0,0,0,5,5,0,...,3,98,0,6,3.73,3.65,1.16,2.92,한화 이글스,16153.0
4,문동주,21,21,0,0,0,0,7,7,0,...,4,96,1,8,5.17,4.71,1.67,1.77,한화 이글스,15013.0
5,바리아,20,19,1,0,0,0,6,7,0,...,0,83,0,6,5.15,3.85,1.42,1.69,한화 이글스,16150.0


In [8]:
# Tag every row with the season it belongs to, in a new '연도' column.
for season_year, season_frame in ((2023, pit_data_2023), (2024, pit_data_2024)):
    season_frame['연도'] = season_year

In [9]:
# Stack the two seasons row-wise and renumber the index from 0.
pit_data_all = pd.concat(
    (pit_data_2023, pit_data_2024),
    axis=0,
    ignore_index=True,
)
pit_data_all

Unnamed: 0,선수,G,GS,GR,GF,CG,SHO,W,L,S,...,SO,BK,WP,ERA,FIP,WHIP,WAR,팀,pid,연도
0,페냐,32,32,0,0,0,0,11,11,0,...,147,4,16,3.6,4.06,1.17,4.28,한화 이글스,15146.0,2023
1,문동주,23,23,0,0,0,0,8,8,0,...,95,1,5,3.72,3.65,1.31,3.08,한화 이글스,15013.0,2023
2,산체스,24,24,0,0,0,0,7,8,0,...,99,0,3,3.79,3.92,1.3,2.59,한화 이글스,15643.0,2023
3,주현상,55,0,55,9,0,0,2,2,0,...,45,0,0,1.96,3.25,0.84,2.44,한화 이글스,11415.0,2023
4,이태양,50,12,38,6,0,0,3,3,0,...,72,0,1,3.23,3.6,1.21,1.78,한화 이글스,10609.0,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,전준표,18,5,13,4,0,0,2,4,0,...,18,0,4,6.83,7.09,1.99,-0.35,키움 히어로즈,16122.0,2024
581,오석주,17,0,17,7,0,0,1,1,0,...,16,0,2,11.12,4.4,2.29,-0.5,키움 히어로즈,12860.0,2024
582,조영건,25,3,22,3,0,0,2,1,0,...,31,0,3,8.01,6.01,1.91,-0.55,키움 히어로즈,14132.0,2024
583,윤석원,11,1,10,3,0,0,0,1,0,...,1,1,3,11.42,7.72,2.19,-0.66,키움 히어로즈,15071.0,2024


In [10]:
# Number of missing values in each column.
print(pit_data_all.isnull().sum(axis=0))

선수      0
G       0
GS      0
GR      0
GF      0
CG      0
SHO     0
W       0
L       0
S       0
HD      0
IP      0
ER      0
R       0
TBF     0
H       0
2B      0
3B      0
HR      0
BB      0
HP      0
SO      0
BK      0
WP      0
ERA     2
FIP     1
WHIP    1
WAR     2
팀       0
pid     0
연도      0
dtype: int64


In [11]:
# Show every row that has at least one missing value.
pit_data_all.loc[pit_data_all.isna().any(axis='columns')]

Unnamed: 0,선수,G,GS,GR,GF,CG,SHO,W,L,S,...,SO,BK,WP,ERA,FIP,WHIP,WAR,팀,pid,연도
30,박상언,0,0,0,0,0,0,0,0,0,...,0,0,0,,,,,한화 이글스,12587.0,2023
321,정우람,1,1,0,0,0,0,0,0,0,...,0,0,0,,4.2,99.99,,한화 이글스,10156.0,2024


In [12]:
# Discard every row that contains at least one NaN.
pit_data_all = pit_data_all.dropna(axis='index', how='any')

In [13]:
# Inspect column dtypes and non-null counts before the numeric conversion.
pit_data_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 583 entries, 0 to 584
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   선수      583 non-null    object 
 1   G       583 non-null    object 
 2   GS      583 non-null    object 
 3   GR      583 non-null    object 
 4   GF      583 non-null    object 
 5   CG      583 non-null    object 
 6   SHO     583 non-null    object 
 7   W       583 non-null    object 
 8   L       583 non-null    object 
 9   S       583 non-null    object 
 10  HD      583 non-null    object 
 11  IP      583 non-null    object 
 12  ER      583 non-null    object 
 13  R       583 non-null    object 
 14  TBF     583 non-null    object 
 15  H       583 non-null    object 
 16  2B      583 non-null    object 
 17  3B      583 non-null    object 
 18  HR      583 non-null    object 
 19  BB      583 non-null    object 
 20  HP      583 non-null    object 
 21  SO      583 non-null    object 
 22  BK     

In [15]:
# Dtype conversion: `pid` becomes int64 and every remaining stat column becomes numeric.
# Work on an explicit copy so the column assignments below never write into a frame
# that pandas may still track as derived from another one.
pit_data_all = pit_data_all.copy()
pit_data_all['pid'] = pit_data_all['pid'].astype('int64')

cols_not_float = ['선수', '팀', 'pid', '연도']
cols_to_float = [col for col in pit_data_all.columns if col not in cols_not_float]

# errors='coerce' replaces text that cannot be parsed with NaN instead of raising,
# so convert into a temporary frame first and compare it with the original values.
stats_before = pit_data_all[cols_to_float]
stats_numeric = stats_before.apply(pd.to_numeric, errors='coerce')

# Report values that were present before the conversion but are NaN after it;
# without this check they would flow into later cells unnoticed.
coerced_counts = (stats_numeric.isna() & stats_before.notna()).sum()
coerced_counts = coerced_counts[coerced_counts > 0]
if not coerced_counts.empty:
    print('Values coerced to NaN by pd.to_numeric (count per column):')
    print(coerced_counts)

pit_data_all[cols_to_float] = stats_numeric

In [17]:
# Display the combined frame after the dtype conversion.
pit_data_all

Unnamed: 0,선수,G,GS,GR,GF,CG,SHO,W,L,S,...,SO,BK,WP,ERA,FIP,WHIP,WAR,팀,pid,연도
0,페냐,32,32,0,0,0,0,11,11,0,...,147,4,16,3.60,4.06,1.17,4.28,한화 이글스,15146,2023
1,문동주,23,23,0,0,0,0,8,8,0,...,95,1,5,3.72,3.65,1.31,3.08,한화 이글스,15013,2023
2,산체스,24,24,0,0,0,0,7,8,0,...,99,0,3,3.79,3.92,1.30,2.59,한화 이글스,15643,2023
3,주현상,55,0,55,9,0,0,2,2,0,...,45,0,0,1.96,3.25,0.84,2.44,한화 이글스,11415,2023
4,이태양,50,12,38,6,0,0,3,3,0,...,72,0,1,3.23,3.60,1.21,1.78,한화 이글스,10609,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,전준표,18,5,13,4,0,0,2,4,0,...,18,0,4,6.83,7.09,1.99,-0.35,키움 히어로즈,16122,2024
581,오석주,17,0,17,7,0,0,1,1,0,...,16,0,2,11.12,4.40,2.29,-0.50,키움 히어로즈,12860,2024
582,조영건,25,3,22,3,0,0,2,1,0,...,31,0,3,8.01,6.01,1.91,-0.55,키움 히어로즈,14132,2024
583,윤석원,11,1,10,3,0,0,0,1,0,...,1,1,3,11.42,7.72,2.19,-0.66,키움 히어로즈,15071,2024


In [19]:
# Write the combined frame to disk, leaving the index out of the file.
pit_data_all.to_csv(
    path_or_buf='투수_스탯_데이터_합본.csv',
    index=False,
)