In [1]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default plot styling
sns.set()

# Matplotlib defaults: font family, figure size, base font size
plt.rcParams['font.family'] = 'Malgun Gothic'
# Alternative font setting, currently disabled:
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph
plt.rcParams['axes.unicode_minus'] = False

# Preprocessing and evaluation utilities
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Load the per-season source files (2021-2024) from the working directory.
field_data_21 = pd.read_csv('2021년_투수타자_데이터.csv')
field_data_22 = pd.read_csv('2022년_투수타자_데이터.csv')
# Earlier 2023/2024 inputs, disabled; the two reads below are used instead.
#field_data_23 = pd.read_csv('2023년_수비_스탯_연봉_데이터.csv')
#field_data_24 = pd.read_csv('2024년_수비_스탯_연봉_데이터.csv')

# 2023/2024 files ("중복제거" in the file name means "duplicates removed").
field_data_23 = pd.read_csv('2023_수비_statiz_중복제거.csv')
field_data_24 = pd.read_csv('2024_수비_statiz_중복제거.csv')

In [3]:
# Keep only the rows whose "포지션구분" (position category) is '타자' (batter).
field_data_21 = field_data_21.loc[field_data_21['포지션구분'].eq('타자')]
field_data_22 = field_data_22.loc[field_data_22['포지션구분'].eq('타자')]

In [4]:
# Columns placed at the front of the selected layout, and columns kept at the end.
add_cols = ["선수", "팀", 'G', 'GS', 'IP']
last_cols = ['pid', '연도']

# Every column to the right of "포지션구분" is kept. The 2022 frame is the
# reference layout: the original cell also looked the position up in the 2021
# frame, but that result was overwritten before it was ever used.
col_idx = field_data_22.columns.get_loc("포지션구분")
cols_after = list(field_data_22.columns[(col_idx + 1):])

final_cols = add_cols + cols_after + last_cols

# Both seasons are selected with the same column list, so fail with a clear
# message if the 2021 frame lacks any of them.
missing_21 = [col for col in final_cols if col not in field_data_21.columns]
if missing_21:
    raise KeyError(f"field_data_21 is missing expected columns: {missing_21}")

# Snapshot of the 2022 selection; not used by the visible cells, kept so that
# any cell relying on this name keeps working.
field_data_selected = field_data_22[final_cols]

# .copy() makes the selected frames independent of the filtered frames they
# were taken from, so the column assignments in the following cells do not
# trigger SettingWithCopyWarning.
field_data_21 = field_data_21[final_cols].copy()
field_data_22 = field_data_22[final_cols].copy()

In [5]:
# Normalise "pid_new" to an integer first, then store it as a string.
for frame in (field_data_21, field_data_22):
    frame['pid_new'] = frame['pid_new'].astype(int).astype(str)

In [6]:
# The old "pid" becomes "기존pid" (previous pid) and "pid_new" takes over the
# "pid" name. rename() maps every label through the dict once, so both renames
# can be expressed in a single mapping.
pid_renames = {"pid": "기존pid", "pid_new": "pid"}
for frame in (field_data_21, field_data_22):
    frame.rename(columns=pid_renames, inplace=True)

In [7]:
# Remove the "Rank" column from the 2023/2024 frames.
field_data_23 = field_data_23.drop('Rank', axis=1)
field_data_24 = field_data_24.drop('Rank', axis=1)

In [8]:
# Rename the English "Team"/"Name" headers of the 2023/2024 frames to the
# Korean names ("팀", "선수") used for column selection elsewhere in the notebook.
# Each frame is renamed once: the original cell repeated every call, and the
# repeat was a no-op because the old labels no longer existed.
header_map = {'Team': '팀', 'Name': '선수'}
field_data_23.rename(columns=header_map, inplace=True)
field_data_24.rename(columns=header_map, inplace=True)

In [9]:
# Columns that must come first in the 2023/2024 frames.
add_cols = ['선수', '팀', 'G', 'GS', 'IP']

# Remaining columns in their existing order. The 2024 frame is the reference
# layout: the original cell also built this list from the 2023 frame, but that
# result was overwritten before it was ever used.
rest_cols = [col for col in field_data_24.columns if col not in add_cols]

final_cols = add_cols + rest_cols

# Both seasons are reordered with the same column list, so fail with a clear
# message if the 2023 frame lacks any of them.
missing_23 = [col for col in final_cols if col not in field_data_23.columns]
if missing_23:
    raise KeyError(f"field_data_23 is missing expected columns: {missing_23}")

# .copy() makes the reordered frames independent objects, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
field_data_23 = field_data_23[final_cols].copy()
field_data_24 = field_data_24[final_cols].copy()

In [10]:
# Build the target column order from the 2021 frame: take "연도" (year) out of
# its current position and put it back just before the last column.
cols = field_data_21.columns.tolist()
cols.remove('연도')
cols.insert(len(cols) - 1, '연도')

# Apply the same order to both seasons.
field_data_21 = field_data_21.loc[:, cols]
field_data_22 = field_data_22.loc[:, cols]

In [11]:
# Tag each frame with its season in the "연도" (year) column.
for frame, season in ((field_data_23, 2023), (field_data_24, 2024)):
    frame['연도'] = season

In [12]:
# Team name -> integer team code (0-9), in declaration order.
team_dict = dict(zip(
    (
        'KIA 타이거즈',
        '두산 베어스',
        '롯데 자이언츠',
        '삼성 라이온즈',
        'SSG 랜더스',
        'NC 다이노스',
        'LG 트윈스',
        '키움 히어로즈',
        'KT 위즈',
        '한화 이글스',
    ),
    range(10),
))
# Reverse lookup keyed by the code as a string ('0'..'9') -> team name.
team_dict_reverse = dict(zip(map(str, team_dict.values()), team_dict.keys()))

In [13]:
for frame in (field_data_23, field_data_24):
    # Work on the string form of pid.
    pid_text = frame['pid'].astype(str)
    frame['pid'] = pid_text

    # "기존pid" (previous pid) is the pid without its last character.
    frame['기존pid'] = pid_text.str[:-1]

    # The last character is looked up in team_dict_reverse to get the team name.
    frame['팀'] = pid_text.str[-1].map(team_dict_reverse)

In [14]:
# Show the first two rows of every season to compare the column layouts.
for season_frame in (field_data_21, field_data_22, field_data_23, field_data_24):
    display(season_frame.head(2))

Unnamed: 0,선수,팀,G,GS,IP,WAAwithPOS,TC,PO,Ass,E,...,Blk RAA,Frm RAA,종합 RAA,/144,POSAdj,RAAwithPOS,WAAwoPOS,pid,연도,기존pid
311,하주석,한화 이글스,113,111,923.2,0.692,376.0,131.0,231.0,14.0,...,,,-2.48,-2.58,9.46,6.98,-0.246,109139,2021,10913
312,이해창,한화 이글스,20,16,149.0,0.18,129.0,112.0,16.0,1.0,...,-0.54,,0.37,2.68,1.44,1.81,0.037,103919,2021,10391


Unnamed: 0,선수,팀,G,GS,IP,WAAwithPOS,TC,PO,Ass,E,...,Blk RAA,Frm RAA,종합 RAA,/144,POSAdj,RAAwithPOS,WAAwoPOS,pid,연도,기존pid
284,최재훈,한화 이글스,106,101,853.2,1.479,725.0,644.0,78.0,3.0,...,2.67,,4.51,5.7,9.84,14.35,0.465,101709,2022,10170
285,이성곤,한화 이글스,28,20,175.1,0.048,174.0,161.0,11.0,2.0,...,,,2.07,7.25,-1.6,0.47,0.213,112069,2022,11206


Unnamed: 0,선수,팀,G,GS,IP,WAAwithPOS,TC,PO,Ass,E,...,Blk RAA,Frm RAA,종합 RAA,/144,POSAdj,RAAwithPOS,WAAwoPOS,pid,연도,기존pid
0,최재훈,한화 이글스,120,106,884.1,1.18,774,704,67,3,...,0.23,,1.95,2.24,9.62,11.57,0.199,101709,2023,10170
1,박상언,한화 이글스,79,38,388.2,0.347,368,333,34,1,...,-0.12,,-0.15,-0.24,3.55,3.41,-0.015,125879,2023,12587


Unnamed: 0,선수,팀,G,GS,IP,WAAwithPOS,TC,PO,Ass,E,...,Blk RAA,Frm RAA,종합 RAA,/144,POSAdj,RAAwithPOS,WAAwoPOS,pid,연도,기존pid
0,최재훈,한화 이글스,115,95,791.2,0.902,811,746,60,5,...,-0.79,,1.57,1.95,8.3,9.88,0.144,101709,2024,10170
1,이재원,한화 이글스,71,40,354.0,0.277,332,304,27,1,...,-1.33,,-0.49,-0.99,3.53,3.04,-0.045,100829,2024,10082


In [16]:
# Stack the four seasons row-wise and renumber the index from 0.
df_field_all = pd.concat(
    objs=[field_data_21, field_data_22, field_data_23, field_data_24],
    axis=0,
    ignore_index=True,
)
df_field_all

Unnamed: 0,선수,팀,G,GS,IP,WAAwithPOS,TC,PO,Ass,E,...,Blk RAA,Frm RAA,종합 RAA,/144,POSAdj,RAAwithPOS,WAAwoPOS,pid,연도,기존pid
0,하주석,한화 이글스,113,111,923.2,0.692,376.0,131.0,231.0,14.0,...,,,-2.48,-2.580,9.460,6.980,-0.246,109139,2021,10913
1,이해창,한화 이글스,20,16,149.0,0.180,129.0,112.0,16.0,1.0,...,-0.54,,0.37,2.680,1.440,1.810,0.037,103919,2021,10391
2,김현민,한화 이글스,3,1,13.0,0.089,7.0,2.0,5.0,0.0,...,,,0.85,24.460,0.050,0.900,0.084,142099,2021,14209
3,이성열,한화 이글스,10,7,64.0,0.019,52.0,46.0,5.0,1.0,...,,,0.76,2.370,-0.560,0.200,0.075,101689,2021,10168
4,장규현,한화 이글스,4,1,16.0,0.019,12.0,12.0,0.0,0.0,...,0.00,,0.05,1.010,0.140,0.190,0.005,148119,2021,14811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,최인호,한화 이글스,66,50,436.0,0.131,108.0,102.0,4.0,2.0,...,0.00,0.0,3.38,7.375,-1.947,1.433,0.309,147079,2024,14707
1522,채은성,한화 이글스,98,95,782.0,-0.699,618.0,588.0,23.0,7.0,...,0.00,0.0,-2.86,-4.202,-4.788,-7.648,-0.261,112159,2024,11215
1523,김인환,한화 이글스,41,30,236.0,-0.517,95.0,89.0,4.0,2.0,...,0.00,0.0,-4.84,-16.999,-0.818,-5.658,-0.442,125899,2024,12589
1524,이명기,한화 이글스,2,0,2.0,0.000,0.0,0.0,0.0,0.0,...,0.00,0.0,0.00,0.000,0.000,0.000,0.000,101199,2024,10119


In [20]:
# Write the combined 2021-2024 frame to CSV without the index column.
df_field_all.to_csv(path_or_buf='수비_스탯_데이터_2124.csv', index=False)