In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import logging
import matplotlib.font_manager as fm
from sklearn.model_selection import train_test_split
import traceback


In [2]:
# ... existing code ...
# 한글 폰트 설정
plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

warnings.filterwarnings('ignore')

# 로깅 설정
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# ... existing code ...

In [3]:
'''
# 한글 폰트 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

warnings.filterwarnings('ignore')

# 로깅 설정
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

'''



In [4]:
def load_data(file_path='BASEBALL_stats_15.xlsx'):
    """
    데이터를 로드하고 기본 정보를 출력하는 함수
    """
    logging.info("데이터 로드 시작...")
    try:
        df = pd.read_excel(file_path)
        logging.info(f"데이터 로드 완료. 데이터 크기: {df.shape}")
        
        # 필수 컬럼 확인
        required_columns = ['팀명', '경기장', '홈/원정', '날짜', '타수', '안타', '득점']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"필수 컬럼이 없습니다: {missing_columns}")
            
        # 데이터 기본 정보 출력
        logging.info(f"컬럼 목록: {df.columns.tolist()}")
        logging.info("데이터 기본 정보:")
        logging.info(f"데이터 타입:\n{df.dtypes}")
        logging.info(f"결측치 개수:\n{df.isnull().sum()}")
        
        return df
        
    except Exception as e:
        logging.error(f"데이터 로드 중 오류 발생: {str(e)}")
        raise



In [5]:
def normalize_team_names(team):
    """
    팀명을 정규화하는 함수
    """
    team = str(team).strip()
    team_mapping = {
        '넥센': '히어로즈',
        '키움': '히어로즈',
        'SK': 'SSG',
        'KT': 'KT',
        'NC': 'NC',
        'LG': 'LG',
        '두산': '두산',
        '롯데': '롯데',
        '삼성': '삼성',
        '한화': '한화',
        'KIA': 'KIA'
    }
    return team_mapping.get(team, '기타')



In [6]:
def normalize_stadium(stadium):
    """
    경기장 이름을 정규화하는 함수
    """
    stadium = str(stadium).strip()
    stadium_mapping = {
        '잠실': ['잠실'],
        '고척': ['고척'],
        '문학': ['문학'],
        '수원': ['수원'],
        '대구': ['대구'],
        '사직': ['사직'],
        '광주': ['광주'],
        '대전': ['대전'],
        '창원': ['창원', '마산']
    }
    
    for normalized, variants in stadium_mapping.items():
        if any(variant in stadium for variant in variants):
            return normalized
    return '기타'



In [7]:
def normalize_home_away(value):
    """
    홈/원정 구분을 정규화하는 함수
    """
    value = str(value).strip()
    if '홈' in value:
        return '홈'
    elif '원정' in value:
        return '원정'
    return '알수없음'



In [8]:
def calculate_team_stats(df, window=10):
    """
    팀별 이동평균 통계를 계산하는 함수
    """
    logging.info(f"팀 통계 계산 중 (window={window})...")
    stats_list = []
    
    for team in df['팀명'].unique():
        team_data = df[df['팀명'] == team].sort_values('날짜')
        
        # 통계 계산
        stats = pd.DataFrame({
            '최근승률': team_data['승리여부'].eq('승').rolling(window, min_periods=1).mean(),
            '최근평균득점': team_data['득점'].rolling(window, min_periods=1).mean(),
            '최근평균타율': team_data['안타'].rolling(window, min_periods=1).sum() / 
                        team_data['타수'].rolling(window, min_periods=1).sum(),
            '최근평균출루율': (team_data['안타'] + team_data['볼넷'] + team_data['사구']).rolling(window, min_periods=1).sum() / 
                        (team_data['타수'] + team_data['볼넷'] + team_data['사구']).rolling(window, min_periods=1).sum(),
            '최근평균장타율': team_data['루타'].rolling(window, min_periods=1).sum() / 
                        team_data['타수'].rolling(window, min_periods=1).sum()
        })
        
        stats['팀명'] = team
        stats['날짜'] = team_data['날짜']
        stats_list.append(stats)
    
    return pd.concat(stats_list)



In [20]:
"""
팀별 이동평균 통계를 계산하는 함수
"""
window = 10
logging.info(f"팀 통계 계산 중 (window={window})...")
stats_list = []

for team in df['팀명'].unique():
    team_data = df[df['팀명'] == team].sort_values('날짜')
    
    # 통계 계산
    stats = pd.DataFrame({
        '최근승률': team_data['승리여부'].eq('승').rolling(window, min_periods=1).mean(),
        '최근평균득점': team_data['득점'].rolling(window, min_periods=1).mean(),
        '최근평균타율': team_data['안타'].rolling(window, min_periods=1).sum() / 
                    team_data['타수'].rolling(window, min_periods=1).sum(),
        '최근평균출루율': (team_data['안타'] + team_data['볼넷'] + team_data['사구']).rolling(window, min_periods=1).sum() / 
                    (team_data['타수'] + team_data['볼넷'] + team_data['사구']).rolling(window, min_periods=1).sum(),
        '최근평균장타율': team_data['루타'].rolling(window, min_periods=1).sum() / 
                    team_data['타수'].rolling(window, min_periods=1).sum()
    })
    
    stats['팀명'] = team
    stats['날짜'] = team_data['날짜']
    stats_list.append(stats)

2025-05-15 09:32:08,574 - INFO - 팀 통계 계산 중 (window=10)...


In [25]:
stats

Unnamed: 0,최근승률,최근평균득점,최근평균타율,최근평균출루율,최근평균장타율,팀명,날짜
8730,1.00,5.000000,0.290323,0.352941,0.870968,기타,2021-04-04
8742,1.00,3.500000,0.237288,0.307692,0.644068,기타,2021-04-06
8754,1.00,2.333333,0.202247,0.303922,0.494382,기타,2021-04-07
8764,0.75,3.250000,0.216667,0.313869,0.516667,기타,2021-04-08
8771,0.60,3.600000,0.206667,0.327684,0.473333,기타,2021-04-09
...,...,...,...,...,...,...,...
15191,0.60,4.800000,0.246334,0.325459,0.407625,기타,2025-04-23
15201,0.60,4.500000,0.231454,0.312997,0.394659,기타,2025-04-24
15208,0.60,4.400000,0.241888,0.321900,0.398230,기타,2025-04-25
15220,0.60,4.500000,0.244186,0.317585,0.409884,기타,2025-04-26


In [21]:
team_data

Unnamed: 0,날짜,연도,경기장,팀명,홈/원정,타수,안타,득점,2루타,3루타,홈런,볼넷,사구,희생플라이,타점,병살타,폭투,관중수,타율,출루율,장타율,단타율,경기ID,상대팀,루타,승리여부,최근승률,최근평균득점,최근평균타율,최근평균출루율,최근평균장타율,상대팀최근승률,상대팀최근평균득점,상대팀최근평균타율,상대팀최근평균출루율,상대팀최근평균장타율,주말,월,팀명_인코딩,상대팀_인코딩,경기장_인코딩,홈/원정_인코딩
8730,2021-04-04,2021,문학,기타,홈,31,9,5,1,0,4,3,0,0,5,2,0,2300,0.290,0.353,0.710,0.129,4298,롯데,27,승,1.00,5.000000,0.290323,0.352941,0.870968,0.3,4.2,0.263768,0.338542,0.582609,1,4,5,7,5,1
8742,2021-04-06,2021,문학,기타,홈,28,5,2,1,0,1,3,0,0,2,0,0,2268,0.179,0.258,0.321,0.107,4303,한화,11,승,1.00,3.500000,0.237288,0.307692,0.644068,0.2,3.8,0.242075,0.322165,0.365994,0,4,5,9,5,1
8754,2021-04-07,2021,문학,기타,홈,30,4,0,1,0,0,6,1,0,0,1,1,2264,0.133,0.297,0.167,0.100,4308,한화,6,승,1.00,2.333333,0.202247,0.303922,0.494382,0.2,5.1,0.263305,0.347395,0.411765,0,4,5,9,5,1
8764,2021-04-08,2021,문학,기타,홈,31,8,6,1,0,2,3,1,1,5,0,1,2278,0.258,0.333,0.484,0.161,4313,한화,18,패,0.75,3.250000,0.216667,0.313869,0.516667,0.3,5.1,0.247126,0.335025,0.393678,0,4,5,9,5,1
8771,2021-04-09,2021,잠실,기타,원정,30,5,5,0,0,1,6,4,2,5,1,0,2414,0.167,0.357,0.267,0.133,4321,LG,9,패,0.60,3.600000,0.206667,0.327684,0.473333,0.6,5.0,0.224784,0.339066,0.365994,0,4,5,2,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15191,2025-04-23,2025,수원,기타,원정,44,17,11,1,0,4,2,0,0,11,0,0,7581,0.386,0.413,0.682,0.273,7242,KT,35,패,0.60,4.800000,0.246334,0.325459,0.407625,0.8,5.1,0.283582,0.370079,0.462687,0,4,5,1,7,0
15201,2025-04-24,2025,수원,기타,원정,29,6,6,2,0,0,7,1,1,6,0,0,7815,0.207,0.368,0.276,0.138,7247,KT,10,패,0.60,4.500000,0.231454,0.312997,0.394659,0.9,5.1,0.277778,0.358442,0.464912,0,4,5,1,7,0
15208,2025-04-25,2025,문학,기타,홈,32,9,4,0,0,1,2,1,0,4,1,0,12140,0.281,0.343,0.375,0.250,7252,히어로즈,13,패,0.60,4.400000,0.241888,0.321900,0.398230,0.4,2.9,0.205438,0.271468,0.341390,0,4,5,10,5,1
15220,2025-04-26,2025,문학,기타,홈,33,4,1,2,0,0,1,0,0,1,0,1,18641,0.121,0.147,0.182,0.061,7257,히어로즈,8,승,0.60,4.500000,0.244186,0.317585,0.409884,0.4,2.5,0.195783,0.256267,0.307229,1,4,5,10,5,1


In [24]:
team_data['득점'].rolling(window, min_periods=1).mean()

8730     5.000000
8742     3.500000
8754     2.333333
8764     3.250000
8771     3.600000
           ...   
15191    4.800000
15201    4.500000
15208    4.400000
15220    4.500000
15228    4.400000
Name: 득점, Length: 681, dtype: float64

In [9]:
def determine_match_result(group):
    """
    경기별 승패를 결정하는 함수
    """
    if len(group) != 2:
        return ['무'] * len(group)
    score1, score2 = group['득점'].values
    if score1 > score2:
        return ['승', '패']
    elif score1 < score2:
        return ['패', '승']
    return ['무', '무']



In [10]:
def preprocess_data(df):
    """
    데이터 전처리를 수행하는 함수
    """
    logging.info("데이터 전처리 시작...")
    
    # 날짜 형식 변환
    df['날짜'] = pd.to_datetime(df['날짜'])
    
    # 경기 ID 생성
    df['경기ID'] = df.groupby(['날짜', '경기장']).ngroup()
    
    # 상대팀 정보 추가
    df['상대팀'] = df.groupby(['경기ID'])['팀명'].transform(lambda x: x.iloc[::-1].values if len(x) == 2 else None)
    
    # 팀명, 경기장, 홈/원정 정규화
    df['팀명'] = df['팀명'].apply(normalize_team_names)
    df['상대팀'] = df['상대팀'].apply(normalize_team_names)
    df['경기장'] = df['경기장'].apply(normalize_stadium)
    df['홈/원정'] = df['홈/원정'].apply(normalize_home_away)
    
    # 루타 계산
    df['루타'] = df['안타'] + df['2루타']*2 + df['3루타']*3 + df['홈런']*4
    
    # 승리여부 계산
    df['승리여부'] = df.groupby('경기ID').apply(
        lambda x: pd.Series(determine_match_result(x), index=x.index)
    ).values
    
    # 팀 통계 계산
    team_stats = calculate_team_stats(df)
    df = df.merge(team_stats, on=['팀명', '날짜'], how='left')
    
    # 상대팀 통계 계산
    opponent_stats = team_stats.copy()
    opponent_stats.columns = ['상대팀' + col if col not in ['팀명', '날짜'] else col 
                            for col in opponent_stats.columns]
    opponent_stats = opponent_stats.rename(columns={'팀명': '상대팀'})
    df = df.merge(opponent_stats, on=['상대팀', '날짜'], how='left')
    
    # 시간 관련 피처 생성
    df['주말'] = df['날짜'].dt.dayofweek.isin([5, 6]).astype(int)
    df['월'] = df['날짜'].dt.month
    
    return df



In [11]:
def encode_categorical_features(df):
    """
    범주형 변수를 인코딩하는 함수
    """
    logging.info("범주형 변수 인코딩 중...")
    le = LabelEncoder()
    categorical_columns = ['팀명', '상대팀', '경기장', '홈/원정']
    
    for col in categorical_columns:
        df[col] = df[col].astype(str).str.strip()
        df[col] = df[col].replace('nan', '알수없음')
        df[col + '_인코딩'] = le.fit_transform(df[col])
    
    return df



In [12]:
def train_models(X_train_scaled, y_train):
    """
    XGBoost와 LightGBM 모델을 학습하는 함수
    """
    logging.info("\n모델 학습 중...")
    
    # XGBoost 모델
    logging.info("XGBoost 모델 학습...")
    xgb_model = xgb.XGBClassifier(
        learning_rate=0.03,
        max_depth=5,
        n_estimators=300,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42
    )
    xgb_model.fit(X_train_scaled, y_train)
    
    # LightGBM 모델
    logging.info("LightGBM 모델 학습...")
    
    # LightGBM 데이터셋 생성
    train_data = lgb.Dataset(X_train_scaled, label=y_train)
    
    # LightGBM 파라미터 설정
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'max_depth': 4,
        'num_leaves': 15,
        'min_child_samples': 20,
        'min_child_weight': 1e-3,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_gain_to_split': 1e-7,
        'min_data_in_leaf': 20,
        'max_bin': 255,
        'verbose': -1,
        'random_state': 42,
        'force_col_wise': True  # 멀티스레딩 방식 강제 지정
    }
    
    # 모델 학습
    lgb_model = lgb.train(
        params,
        train_data,
        num_boost_round=200
    )
    
    return xgb_model, lgb_model



In [13]:
def visualize_feature_importance(model, features, output_file):
    """
    특성 중요도를 시각화하는 함수
    """
    feature_names_korean = {
        '팀명_인코딩': '팀명',
        '상대팀_인코딩': '상대팀',
        '경기장_인코딩': '경기장',
        '홈/원정_인코딩': '홈/원정',
        '최근승률': '최근 승률',
        '최근평균득점': '최근 평균 득점',
        '최근평균타율': '최근 평균 타율',
        '최근평균출루율': '최근 평균 출루율',
        '최근평균장타율': '최근 평균 장타율',
        '상대팀최근승률': '상대팀 최근 승률',
        '상대팀최근평균득점': '상대팀 최근 평균 득점',
        '주말': '주말 경기',
        '월': '월별 경기'
    }
    
    plt.figure(figsize=(12, 8))
    importance = pd.Series(model.feature_importances_, 
                         index=[feature_names_korean[f] for f in features])
    importance.sort_values(ascending=True).plot(kind='barh')
    plt.title('XGBoost 특성 중요도', fontsize=14, pad=20)
    plt.xlabel('중요도', fontsize=12)
    plt.ylabel('특성', fontsize=12)
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()



In [14]:
def main():
    try:
        # 1. 데이터 로드
        df = load_data()
        
        # 2. 데이터 전처리
        df = preprocess_data(df)
        
        # 3. 범주형 변수 인코딩
        df = encode_categorical_features(df)
        
        # 4. 특성 선택
        features = ['팀명_인코딩', '상대팀_인코딩', '경기장_인코딩', '홈/원정_인코딩',
                   '최근승률', '최근평균득점', '최근평균타율', '최근평균출루율', '최근평균장타율',
                   '상대팀최근승률', '상대팀최근평균득점', '주말', '월']
        
        # 5. 데이터 분할
        X = df[features]
        y = (df['승리여부'] == '승').astype(int)
        
        tscv = TimeSeriesSplit(n_splits=5)
        splits = list(tscv.split(X))
        train_index, test_index = splits[-1]
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # 6. 스케일링
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # 7. 모델 학습
        xgb_model, lgb_model = train_models(X_train_scaled, y_train)
        
        # 8. 예측
        xgb_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]
        lgb_pred_proba = lgb_model.predict(X_test_scaled)
        
        # 9. 앙상블
        ensemble_pred_proba = 0.6 * xgb_pred_proba + 0.4 * lgb_pred_proba
        ensemble_predictions = (ensemble_pred_proba > 0.5).astype(int)
        
        # 10. 성능 평가
        accuracy = accuracy_score(y_test, ensemble_predictions)
        auc = roc_auc_score(y_test, ensemble_pred_proba)
        
        logging.info(f'\n앙상블 모델 정확도: {accuracy:.4f}')
        logging.info(f'앙상블 모델 AUC: {auc:.4f}')
        
        # 11. 특성 중요도 시각화
        visualize_feature_importance(xgb_model, features, 'feature_importance.png')
        
        # 12. 결과 저장
        results = pd.DataFrame({
            '실제값': y_test,
            '예측값': ensemble_predictions,
            '예측확률': ensemble_pred_proba
        })
        results.to_csv('prediction_results.csv', index=False, encoding='utf-8-sig')
        
        logging.info("\n모든 과정이 완료되었습니다.")
        
    except Exception as e:
        logging.error(f"\n오류 발생: {str(e)}")
        logging.error("\n상세 오류 정보:")
        logging.error(traceback.format_exc())



# ----Main----

In [26]:
try:
    # 1. 데이터 로드
    df_origin = load_data()
    df = df_origin.copy()
    
    # 2. 데이터 전처리
    df = preprocess_data(df)
    
    # 3. 범주형 변수 인코딩
    df = encode_categorical_features(df)
    
    # 4. 특성 선택
    features = ['팀명_인코딩', '상대팀_인코딩', '경기장_인코딩', '홈/원정_인코딩',
                '최근승률', '최근평균득점', '최근평균타율', '최근평균출루율', '최근평균장타율',
                '상대팀최근승률', '상대팀최근평균득점', '주말', '월']
    
    # 5. 데이터 분할
    X = df[features]
    y = (df['승리여부'] == '승').astype(int)
    
    tscv = TimeSeriesSplit(n_splits=5)
    splits = list(tscv.split(X))
    train_index, test_index = splits[-1]
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # 6. 스케일링
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # 7. 모델 학습
    xgb_model, lgb_model = train_models(X_train_scaled, y_train)
    
    # 8. 예측
    xgb_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]
    lgb_pred_proba = lgb_model.predict(X_test_scaled)
    
    # 9. 앙상블
    ensemble_pred_proba = 0.6 * xgb_pred_proba + 0.4 * lgb_pred_proba
    ensemble_predictions = (ensemble_pred_proba > 0.5).astype(int)
    
    # 10. 성능 평가
    accuracy = accuracy_score(y_test, ensemble_predictions)
    auc = roc_auc_score(y_test, ensemble_pred_proba)
    
    logging.info(f'\n앙상블 모델 정확도: {accuracy:.4f}')
    logging.info(f'앙상블 모델 AUC: {auc:.4f}')
    
    # 11. 특성 중요도 시각화
    visualize_feature_importance(xgb_model, features, 'feature_importance.png')
    
    # 12. 결과 저장
    results = pd.DataFrame({
        '실제값': y_test,
        '예측값': ensemble_predictions,
        '예측확률': ensemble_pred_proba
    })
    results.to_csv('prediction_results.csv', index=False, encoding='utf-8-sig')
    
    logging.info("\n모든 과정이 완료되었습니다.")
    
except Exception as e:
    logging.error(f"\n오류 발생: {str(e)}")
    logging.error("\n상세 오류 정보:")
    logging.error(traceback.format_exc())

2025-05-15 10:12:50,589 - INFO - 데이터 로드 시작...
2025-05-15 10:12:52,652 - INFO - 데이터 로드 완료. 데이터 크기: (14682, 22)
2025-05-15 10:12:52,653 - INFO - 컬럼 목록: ['날짜', '연도', '경기장', '팀명', '홈/원정', '타수', '안타', '득점', '2루타', '3루타', '홈런', '볼넷', '사구', '희생플라이', '타점', '병살타', '폭투', '관중수', '타율', '출루율', '장타율', '단타율']
2025-05-15 10:12:52,654 - INFO - 데이터 기본 정보:
2025-05-15 10:12:52,655 - INFO - 데이터 타입:
날짜       datetime64[ns]
연도                int64
경기장              object
팀명               object
홈/원정             object
타수                int64
안타                int64
득점                int64
2루타               int64
3루타               int64
홈런                int64
볼넷                int64
사구                int64
희생플라이             int64
타점                int64
병살타               int64
폭투                int64
관중수               int64
타율              float64
출루율             float64
장타율             float64
단타율             float64
dtype: object
2025-05-15 10:12:52,660 - INFO - 결측치 개수:
날짜       0
연도       0
경기장      0
팀명 

In [29]:
df_origin.head(20)

Unnamed: 0,날짜,연도,경기장,팀명,홈/원정,타수,안타,득점,2루타,3루타,홈런,볼넷,사구,희생플라이,타점,병살타,폭투,관중수,타율,출루율,장타율,단타율
0,2015-03-28,2015,대구,삼성,홈,38,13,6,3,0,0,4,0,0,6,0,0,10000,0.342,0.405,0.421,0.263
1,2015-03-28,2015,대구,SK,원정,31,5,1,1,1,0,5,0,0,1,1,1,10000,0.161,0.278,0.258,0.097
2,2015-03-28,2015,목동,넥센,홈,40,8,5,2,0,2,6,0,0,5,1,0,12500,0.2,0.304,0.4,0.1
3,2015-03-28,2015,목동,한화,원정,42,10,4,2,0,0,7,1,1,4,0,1,12500,0.238,0.353,0.286,0.19
4,2015-03-28,2015,잠실,두산,홈,35,12,9,0,1,2,5,0,2,9,0,0,21746,0.343,0.405,0.571,0.257
5,2015-03-28,2015,잠실,NC,원정,31,9,4,0,0,0,4,1,2,4,1,1,21746,0.29,0.368,0.29,0.29
6,2015-03-28,2015,광주,KIA,홈,30,8,3,1,1,1,3,0,1,3,1,1,22000,0.267,0.324,0.467,0.167
7,2015-03-28,2015,광주,LG,원정,29,7,1,2,1,0,4,0,0,1,2,0,22000,0.241,0.333,0.379,0.138
8,2015-03-28,2015,사직,롯데,홈,37,14,12,5,0,2,3,1,0,12,1,0,27500,0.378,0.439,0.676,0.189
9,2015-03-28,2015,사직,KT,원정,35,14,9,1,0,2,8,1,1,8,1,1,27500,0.4,0.511,0.6,0.314


In [32]:
X.head(20)

Unnamed: 0,팀명_인코딩,상대팀_인코딩,경기장_인코딩,홈/원정_인코딩,최근승률,최근평균득점,최근평균타율,최근평균출루율,최근평균장타율,상대팀최근승률,상대팀최근평균득점,주말,월
0,8,4,3,1,1.0,6.0,0.342105,0.404762,0.5,0.0,1.0,1,3
1,4,8,3,0,0.0,1.0,0.16129,0.277778,0.322581,1.0,6.0,1,3
2,10,9,2,1,1.0,5.0,0.2,0.304348,0.5,0.0,4.0,1,3
3,9,10,2,0,0.0,4.0,0.238095,0.36,0.333333,1.0,5.0,1,3
4,6,3,8,1,1.0,9.0,0.342857,0.425,0.657143,0.0,4.0,1,3
5,3,6,8,0,0.0,4.0,0.290323,0.388889,0.290323,1.0,9.0,1,3
6,0,2,1,1,1.0,3.0,0.266667,0.333333,0.566667,0.0,1.0,1,3
7,2,0,1,0,0.0,1.0,0.241379,0.333333,0.482759,1.0,3.0,1,3
8,7,1,6,1,1.0,12.0,0.378378,0.439024,0.864865,0.0,9.0,1,3
9,1,7,6,0,0.0,9.0,0.4,0.522727,0.685714,1.0,12.0,1,3


In [34]:
y.head(20)

0     1
1     0
2     1
3     0
4     1
5     0
6     1
7     0
8     1
9     0
10    1
11    0
12    0
13    1
14    0
15    1
16    1
17    0
18    1
19    0
Name: 승리여부, dtype: int64

In [16]:
# 모든 열을 표시하도록 설정
pd.set_option('display.max_columns', None)  # 모든 열 표시
pd.set_option('display.width', None)        # 출력 너비 제한 해제
pd.set_option('display.max_rows', 50)     # 모든 행 표시

In [17]:
df

Unnamed: 0,날짜,연도,경기장,팀명,홈/원정,타수,안타,득점,2루타,3루타,홈런,볼넷,사구,희생플라이,타점,병살타,폭투,관중수,타율,출루율,장타율,단타율,경기ID,상대팀,루타,승리여부,최근승률,최근평균득점,최근평균타율,최근평균출루율,최근평균장타율,상대팀최근승률,상대팀최근평균득점,상대팀최근평균타율,상대팀최근평균출루율,상대팀최근평균장타율,주말,월,팀명_인코딩,상대팀_인코딩,경기장_인코딩,홈/원정_인코딩
0,2015-03-28,2015,대구,삼성,홈,38,13,6,3,0,0,4,0,0,6,0,0,10000,0.342,0.405,0.421,0.263,1,SSG,19,승,1.0,6.0,0.342105,0.404762,0.500000,0.0,1.0,0.161290,0.277778,0.322581,1,3,8,4,3,1
1,2015-03-28,2015,대구,SSG,원정,31,5,1,1,1,0,5,0,0,1,1,1,10000,0.161,0.278,0.258,0.097,1,삼성,10,패,0.0,1.0,0.161290,0.277778,0.322581,1.0,6.0,0.342105,0.404762,0.500000,1,3,4,8,3,0
2,2015-03-28,2015,기타,히어로즈,홈,40,8,5,2,0,2,6,0,0,5,1,0,12500,0.200,0.304,0.400,0.100,2,한화,20,승,1.0,5.0,0.200000,0.304348,0.500000,0.0,4.0,0.238095,0.360000,0.333333,1,3,10,9,2,1
3,2015-03-28,2015,기타,한화,원정,42,10,4,2,0,0,7,1,1,4,0,1,12500,0.238,0.353,0.286,0.190,2,히어로즈,14,패,0.0,4.0,0.238095,0.360000,0.333333,1.0,5.0,0.200000,0.304348,0.500000,1,3,9,10,2,0
4,2015-03-28,2015,잠실,두산,홈,35,12,9,0,1,2,5,0,2,9,0,0,21746,0.343,0.405,0.571,0.257,4,NC,23,승,1.0,9.0,0.342857,0.425000,0.657143,0.0,4.0,0.290323,0.388889,0.290323,1,3,6,3,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15233,2025-04-27,2025,광주,LG,원정,30,5,2,1,0,0,5,0,0,1,2,0,20500,0.167,0.286,0.200,0.133,7259,KIA,7,패,0.2,4.3,0.235650,0.344560,0.422961,0.5,4.3,0.254491,0.330645,0.497006,1,4,2,0,1,0
15234,2025-04-27,2025,잠실,두산,홈,34,13,13,4,0,1,10,1,1,13,2,0,23750,0.382,0.522,0.588,0.235,7263,롯데,25,패,0.5,6.2,0.292754,0.390000,0.544928,0.5,5.6,0.303371,0.360825,0.525281,1,4,6,7,8,1
15235,2025-04-27,2025,잠실,롯데,원정,32,7,4,3,0,1,3,1,0,4,2,1,23750,0.219,0.306,0.406,0.094,7263,두산,17,승,0.5,5.6,0.303371,0.360825,0.525281,0.5,6.2,0.292754,0.390000,0.544928,1,4,7,6,8,0
15236,2025-04-27,2025,대구,삼성,홈,38,14,8,1,0,3,5,0,0,8,0,0,24000,0.368,0.442,0.632,0.263,7260,NC,28,승,0.4,7.3,0.331461,0.422330,0.632022,0.7,3.6,0.206452,0.301136,0.406452,1,4,8,3,3,1


In [18]:
X

Unnamed: 0,팀명_인코딩,상대팀_인코딩,경기장_인코딩,홈/원정_인코딩,최근승률,최근평균득점,최근평균타율,최근평균출루율,최근평균장타율,상대팀최근승률,상대팀최근평균득점,주말,월
0,8,4,3,1,1.0,6.0,0.342105,0.404762,0.500000,0.0,1.0,1,3
1,4,8,3,0,0.0,1.0,0.161290,0.277778,0.322581,1.0,6.0,1,3
2,10,9,2,1,1.0,5.0,0.200000,0.304348,0.500000,0.0,4.0,1,3
3,9,10,2,0,0.0,4.0,0.238095,0.360000,0.333333,1.0,5.0,1,3
4,6,3,8,1,1.0,9.0,0.342857,0.425000,0.657143,0.0,4.0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15233,2,0,1,0,0.2,4.3,0.235650,0.344560,0.422961,0.5,4.3,1,4
15234,6,7,8,1,0.5,6.2,0.292754,0.390000,0.544928,0.5,5.6,1,4
15235,7,6,8,0,0.5,5.6,0.303371,0.360825,0.525281,0.5,6.2,1,4
15236,8,3,3,1,0.4,7.3,0.331461,0.422330,0.632022,0.7,3.6,1,4


In [1]:
!pip install xgboost

