In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import numpy as np

# Load the merged dataset from disk.
df = pd.read_csv('merged_df_240802.csv')

# Columns removed before modelling. The item/perk names are generated, but the
# resulting list is the same sequence of labels, in the same order, as spelling
# them out by hand: item0..item6 and perk1..perk6 each with _var1.._var3.
columns_to_exclude = [
    'Unnamed: 0.1', 'Unnamed: 0', 'matchId', 'gameType', 'championName_x',
    *[f'item{slot}' for slot in range(7)],
    'participantId_x', 'teamId', 'teamPosition',
    *[
        f'perk{slot}{suffix}'
        for slot in range(1, 7)
        for suffix in ('', '_var1', '_var2', '_var3')
    ],
    'season', 'CcontrolWardTimeCoverageInRiverOrEnemyHalf', 'cluster', 'teamcolor',
]

# New frame without the excluded columns; `df` itself is not modified.
df_filtered = df.drop(columns=columns_to_exclude)

# Every column positioned after 'damperdeath' is treated as a time-series
# column, except those whose name starts with 'position_'.
time_series_start_index = df_filtered.columns.get_loc('damperdeath') + 1
time_series_columns = [
    name
    for name in df_filtered.columns[time_series_start_index:]
    if not name.startswith('position_')
]

# Keep only the time-series columns whose trailing "_<n>" suffix parses to an
# integer <= 10 (the original note describes this as "10 minutes or less").
time_series_columns_10min = [
    name
    for name in time_series_columns
    if int(name.rsplit('_', 1)[-1]) <= 10
]

# Final frame: the leading (non-time-series) columns followed by the retained
# time-series columns.
final_columns = [
    *df_filtered.columns[:time_series_start_index],
    *time_series_columns_10min,
]
df_final = df_filtered[final_columns]

# Target / feature definition ('win_x' is assumed to be the target variable).
target = 'win_x'
features = [name for name in df_final.columns if name != target]

# Hold out 20% of the rows as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df_final[features],
    df_final[target],
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only and
# then applied to both splits. From here on X_train / X_test hold the scaler's
# transformed output rather than the original DataFrames.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
# Model initialisation (both estimators use a fixed random_state).
xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
rf_model = RandomForestRegressor(n_estimators=200, max_depth=5, random_state=42)

# Cross-validation setup: 5 shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


def _cross_validate_mse(model, X, y, splitter):
    """Fit ``model`` on each fold of ``splitter`` and collect per-fold MSE.

    This replaces two copy-pasted fold loops (one per model) with a single
    implementation so both models are evaluated by exactly the same code.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place on every fold; after the call it holds the fit from
        the last fold.
    X : array supporting positional indexing with an index array
        (``X[idx]``), e.g. the array produced by the scaler.
    y : pandas Series
        Target values, indexed positionally via ``.iloc``.
    splitter : object with ``split(X)`` yielding ``(train_idx, val_idx)`` pairs

    Returns
    -------
    tuple[list, list]
        ``(train_errors, val_errors)`` with one mean-squared-error value per
        fold, in fold order.
    """
    train_errors = []
    val_errors = []
    for train_index, val_index in splitter.split(X):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        model.fit(X_train_fold, y_train_fold)

        # Error on the data the model was just fitted on vs. the held-out fold.
        train_errors.append(mean_squared_error(y_train_fold, model.predict(X_train_fold)))
        val_errors.append(mean_squared_error(y_val_fold, model.predict(X_val_fold)))
    return train_errors, val_errors


print("XGBoost 모델 교차 검증 중...")
xgb_train_errors, xgb_val_errors = _cross_validate_mse(xgb_model, X_train, y_train, kf)

print("RandomForest 모델 교차 검증 중...")
rf_train_errors, rf_val_errors = _cross_validate_mse(rf_model, X_train, y_train, kf)


# Final evaluation: refit each model on the whole training split, predict the
# held-out test split, and report MSE and R2.
xgb_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

xgb_test_predictions = xgb_model.predict(X_test)
rf_test_predictions = rf_model.predict(X_test)

xgb_test_mse, xgb_test_r2 = (
    mean_squared_error(y_test, xgb_test_predictions),
    r2_score(y_test, xgb_test_predictions),
)
rf_test_mse, rf_test_r2 = (
    mean_squared_error(y_test, rf_test_predictions),
    r2_score(y_test, rf_test_predictions),
)

print(f"XGBoost 테스트 MSE: {xgb_test_mse}, R2: {xgb_test_r2}")
print(f"RandomForest 테스트 MSE: {rf_test_mse}, R2: {rf_test_r2}")

# Plot per-fold train/validation MSE for both models, one panel per model.
folds = np.arange(1, kf.get_n_splits() + 1)

fig, (ax_xgb, ax_rf) = plt.subplots(1, 2, figsize=(14, 6))

# (axes, panel title, train errors, validation errors) for each model.
# NOTE(review): the titles and axis labels contain Hangul text; confirm the
# active matplotlib font can render these glyphs.
panels = (
    (ax_xgb, 'XGBoost 교차 검증 오류', xgb_train_errors, xgb_val_errors),
    (ax_rf, 'RandomForest 교차 검증 오류', rf_train_errors, rf_val_errors),
)
for ax, panel_title, train_curve, val_curve in panels:
    ax.plot(folds, train_curve, marker='o', label='train error')
    ax.plot(folds, val_curve, marker='o', label='validation error')
    ax.set_title(panel_title)
    ax.set_xlabel('폴드')
    ax.set_ylabel('평균 제곱 오차')
    ax.legend()

fig.tight_layout()
plt.show()

In [1]:
# Show every column name of the raw frame. `df` is created by the cell that
# reads the CSV, so that cell has to be executed before this one.
list(df.columns)

NameError: name 'df' is not defined