In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,  mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV

import seaborn as sns

# Load the dataset from a CSV file in the working directory.
data = pd.read_csv("0627_PCE_metod1.csv")

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Features are every numeric column except the target column 'PCE'.
x_train = train_data.select_dtypes(include=['number']).drop(columns=['PCE'])
x_test = test_data.select_dtypes(include=['number']).drop(columns=['PCE'])

# Targets for each split.
y_train = train_data['PCE']
y_true = test_data['PCE']  # actual values for the test data

import xgboost
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler


# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so no test statistics leak
# into the preprocessing.
scaler = StandardScaler()
x_trainScaled_np = scaler.fit_transform(x_train)
x_train_scaled = pd.DataFrame(x_trainScaled_np, index=x_train.index, columns=x_train.columns)
# Wrap the scaled test matrix in a DataFrame as well, so both splits carry the
# same column labels and row index instead of mixing a named DataFrame (train)
# with a bare ndarray (test).
x_test_scaled = pd.DataFrame(scaler.transform(x_test), index=x_test.index, columns=x_test.columns)


# Hyperparameter grid searched exhaustively:
# 4 depths x 3 tree counts x 2 learning rates = 24 candidates.
param_grid = {
    "max_depth": [3, 4, 5, 6],
    "n_estimators": [500, 600, 700],
    "learning_rate": [0.01, 0.015]
}

# Each candidate is evaluated with 5-fold cross-validation on the training
# split. No `scoring` is passed, so candidates are ranked by the estimator's
# default `score` method. n_jobs=-1 runs the fits in parallel.
# NOTE(review): no eval_set is passed to fit, so eval_metric='rmsle' is
# presumably never evaluated during training -- confirm it is still wanted.
search = GridSearchCV(XGBRegressor(eval_metric='rmsle'), param_grid, cv=5, n_jobs=-1)
search.fit(x_train_scaled, y_train)


# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the full training set. Reuse that fitted model instead of
# constructing an identical XGBRegressor and training it a second time.
regressor = search.best_estimator_

# Score the selected model with 5-fold cross-validation on the training split,
# using R-squared as the metric (one score per fold).
cross_val_scores = cross_val_score(
    estimator=regressor,
    X=x_train_scaled,
    y=y_train,
    scoring='r2',
    cv=5,
)

# Summarize the per-fold scores.
cv_mean = cross_val_scores.mean()
cv_std = cross_val_scores.std()

print(f"Cross-validation R-squared scores: {cross_val_scores}")
print(f"Mean R-squared: {cv_mean:.5f}")
print(f"Standard deviation: {cv_std:.5f}")


def _regression_metrics(actual, predicted):
    """Return (R-squared, MAE, RMSE) for predictions against actual values."""
    r2 = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    return r2, mae, rmse


# Predict on both splits with the fitted model.
train_predictions = regressor.predict(x_train_scaled)
predictions = regressor.predict(x_test_scaled)

# Compare training and test metrics to see how well the model generalizes.
train_r2, train_mae, train_rmse = _regression_metrics(y_train, train_predictions)
test_r2, test_mae, test_rmse = _regression_metrics(y_true, predictions)

print(f"Training R-squared: {train_r2:.5f}")
print(f"Training Mean Absolute Error: {train_mae:.5f},RMSE: {train_rmse:.5f}")
print(f"Test R-squared: {test_r2:.5f}")
print(f"Test Mean Absolute Error: {test_mae:.5f}, RMSE: {test_rmse:.5f}")


Cross-validation R-squared scores: [0.76513119 0.77011364 0.81779468 0.83786625 0.86241096]
Mean R-squared: 0.81066
Standard deviation: 0.03791
Training R-squared: 0.88675
Training Mean Absolute Error: 1.00298,RMSE: 1.57333
Test R-squared: 0.86474
Test Mean Absolute Error: 1.14529, RMSE: 1.67926
