Task1_0730. 교차 검증을 위한 사용자 함수를 생성한 후 아래 회귀트리 모델들에 대한 교차검증을 수행하세요.
- dt_reg = DecisionTreeRegressor(random_state=0, max_depth=4)
- rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)
- gb_reg = GradientBoostingRegressor(random_state=0, n_estimators=1000)
- xgb_reg = XGBRegressor(n_estimators=1000)
- lgb_reg = LGBMRegressor(n_estimators=1000, verbose=-1)

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

boston = fetch_openml(name='Boston', version=1, parser='auto')
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df.drop(['CHAS','RAD'], axis=1, inplace=True)
boston_df['PRICE'] = boston.target

for col in boston_df.columns:
    if boston_df[col].dtype.name == 'category':
        boston_df[col] = boston_df[col].cat.codes
        boston_df[col] = boston_df[col].astype(float)
        
X_data = boston_df.drop("PRICE", axis=1)
y_target = boston_df['PRICE']

X_train, X_test, y_train, y_test = train_test_split(X_data, y_target, test_size=0.3, random_state=42)

model_lists = ['dt_reg', 'rf_reg', 'gb_reg', 'xgb_reg', 'lgb_reg']

for model in model_lists:
    if model == 'dt_reg':
        result = DecisionTreeRegressor(random_state=0, max_depth=4)
    elif model == 'rf_reg':
        result = RandomForestRegressor(random_state=0, n_estimators=1000)
    elif model == 'gb_reg':
        result = GradientBoostingRegressor(random_state=0, n_estimators=1000)
    elif model =='xgb_reg':
        result = XGBRegressor(n_estimators=1000)
    else:
        result = LGBMRegressor(n_estimators=1000, verbose=-1)
    
    result_reg = result.fit(X_train, y_train)
    pred_result = result_reg.predict(X_test)
    
    mse = mean_squared_error(y_test, pred_result)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, pred_result)
    print(f'Mean Squared Error: {mse:.4f}')
    print(f'Root Mean Squared Error: {rmse:.4f}')
    print(f'R^2 Score: {r2:.4f}')




Mean Squared Error: 20.4499
Root Mean Squared Error: 4.5222
R^2 Score: 0.7256
Mean Squared Error: 9.3171
Root Mean Squared Error: 3.0524
R^2 Score: 0.8750
Mean Squared Error: 7.9282
Root Mean Squared Error: 2.8157
R^2 Score: 0.8936
Mean Squared Error: 9.5584
Root Mean Squared Error: 3.0917
R^2 Score: 0.8717
Mean Squared Error: 10.3390
Root Mean Squared Error: 3.2154
R^2 Score: 0.8612


In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

boston = fetch_openml(name="Boston", version=1, parser="auto")
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df.drop(["CHAS", "RAD"], axis=1, inplace=True)
boston_df["PRICE"] = boston.target

for col in boston_df.columns:
    if boston_df[col].dtype.name == "category":
        boston_df[col] = boston_df[col].cat.codes
        boston_df[col] = boston_df[col].astype(float)

X_data = boston_df.drop("PRICE", axis=1)
y_target = boston_df["PRICE"]

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_target, test_size=0.3, random_state=42
)

model_lists = ["dt_reg", "rf_reg", "gb_reg", "xgb_reg", "lgb_reg"]

def reg_eval(model_lists):
    for model in model_lists:
        if model == "dt_reg":
            result = DecisionTreeRegressor(random_state=0, max_depth=4)
        elif model == "rf_reg":
            result = RandomForestRegressor(random_state=0, n_estimators=1000)
        elif model == "gb_reg":
            result = GradientBoostingRegressor(random_state=0, n_estimators=1000)
        elif model == "xgb_reg":
            result = XGBRegressor(n_estimators=1000)
        else:
            result = LGBMRegressor(n_estimators=1000, verbose=-1)

        result_reg = result.fit(X_train, y_train)
        pred_result = result_reg.predict(X_test)

        mse = mean_squared_error(y_test, pred_result)
        rmse = sqrt(mse)
        r2 = r2_score(y_test, pred_result)
        print(f'{model} 평가')
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"Root Mean Squared Error: {rmse:.4f}")
        print(f"R^2 Score: {r2:.4f}\n")

reg_eval(model_lists)

dt_reg 평가
Mean Squared Error: 20.4499
Root Mean Squared Error: 4.5222
R^2 Score: 0.7256

rf_reg 평가
Mean Squared Error: 9.3171
Root Mean Squared Error: 3.0524
R^2 Score: 0.8750

gb_reg 평가
Mean Squared Error: 7.9282
Root Mean Squared Error: 2.8157
R^2 Score: 0.8936

xgb_reg 평가
Mean Squared Error: 9.5584
Root Mean Squared Error: 3.0917
R^2 Score: 0.8717

lgb_reg 평가
Mean Squared Error: 10.3390
Root Mean Squared Error: 3.2154
R^2 Score: 0.8612



Task2_0730. iris 데이터셋에 대하여 랜덤포레스트로 학습 및 평가한 결과와 차원축소한 후 c1,c2를 적용하여 학습 평가한 결과를 비교하셍요. (단 cross_val_score를 적용하여 평가)

In [13]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

iris = load_iris()

df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
df_iris['target'] = iris.target
X = df_iris.drop("target", axis=1)
y = df_iris['target']
iris_scaled = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(iris_scaled, y, test_size=0.3, random_state=42)

# random forest eval
rf = RandomForestRegressor(n_estimators=1000, random_state=0)
neg_mse_scores = cross_val_score(rf, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

print("5 folds의 개별 Negative MSE scores:", np.round(neg_mse_scores, 3))
print("5 folds의 개별 RMSE scores:", np.round(rmse_scores, 3))
print(f"5 folds의 평균 RMSE: {avg_rmse:.3f}")

5 folds의 개별 Negative MSE scores: [-0.02  -0.054 -0.109 -0.003 -0.053]
5 folds의 개별 RMSE scores: [0.141 0.233 0.33  0.052 0.229]
5 folds의 평균 RMSE: 0.197


In [15]:
# decomposition

from sklearn.datasets import load_iris
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

iris = load_iris()

df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
df_iris["target"] = iris.target
X = df_iris.drop("target", axis=1)
y = df_iris["target"]
iris_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(iris_scaled)
X_train, X_test, y_train, y_test = train_test_split(
    iris_pca, y, test_size=0.3, random_state=42
)

# random forest eval
rf = RandomForestRegressor(n_estimators=1000, random_state=0)
neg_mse_scores = cross_val_score(
    rf, X_train, y_train, scoring="neg_mean_squared_error", cv=5
)
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

print("5 folds의 개별 Negative MSE scores:", np.round(neg_mse_scores, 3))
print("5 folds의 개별 RMSE scores:", np.round(rmse_scores, 3))
print(f"5 folds의 평균 RMSE: {avg_rmse:.3f}")

5 folds의 개별 Negative MSE scores: [-0.052 -0.103 -0.057 -0.078 -0.133]
5 folds의 개별 RMSE scores: [0.229 0.322 0.238 0.28  0.365]
5 folds의 평균 RMSE: 0.287


Task3_0730. 상관도가 높은 BILL_AMT1 ~ BILL_AMT6 까지 6개의 속성에 대하여 2개의 컴포넌트로 PCA 변환하고 변동성을 알아보기 위하여 explained_variance_ratio_ 계산하세요.

In [39]:
import pandas as pd

df = pd.read_excel(
    r"D:\kdt_240424\workspace\M5_ML\data\credit_card.xls", header=1, sheet_name="Data"
)
df.rename(
    columns={"PAY_0": "PAY_1", "default payment next month": "default"}, inplace=True
)
y_target = df["default"]
X_features = df.drop(["default", "ID"], axis=1)
X_features

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,0,188948,192815,208365,88004,31237,15980,8500,20000,5003,3047,5000,1000
29996,150000,1,3,2,43,-1,-1,-1,-1,0,0,1683,1828,3502,8979,5190,0,1837,3526,8998,129,0,0
29997,30000,1,2,2,37,4,3,2,-1,0,0,3565,3356,2758,20878,20582,19357,0,0,22000,4200,2000,3100
29998,80000,1,3,1,41,1,-1,0,0,0,-1,-1645,78379,76304,52774,11855,48944,85900,3409,1178,1926,52964,1804


In [40]:
X_features_pca = X_features[
    [
        "BILL_AMT1",
        "BILL_AMT2",
        "BILL_AMT3",
        "BILL_AMT4",
        "BILL_AMT5",
        "BILL_AMT6",
    ]
]
X_features_pca

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
0,3913,3102,689,0,0,0
1,2682,1725,2682,3272,3455,3261
2,29239,14027,13559,14331,14948,15549
3,46990,48233,49291,28314,28959,29547
4,8617,5670,35835,20940,19146,19131
...,...,...,...,...,...,...
29995,188948,192815,208365,88004,31237,15980
29996,1683,1828,3502,8979,5190,0
29997,3565,3356,2758,20878,20582,19357
29998,-1645,78379,76304,52774,11855,48944


In [41]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_features_pca = pca.fit_transform(X_features_pca)

In [42]:
X_features.drop(
    [
        "BILL_AMT1",
        "BILL_AMT2",
        "BILL_AMT3",
        "BILL_AMT4",
        "BILL_AMT5",
        "BILL_AMT6",
    ], axis=1, inplace=True
)

In [43]:
X_features_pca = pd.DataFrame(X_features_pca)
X_features_pca

Unnamed: 0,0,1
0,-107289.378431,1830.698021
1,-103860.702650,-3334.488402
2,-68874.738469,2603.127512
3,-14775.024768,11322.393759
4,-66777.116291,-15897.814429
...,...,...
29995,200794.455046,148424.704969
29996,-102148.258619,-4429.879446
29997,-83686.979484,-22831.586379
29998,-1362.841368,-10490.953549


In [35]:
type(X_features_pca)

pandas.core.frame.DataFrame

In [45]:
pca.explained_variance_ratio_

array([0.9066601 , 0.05004773])

In [None]:
X_features

In [44]:
result_df = pd.concat([X_features, X_features_pca], axis=1)
result_df

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,0,1
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,0,689,0,0,0,0,-107289.378431,1830.698021
1,120000,2,2,2,26,-1,2,0,0,0,2,0,1000,1000,1000,0,2000,-103860.702650,-3334.488402
2,90000,2,2,2,34,0,0,0,0,0,0,1518,1500,1000,1000,1000,5000,-68874.738469,2603.127512
3,50000,2,2,1,37,0,0,0,0,0,0,2000,2019,1200,1100,1069,1000,-14775.024768,11322.393759
4,50000,1,2,1,57,-1,0,-1,0,0,0,2000,36681,10000,9000,689,679,-66777.116291,-15897.814429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,0,8500,20000,5003,3047,5000,1000,200794.455046,148424.704969
29996,150000,1,3,2,43,-1,-1,-1,-1,0,0,1837,3526,8998,129,0,0,-102148.258619,-4429.879446
29997,30000,1,2,2,37,4,3,2,-1,0,0,0,0,22000,4200,2000,3100,-83686.979484,-22831.586379
29998,80000,1,3,1,41,1,-1,0,0,0,-1,85900,3409,1178,1926,52964,1804,-1362.841368,-10490.953549


Task4_0730. 신용카드 데이터셋 전체 23개 속성에 대하여 6개의 컴포넌트를 가진 PCA 변환을 수행하고 모델은 RF, cv=3, scoring='accuracy'을 적용하여 cross_val_score()로 분류 예측 수행하세요.

In [47]:
import warnings
import pandas as pd

# FutureWarning 경고 메시지를 무시하도록 설정
warnings.simplefilter(action="ignore", category=FutureWarning)

In [50]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
from math import sqrt

df = pd.read_excel(
    r"D:\kdt_240424\workspace\M5_ML\data\credit_card.xls", header=1, sheet_name="Data"
)
df.rename(
    columns={"PAY_0": "PAY_1", "default payment next month": "default"}, inplace=True
)
y_target = df["default"]
X_features = df.drop(["default", "ID"], axis=1)

pca = PCA(n_components=6)
X_features_pca = pca.fit_transform(X_features)
X_features_pca = pd.DataFrame(X_features_pca)
X_train, X_test, y_train, y_test = train_test_split(X_features_pca, y_target, test_size=0.3, random_state=42)

rf = RandomForestRegressor(n_estimators=1000, random_state=0)
neg_mse_scores = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=3)
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

print("3 folds의 개별 Negative MSE scores:", np.round(neg_mse_scores, 3))
print("3 folds의 개별 RMSE scores:", np.round(rmse_scores, 3))
print(f"3 folds의 평균 RMSE: {avg_rmse:.3f}")

Traceback (most recent call last):
  File "d:\kdt_240424\workspace\M5_ML\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
  File "d:\kdt_240424\workspace\M5_ML\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "d:\kdt_240424\workspace\M5_ML\venv\lib\site-packages\sklearn\utils\_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "d:\kdt_240424\workspace\M5_ML\venv\lib\site-packages\sklearn\metrics\_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "d:\kdt_240424\workspace\M5_ML\venv\lib\site-packages\sklearn\metrics\_classification.py", line 112, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of binary and continuous targets

Traceback (most recent call last):
  File "d:\kdt_240424\workspace\

KeyboardInterrupt: 

In [54]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
from math import sqrt

df = pd.read_excel(
    r"D:\kdt_240424\workspace\M5_ML\data\credit_card.xls", header=1, sheet_name="Data"
)
df.rename(
    columns={"PAY_0": "PAY_1", "default payment next month": "default"}, inplace=True
)
y_target = df["default"]
X_features = df.drop(["default", "ID"], axis=1)

pca = PCA(n_components=6)
X_features_pca = pca.fit_transform(X_features)
X_train, X_test, y_train, y_test = train_test_split(
    X_features_pca, y_target, test_size=0.3, random_state=42
)
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=1000, random_state=0)
accuracy_scores = cross_val_score(rf, X_train, y_train, scoring="accuracy", cv=3)

print("3 folds의 개별 Accuracy scores:", np.round(accuracy_scores, 3))
print(f"3 folds의 평균 Accuracy: {np.mean(accuracy_scores):.3f}")

3 folds의 개별 Accuracy scores: [0.776 0.779 0.776]
3 folds의 평균 Accuracy: 0.777
