In [14]:
# 1. 데이터 탐색 및 시각화

import pandas as pd

# 데이터셋 로드
file_path = 'machine.data_update.csv'
data = pd.read_csv(file_path, header=None)

data.head()

# 2. 데이터 전처리

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score

# 컬럼 이름 할당 (UCI Machine Learning Repository 문서 참조)
column_names = ["VendorName", "ModelName", "MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP", "ERP"]
data.columns = column_names

# 'ERP' 열 제거 (예측에 사용하지 않음)
data = data.drop(columns=['ERP'])

# 특성과 목표 변수 분리
X = data.drop(columns=['PRP'])
y = data['PRP']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 전처리 파이프라인 정의
numerical_features = ["MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX"]
categorical_features = ["VendorName", "ModelName"]

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 모델 파이프라인 정의
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# 전처리 파이프라인 구조 확인
print(pipeline)

# 3. 모델 파이프라인 및 하이퍼파라미터 튜닝

from sklearn.ensemble import RandomForestRegressor

# RandomForestRegressor와 함께 모델 파이프라인 정의
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# GridSearchCV를 위한 파라미터 그리드 정의
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# GridSearchCV 실행
grid_search = GridSearchCV(pipeline_rf, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# GridSearchCV에서 찾은 최적의 파라미터 출력
best_params = grid_search.best_params_
print(best_params)

# 4. 모델 평가

# GridSearchCV에서 찾은 최적의 모델 가져오기
best_model = grid_search.best_estimator_

# 훈련 데이터 평가
y_train_pred = best_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# 테스트 데이터 평가
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# 평가 결과 출력
train_results = {
    'MSE': train_mse,
    'MAE': train_mae,
    'R2': train_r2
}

test_results = {
    'MSE': test_mse,
    'MAE': test_mae,
    'R2': test_r2
}

print(train_results, test_results)

# 5. 모델 개선

from sklearn.ensemble import GradientBoostingRegressor

# GradientBoostingRegressor와 함께 모델 파이프라인 정의
pipeline_gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# GridSearchCV를 위한 파라미터 그리드 정의
param_grid_gb = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# GridSearchCV 실행
grid_search_gb = GridSearchCV(pipeline_gb, param_grid_gb, cv=5, scoring='r2', n_jobs=-1, verbose=2)
grid_search_gb.fit(X_train, y_train)

# GridSearchCV에서 찾은 최적의 모델 가져오기
best_model_gb = grid_search_gb.best_estimator_

# 훈련 데이터 평가
y_train_pred_gb = best_model_gb.predict(X_train)
train_mse_gb = mean_squared_error(y_train, y_train_pred_gb)
train_mae_gb = mean_absolute_error(y_train, y_train_pred_gb)
train_r2_gb = r2_score(y_train, y_train_pred_gb)

# 테스트 데이터 평가
y_test_pred_gb = best_model_gb.predict(X_test)
test_mse_gb = mean_squared_error(y_test, y_test_pred_gb)
test_mae_gb = mean_absolute_error(y_test, y_test_pred_gb)
test_r2_gb = r2_score(y_test, y_test_pred_gb)

# 개선된 모델의 평가 결과 출력
train_results_gb = {
    'MSE': train_mse_gb,
    'MAE': train_mae_gb,
    'R2': train_r2_gb
}

test_results_gb = {
    'MSE': test_mse_gb,
    'MAE': test_mae_gb,
    'R2': test_r2_gb
}

print(train_results_gb, test_results_gb)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['MYCT', 'MMIN', 'MMAX',
                                                   'CACH', 'CHMIN', 'CHMAX']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_un

432 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jk026\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jk026\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\jk026\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
  

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'MYCT'