In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from scipy.stats import uniform

# Load the dataset and discard every row that contains a missing value.
df = pd.read_csv('machine.data_update.csv').dropna()

# 'PRP' is the regression target; every other column is a feature.
# pd.get_dummies one-hot encodes the categorical/object columns, dropping the
# first level of each so the indicator columns are not perfectly collinear.
y = df['PRP']
X = pd.get_dummies(df.drop(columns='PRP'), drop_first=True)

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: ordinary least-squares linear regression.
# LinearRegression must be imported from sklearn.linear_model in the notebook's
# import block; without that import this section raises NameError on a fresh
# kernel.
# Stages: standardise the features -> polynomial expansion -> regressor.
# With degree=1 the expansion only prepends a constant bias column and leaves
# the features otherwise unchanged.
linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=1)),
    ('model', LinearRegression())
])

linear_pipeline.fit(X_train, y_train)

# Predictions on the data the model was fitted on and on the held-out split.
y_train_pred_linear = linear_pipeline.predict(X_train)
y_test_pred_linear = linear_pipeline.predict(X_test)

# MSE, MAE and R^2 on the training split.
mse_train_linear, mae_train_linear, r2_train_linear = (
    metric(y_train, y_train_pred_linear)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# The same three metrics on the test split.
mse_test_linear, mae_test_linear, r2_test_linear = (
    metric(y_test, y_test_pred_linear)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# 5-fold cross-validated R^2 on the training data; cross_val_score clones and
# refits the pipeline for each fold, so the fitted pipeline above is untouched.
cv_scores_linear = cross_val_score(linear_pipeline, X_train, y_train, cv=5, scoring='r2')

print("Linear Regression 모델 평가 완료")

# Improved model: Lasso regression tuned with a randomized hyper-parameter search.
# Stages: standardise the features -> polynomial expansion -> Lasso regressor.
lasso_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('model', Lasso(max_iter=10000)),
])

# Search space: polynomial degree 1 or 2, and a regularisation strength drawn
# uniformly from [loc, loc + scale] = [0.001, 10.001].
param_dist_lasso = dict(
    poly__degree=[1, 2],
    model__alpha=uniform(loc=0.001, scale=10),
)

# 10 sampled candidates, each scored by 5-fold cross-validated R^2, run on all
# available cores; the fixed seed makes the sampled candidates reproducible.
random_search_lasso = RandomizedSearchCV(
    lasso_pipeline,
    param_distributions=param_dist_lasso,
    n_iter=10,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_lasso.fit(X_train, y_train)

print(f'Best Parameters: {random_search_lasso.best_params_}')
# With the default refit=True, best_estimator_ is the winning pipeline refitted
# on the whole training set.
best_model_lasso = random_search_lasso.best_estimator_

# Predictions on the training split and on the held-out split.
y_train_pred_lasso = best_model_lasso.predict(X_train)
y_test_pred_lasso = best_model_lasso.predict(X_test)

# MSE, MAE and R^2 on the training split.
mse_train_lasso, mae_train_lasso, r2_train_lasso = (
    metric(y_train, y_train_pred_lasso)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# The same three metrics on the test split.
mse_test_lasso, mae_test_lasso, r2_test_lasso = (
    metric(y_test, y_test_pred_lasso)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# 5-fold cross-validated R^2 of the selected configuration on the training data.
cv_scores_lasso = cross_val_score(best_model_lasso, X_train, y_train, cv=5, scoring='r2')

print("Lasso Regression 모델 평가 완료")

# Collect the train/test metrics of both models in one long-format table:
# one row per (model, data split) pair, one column per metric.
results = pd.DataFrame(
    [
        ('Linear Regression', 'Train', mse_train_linear, mae_train_linear, r2_train_linear),
        ('Linear Regression', 'Test', mse_test_linear, mae_test_linear, r2_test_linear),
        ('Best Estimator', 'Train', mse_train_lasso, mae_train_lasso, r2_train_lasso),
        ('Best Estimator', 'Test', mse_test_lasso, mae_test_lasso, r2_test_lasso),
    ],
    columns=['Model', 'Data', 'MSE', 'MAE', 'R^2'],
)

print(results)

# Append cross-validation results to the metrics table.
# The fold scores computed earlier (cv_scores_linear / cv_scores_lasso) are R^2
# values, so they may only populate the 'R^2' column. MSE and MAE are
# cross-validated here with their own scorers on the same 5 folds.
# scikit-learn exposes error metrics negated ("neg_*", so that higher is
# better); the sign is flipped back to obtain ordinary positive errors.
cv_rows = []
for cv_model_name, cv_estimator, cv_r2_scores in (
    ('Linear Regression', linear_pipeline, cv_scores_linear),
    ('Best Estimator', best_model_lasso, cv_scores_lasso),
):
    cv_mse_scores = -cross_val_score(
        cv_estimator, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )
    cv_mae_scores = -cross_val_score(
        cv_estimator, X_train, y_train, cv=5, scoring='neg_mean_absolute_error'
    )
    cv_rows.append({
        'Model': cv_model_name,
        'Data': 'Cross Validation',
        'MSE': cv_mse_scores.mean(),
        'MAE': cv_mae_scores.mean(),
        'R^2': cv_r2_scores.mean(),
    })

cv_results = pd.DataFrame(cv_rows)

results = pd.concat([results, cv_results], ignore_index=True)
print(results)

# Visualise the metrics table: one bar chart per metric, bars grouped by model
# and coloured by data split.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, metric in zip(axes, ('MSE', 'MAE', 'R^2')):
    sns.barplot(x='Model', y=metric, hue='Data', data=results, ax=ax)
    ax.set_title(f'{metric} Comparison')

plt.show()


Linear Regression 모델 평가 완료
Fitting 5 folds for each of 10 candidates, totalling 50 fits
