In [3]:
# 필요한 패키지 불러오기
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [5]:
# 데이터 저장 경로 설정
DATA_PATH = '/tmp/airflow_diabetes'
os.makedirs(DATA_PATH, exist_ok=True)

In [7]:
# 1️⃣ Feature Engineering
data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

rf_selector = RandomForestRegressor()
rf_selector.fit(X, y)
importances = rf_selector.feature_importances_
top_indices = np.argsort(importances)[-4:]

X_selected = X.iloc[:, top_indices]
df_selected = pd.concat([X_selected, pd.Series(y, name='target')], axis=1)
df_selected.to_csv(os.path.join(DATA_PATH, 'selected_features.csv'), index=False)

print("✅ Feature Engineering 완료")
print("선택된 features:", X_selected.columns.tolist())
print(df_selected.head())

✅ Feature Engineering 완료
선택된 features: ['s6', 'bp', 'bmi', 's5']
         s6        bp       bmi        s5  target
0 -0.017646  0.021872  0.061696  0.019907   151.0
1 -0.092204 -0.026328 -0.051474 -0.068332    75.0
2 -0.025930 -0.005670  0.044451  0.002861   141.0
3 -0.009362 -0.036656 -0.011595  0.022688   206.0
4 -0.046641  0.021872 -0.036385 -0.031988   135.0


In [9]:
# 2️⃣ 모델 학습
X = df_selected.drop('target', axis=1)
y = df_selected['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf_model = RandomForestRegressor()
gb_model = GradientBoostingRegressor()

rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

pickle.dump((rf_model, X_test, y_test), open(os.path.join(DATA_PATH, 'rf_model.pkl'), 'wb'))
pickle.dump((gb_model, X_test, y_test), open(os.path.join(DATA_PATH, 'gb_model.pkl'), 'wb'))

print("\n✅ 모델 학습 완료")



✅ 모델 학습 완료


In [11]:
pip install -U scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [13]:
from sklearn.metrics import mean_squared_error
import numpy as np


In [19]:
# 3️⃣ 모델 평가 및 선택

rf_model, X_test_rf, y_test_rf = pickle.load(open(os.path.join(DATA_PATH, 'rf_model.pkl'), 'rb'))
gb_model, _, _ = pickle.load(open(os.path.join(DATA_PATH, 'gb_model.pkl'), 'rb'))

# rf_rmse = mean_squared_error(y_test_rf, rf_model.predict(X_test_rf), squared=False)
# gb_rmse = mean_squared_error(y_test_rf, gb_model.predict(X_test_rf), squared=False)


rf_mse = mean_squared_error(y_test_rf, rf_model.predict(X_test_rf))
rf_rmse = np.sqrt(rf_mse)

gb_mse = mean_squared_error(y_test_rf, gb_model.predict(X_test_rf))
gb_rmse = np.sqrt(gb_mse)
best_model = rf_model if rf_rmse < gb_rmse else gb_model
with open(os.path.join(DATA_PATH, 'best_model.pkl'), 'wb') as f:
    pickle.dump(best_model, f)

print("\n✅ 모델 비교 완료")
print(f"Random Forest RMSE: {rf_rmse:.4f}")
print(f"Gradient Boosting RMSE: {gb_rmse:.4f}")
print("선택된 모델:", "Random Forest" if best_model == rf_model else "Gradient Boosting")



✅ 모델 비교 완료
Random Forest RMSE: 60.8759
Gradient Boosting RMSE: 58.6012
선택된 모델: Gradient Boosting


In [21]:

# 4️⃣ 추론 (Inference)
df_infer = pd.read_csv(os.path.join(DATA_PATH, 'selected_features.csv')).drop('target', axis=1)
model = pickle.load(open(os.path.join(DATA_PATH, 'best_model.pkl'), 'rb'))
preds = model.predict(df_infer)

df_preds = pd.DataFrame(preds, columns=['prediction'])
df_preds.to_csv(os.path.join(DATA_PATH, 'inference.csv'), index=False)

print("\n✅ 추론 완료")
print(df_preds.head())



✅ 추론 완료
   prediction
0  193.199892
1   86.014155
2  167.469780
3  181.934918
4   99.754090
