In [2]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# 디렉토리 생성 함수
def ensure_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


In [4]:
# CSV 파일 경로 설정
UPLOAD_FOLDER = './uploads'
# CSV 파일 경로 설정
csv_filename = './uploads/24-learning.csv'
target_column = "Target"  # 타겟 컬럼 이름

try:
    df = pd.read_csv(csv_filename)
    print("CSV 데이터 미리보기:")
    print(df.head())
except Exception as e:
    print(f"CSV 파일 읽기 오류: {str(e)}")
    raise

# 타겟 컬럼 확인
if target_column not in df.columns:
    raise ValueError(f"Target 컬럼 '{target_column}'이 존재하지 않습니다.")

# 결측값 처리
df = df.fillna(0)

# 입력과 타겟 데이터 분리
X = df.drop(columns=[target_column])
y = df[target_column]

# 스케일러 생성
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# 스케일링 적용
X_scaled = scaler_X.fit_transform(X.to_numpy())
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1, 1)).flatten()

# 데이터 분할
val_ratio = 0.2
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=val_ratio, random_state=42)

print(f"학습 데이터 크기: {X_train.shape}, 검증 데이터 크기: {X_val.shape}")



CSV 데이터 미리보기:
   Target  att1  att2  att3  att4  att5  att6  att7  att8  att9  att10  att11  \
0     335   300    20    40   0.1   100     1    10  27.2  38.0   1800    0.5   
1     413   200    20    40   0.1   130     1    10  26.6  44.0   1740    0.5   
2     424   200    20    40   0.1   130     1    10  26.6  44.0   1800    0.5   
3     516   250    20    45   0.1   110     1    10  24.4  35.0   1800    0.5   
4     536   250    20    30   0.1   110     1    10  24.4  35.0   1800    0.5   

   att12  att13  att14  att15  
0  9.286    100   2700   7200  
1  9.286    100   2700   7200  
2  9.286    100   2700   7200  
3  9.286    100   2700   7200  
4  9.286    100   2700   7200  
학습 데이터 크기: (99, 15), 검증 데이터 크기: (25, 15)


In [5]:
# 모델 학습 및 저장 함수
def train_and_save_model(model, model_name, X_train, y_train, X_val, y_val):
    # 모델 학습
    print(f"모델 학습 시작: {model_name}")
    model.fit(X_train, y_train)

    # 예측
    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)

    # 평가
    train_mse = mean_squared_error(y_train, train_predictions)
    train_r2 = r2_score(y_train, train_predictions)
    val_mse = mean_squared_error(y_val, val_predictions)
    val_r2 = r2_score(y_val, val_predictions)

    print(f"[{model_name}] 학습 완료")
    print(f"훈련 MSE: {train_mse:.4f}, R2: {train_r2:.4f}")
    print(f"검증 MSE: {val_mse:.4f}, R2: {val_r2:.4f}")

    # 모델 저장 경로
    save_dir = './saved_models'
    ensure_dir(save_dir)
    model_path = os.path.join(save_dir, f"{model_name}.pkl")

    # 모델 저장
    try:
        joblib.dump(model, model_path)
        print(f"모델 저장 완료: {model_path}")
    except Exception as e:
        print(f"모델 저장 실패: {str(e)}")

    return {
        'model_name': model_name,
        'train_mse': train_mse,
        'train_r2': train_r2,
        'val_mse': val_mse,
        'val_r2': val_r2,
        'model_path': model_path
    }


In [6]:
# 모델 정의
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
gradient_boosting = GradientBoostingRegressor(n_estimators=100, random_state=42)

# 랜덤포레스트 학습 및 저장
rf_results = train_and_save_model(random_forest, "RandomForestRegressor", X_train, y_train, X_val, y_val)

# Gradient Boosting Regressor 학습 및 저장
gb_results = train_and_save_model(gradient_boosting, "GradientBoostingRegressor", X_train, y_train, X_val, y_val)


모델 학습 시작: RandomForestRegressor
[RandomForestRegressor] 학습 완료
훈련 MSE: 0.0010, R2: 0.9885
검증 MSE: 0.0062, R2: 0.9363
모델 저장 완료: ./saved_models/RandomForestRegressor.pkl
모델 학습 시작: GradientBoostingRegressor
[GradientBoostingRegressor] 학습 완료
훈련 MSE: 0.0006, R2: 0.9937
검증 MSE: 0.0064, R2: 0.9348
모델 저장 완료: ./saved_models/GradientBoostingRegressor.pkl


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 데이터 생성
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 설정
model = GradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.1,
    n_estimators=100,
    subsample=1,
    criterion="friedman_mse",
    min_samples_split=2,
    min_samples_leaf=1
)

# 모델 학습
model.fit(X_train, y_train)

# 평가
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"테스트 MSE: {mse:.4f}")


In [None]:
X.shape

In [None]:
y.shape