In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [2]:
# Load the five training CSVs: four feature-set variants (score/one-hot encodings,
# each with "all" or "structural" columns) plus the training target (y_train).
df1, df2, df3, df4, df5 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../01_data/02_processed/sandbox/variants/X_train_score_all.csv",
        "../01_data/02_processed/sandbox/variants/X_train_score_structural.csv",
        "../01_data/02_processed/sandbox/variants/X_train_onehot_all.csv",
        "../01_data/02_processed/sandbox/variants/X_train_onehot_structural.csv",
        "../01_data/02_processed/sandbox/splits/y_train.csv",
    )
)

# Preview the target frame.
df5.head()

Unnamed: 0,Delivery_Time
0,70
1,160
2,135
3,80
4,100


In [3]:
# Collapse the single-column target frame into a 1-D Series: (n, 1) -> (n,).
# Only the column axis is squeezed, so a one-row frame cannot collapse to a scalar.
# The isinstance guard keeps this cell safe to re-run once df5 is already a Series.
if isinstance(df5, pd.DataFrame):
    df5 = df5.squeeze(axis="columns")

# A 2-D target would silently turn the evaluation into multi-output regression.
assert df5.ndim == 1, "y_train must contain exactly one target column"

# Collect the feature variants in a dict for the evaluation loop.
# Each value is an (X, y) tuple: X = input features, y = target.
datasets1 = {
    "score_all": (df1, df5),
    "score_structural": (df2, df5),
    "onehot_all": (df3, df5),
    "onehot_structural": (df4, df5),
}

In [4]:
# Evaluate a random-forest regressor on each of the four feature variants
# (score_all, score_structural, onehot_all, onehot_structural).
for name in datasets1:
    X, y = datasets1[name]

    # Fresh, identically seeded model for every variant.
    rf = RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
    )

    # 5-fold cross-validation. scikit-learn scorers follow "higher is better",
    # so RMSE is returned negated ('neg_root_mean_squared_error').
    scores = cross_val_score(
        rf,
        X,
        y,
        cv=5,
        scoring="neg_root_mean_squared_error",
    )

    # Flip the sign to get the mean RMSE as a positive number.
    rmse = -np.mean(scores)

    # Report the variant name with its mean RMSE.
    print(f"{name}: {rmse:.3f}")

score_all: 22.911
score_structural: 46.211
onehot_all: 23.201
onehot_structural: 46.334


In [5]:
# Load the base training features; Weather and Traffic are still raw
# categorical columns at this point.
x_train_path = "../01_data/02_processed/sandbox/splits/X_train.csv"
df6 = pd.read_csv(x_train_path)

# Preview the first five rows.
df6.head(5)

Unnamed: 0,Agent_Age,Agent_Rating,Weather,Traffic,Distance,Pickup_Wait_Time,Vehicle_scooter,Vehicle_van,Area_Metropolitian,Area_Other,...,Category_Home,Category_Jewelry,Category_Kitchen,Category_Outdoors,Category_Pet Supplies,Category_Shoes,Category_Skincare,Category_Snacks,Category_Sports,Category_Toys
0,39,5.0,Sunny,Low,3.064486,5.0,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
1,38,4.1,Stormy,Medium,4.538408,10.0,True,False,True,False,...,False,False,False,False,False,False,False,True,False,False
2,26,4.9,Sunny,Low,17.297866,10.0,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
3,20,4.8,Stormy,Low,13.611172,5.0,False,True,True,False,...,False,True,False,False,False,False,False,False,False,False
4,29,4.7,Cloudy,Low,10.707345,5.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# Encode the two raw categorical columns of df6:
#   * Weather -> one-hot columns, with "Weather_Sunny" dropped as the reference level
#   * Traffic -> ordinal score (Low=1 < Medium=2 < High=3 < Jam=4)
# The raw columns are removed afterwards.
traffic_map = {"Low": 1, "Medium": 2, "High": 3, "Jam": 4}

# "Traffic_score" is only ever added by this cell, so its presence means the
# transformation already ran; skipping keeps the cell safe to re-run.
if "Traffic_score" not in df6.columns:
    # One-hot encode the weather variable.
    weather_dummies = pd.get_dummies(df6["Weather"], prefix="Weather")
    weather_dummies = weather_dummies.drop(columns=["Weather_Sunny"])

    # Convert the traffic condition to a score and fail loudly on labels that
    # are missing from traffic_map (Series.map would otherwise yield NaN).
    traffic_score = df6["Traffic"].map(traffic_map)
    unmapped = df6.loc[traffic_score.isna(), "Traffic"].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Traffic values missing from traffic_map: {list(unmapped)}")

    # Column order is kept as: original columns, weather dummies, Traffic_score.
    df6 = (
        pd.concat([df6, weather_dummies], axis=1)
        .assign(Traffic_score=traffic_score)
        .drop(columns=["Traffic", "Weather"])
    )

df6.head()

Unnamed: 0,Agent_Age,Agent_Rating,Distance,Pickup_Wait_Time,Vehicle_scooter,Vehicle_van,Area_Metropolitian,Area_Other,Area_Semi_Urban,Category_Apparel,...,Category_Skincare,Category_Snacks,Category_Sports,Category_Toys,Weather_Cloudy,Weather_Fog,Weather_Sandstorms,Weather_Stormy,Weather_Windy,Traffic_score
0,39,5.0,3.064486,5.0,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1
1,38,4.1,4.538408,10.0,True,False,True,False,False,False,...,False,True,False,False,False,False,False,True,False,2
2,26,4.9,17.297866,10.0,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1
3,20,4.8,13.611172,5.0,False,True,True,False,False,False,...,False,False,False,False,False,False,False,True,False,1
4,29,4.7,10.707345,5.0,False,False,True,False,False,False,...,False,False,False,False,True,False,False,False,False,1


In [7]:
# Collect the two candidates in a dict of (X, y) tuples:
# df1 is evaluated under the label "weather_score", df6 under "weather_onehot".
datasets2 = dict(
    weather_score=(df1, df5),
    weather_onehot=(df6, df5),
)

# Run the same random-forest regression (5-fold CV) on both datasets
# and print the mean RMSE of each.
for name, (X, y) in datasets2.items():
    rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    scores = cross_val_score(
        estimator=rf,
        X=X,
        y=y,
        cv=5,
        scoring="neg_root_mean_squared_error",
    )
    # The scorer returns negated RMSE; flip the sign for reporting.
    rmse = -np.mean(scores)
    print(f"{name}: {rmse:.3f}")

weather_score: 22.911
weather_onehot: 23.043


In [8]:
# Load the test-side files needed for the final export:
# df7 = test features (score_all variant), df8 = test target.
df7, df8 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../01_data/02_processed/sandbox/variants/X_test_score_all.csv",
        "../01_data/02_processed/sandbox/splits/y_test.csv",
    )
)

In [9]:
# Save the finalized datasets into the production folder.
# The folder is created first so the export does not fail when it is missing.
production_dir = Path("../01_data/02_processed/production")
production_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> object to write.
exports = {
    "X_train_nonlinear.csv": df1,
    "y_train.csv": df5,
    "X_test_nonlinear.csv": df7,
    "y_test.csv": df8,
}

for filename, frame in exports.items():
    # index=False keeps the pandas index out of the file; utf-8-sig writes a BOM.
    frame.to_csv(production_dir / filename, index=False, encoding="utf-8-sig")