In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Load the training split; keep the raw frame untouched and work on a copy.
df_origin_train = pd.read_csv("../../data/delivery_eta_train.csv")
df_train = df_origin_train.copy()
# Preview the first rows of the raw training frame.
df_origin_train.head()

In [None]:
# Load the test split; keep the raw frame untouched and work on a copy.
df_origin_test = pd.read_csv("../../data/delivery_eta_test.csv")
df_test = df_origin_test.copy()
# Preview the first rows of the working test copy.
df_test.head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Categorical columns that are replaced by ordinal codes before correlating.
cols = ["Weather", "Time_Slot", "Vehicle", "Road_Type"]
encoder = OrdinalEncoder()

# Learn the category -> code mapping on the training split only, then reuse
# the fitted mapping on the test split so both splits share the same codes.
df_train[cols] = encoder.fit_transform(df_train[cols])
df_test[cols] = encoder.transform(df_test[cols])

# Pairwise correlation of the numeric columns. numeric_only=True keeps this
# from raising if any column outside `cols` is still non-numeric (pandas >= 2.0
# no longer drops such columns silently); the result is unchanged when every
# column is already numeric.
corr_matrix = df_train.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the full correlation matrix on an explicit axes object.
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(corr_matrix, annot=True, cmap="RdBu", fmt=".2f", ax=ax)
# Tilt the x tick labels so long column names do not overlap.
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Correlation of every column with the target (the target's own entry included).
target_corr = corr_matrix["Delivery_Time_min"]

# Keep only the entries whose absolute correlation reaches 0.1, strongest
# positive correlation first.
significant_features = target_corr.loc[target_corr.abs().ge(0.1)].sort_values(
    ascending=False
)

# Column names of the retained entries, in the sorted order above.
features = significant_features.index.tolist()

print("상관계수 절대값 0.1 이상인 주요 변수:")
print(significant_features)

# Single-column heatmap of the retained correlations.
fig, ax = plt.subplots(figsize=(6, 8))
sns.heatmap(
    significant_features.to_frame(),
    annot=True,
    cmap="RdBu",
    fmt=".2f",
    ax=ax,
)
ax.set_title("Significant Features (|r| >= 0.1)")
plt.show()

In [None]:
# Column names of the strongly correlated variables (the target included,
# since its correlation with itself passes the threshold).
features = significant_features.index.tolist()

# Pairwise scatter plots of the selected variables.
# No `hue` is passed on purpose: the target is continuous, and using it as hue
# makes seaborn (a) drop it from the grid variables and (b) draw one diagonal
# KDE per distinct target value, which is very slow and yields an unreadable
# legend. Without `hue` the target appears as its own row/column instead and
# the diagonal falls back to histograms.
sns.pairplot(
    data=df_train[features],
    plot_kws={"alpha": 0.4},
)
plt.show()

In [None]:
# Numeric predictors (scatter plots suit these).
num_features = ["Distance_km", "Prep_Time_min", "Rain_mm"]
# Categorical predictors (box plots suit these).
cat_features = ["Weather", "Rush_Hour"]

# Scatter plot with a fitted regression line for each numeric predictor.
for col in num_features:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.regplot(
        data=df_train,
        x=col,
        y="Delivery_Time_min",
        scatter_kws={"alpha": 0.2},
        ax=ax,
    )
    ax.set_title(f"Numerical: {col} vs Delivery Time")
    plt.show()

# Two-dimensional KDE of each numeric predictor against the target.
for col in num_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.kdeplot(
        data=df_train,
        x=col,
        y="Delivery_Time_min",
        fill=True,  # fill the contour bands with colour
        thresh=0.05,  # hide the lowest-density region for a cleaner plot
        levels=10,  # number of contour levels
        cmap="Blues",
        ax=ax,
    )
    ax.set_title(f"Density Analysis: {col} vs Delivery Time")
    plt.show()

# Box plot of the target for each level of the categorical predictors.
for col in cat_features:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=df_train, x=col, y="Delivery_Time_min", ax=ax)
    ax.set_title(f"Categorical: {col} Distribution")
    plt.show()

In [None]:
# Predictor columns: every strongly correlated variable except the target.
# The list is rebuilt from `significant_features` instead of being mutated with
# list.remove(), so re-running this cell cannot raise ValueError once the
# target has already been removed.
features = [
    name for name in significant_features.index if name != "Delivery_Time_min"
]

# Design matrix and target for the training split.
X_train = df_train[features]
y_train = df_train["Delivery_Time_min"]

# Same columns for the held-out test split.
X_test = df_test[features]
y_test = df_test["Delivery_Time_min"]

In [None]:
from sklearn.linear_model import LinearRegression

from lib.vizkit import evaluate_regression_model

# Baseline: linear regression fitted on the selected features
# (fit() returns the estimator, so construction and fitting are chained).
linear_model = LinearRegression().fit(X_train, y_train)
pred = linear_model.predict(X_test)

# Hand the true and predicted test targets to the project's evaluation helper.
evaluate_regression_model(y_test, pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor

from lib.vizkit import evaluate_regression_model

# Random forest with capped depth and a minimum leaf size
# (fit() returns the estimator, so construction and fitting are chained).
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,  # cap the depth at 10; deeper trees tend to overfit
    min_samples_leaf=5,  # require at least 5 samples per leaf to generalize
    random_state=3333,
).fit(X_train, y_train)
pred = rf_model.predict(X_test)

# Hand the true and predicted test targets to the project's evaluation helper.
evaluate_regression_model(y_test, pred)

In [None]:
from xgboost import XGBRegressor

from lib.vizkit import evaluate_regression_model

# Gradient-boosted trees with a fixed seed
# (fit() returns the estimator, so construction and fitting are chained).
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=3333,
).fit(X_train, y_train)
pred = xgb_model.predict(X_test)

# Hand the true and predicted test targets to the project's evaluation helper.
evaluate_regression_model(y_test, pred)

In [None]:
from lib.vizkit import show_importances

# Pass the fitted XGBoost model and the predictor names to the project helper
# (presumably plots per-feature importances; confirm in lib.vizkit).
show_importances(xgb_model, features)


In [None]:
from lib.vizkit import best_model, compare_models

# Unfitted candidate models keyed by display name.
# NOTE(review): this Random Forest uses default depth/leaf settings, unlike the
# `rf_model` configured earlier with max_depth=10 and min_samples_leaf=5.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=3333),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=3333),
}

# Presumably fits every candidate and returns the best one's test predictions;
# confirm against lib.vizkit.compare_models.
best_pred = compare_models(models, X_train, y_train, X_test, y_test)
# Hand the true test targets and the returned predictions to the project helper.
best_model(y_test, best_pred)
