In [22]:
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [23]:
# 1. 數據加載與處理
def load_and_preprocess_data(file_path):
    # 讀取資料
    df = pd.read_csv(file_path)

    # 民國年轉西元年
    def convert_date(x):
        parts = x.split('/')
        year = int(parts[0]) + 1911  # 民國年轉換
        return f"{year}-{parts[1]}-{parts[2]}"

    df['日期'] = df['日期'].apply(convert_date)
    df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d')

    # 添加時間特徵
    df['天數'] = (df['日期'] - df['日期'].min()).dt.days
    df['月'] = df['日期'].dt.month
    df['日'] = df['日期'].dt.day
    df['星期'] = df['日期'].dt.weekday

    # 添加滯後特徵
    for i in range(1, 8):
        df[f'滯後{i}日平均價'] = df['平均價'].shift(i)

    # 創建目標變量: 7天後的平均價
    df['7日後平均價'] = df['平均價'].shift(-7)

    # 去除無效數據
    df = df.dropna()

    return df

In [24]:
# 2. 訓練與驗證
def train_and_evaluate_model(df):
    # 分割數據集
    train_data = df[df['日期'] < datetime(2022, 1, 1)]
    val_data = df[(df['日期'] >= datetime(2022, 1, 1)) & (df['日期'] < datetime(2024, 1, 1))]
    test_data = df[df['日期'] >= datetime(2024, 1, 1)]

    # 特徵與目標
    features = ['天數', '月', '日', '星期'] + [f'滯後{i}日平均價' for i in range(1, 8)]
    X_train, y_train = train_data[features], train_data['7日後平均價']
    X_val, y_val = val_data[features], val_data['7日後平均價']
    X_test = test_data[features]

    # 訓練 Random Forest 模型
    model = XGBRegressor(
        n_estimators=100,        # 樹的數量
        max_depth=6,             # 樹的最大深度
        learning_rate=0.1,       # 學習率
        subsample=0.8,           # 每棵樹使用的數據比例
        colsample_bytree=0.8,    # 每棵樹使用的特徵比例
        gamma=0,                 # 節點分裂的損失減少要求
        reg_alpha=0,             # L1正則化
        reg_lambda=1,            # L2正則化
        random_state=42          # 隨機種子
    )
    model.fit(X_train, y_train)

    # 驗證模型
    y_val_pred = model.predict(X_val)
    val_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    print(f"Validation RMSE: {val_rmse}")

    return model, X_test, test_data

In [25]:
# 3. 預測與輸出
def predict_and_save_results(model, X_test, test_data, output_path):
    # 預測測試集
    test_data['預測價'] = model.predict(X_test)

     # 檢查是否有真實值可用於計算 RMSE
    if '7日後平均價' in test_data.columns:
        # 計算 RMSE
        rmse = mean_squared_error(test_data['7日後平均價'], test_data['預測價'], squared=False)
        print(f"Test RMSE: {rmse}")
    else:
        print("Test dataset does not contain true values for RMSE calculation.")

    # 保存結果
    test_data[['日期', '預測價']].to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

In [26]:
# 主程式
file_path = "data/甘藍初秋_台北一日交易行情.csv"
output_path = "output_XGBoost.csv"

# 數據加載與處理
df = load_and_preprocess_data(file_path)

# 訓練與驗證
model, X_test, test_data = train_and_evaluate_model(df)

# 預測與保存結果
predict_and_save_results(model, X_test, test_data, output_path)

Validation RMSE: 8.953194425703565
Test RMSE: 11.081864595657766
Predictions saved to output_XGBoost.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['預測價'] = model.predict(X_test)
