In [1]:
import gurobipy as gp
from gurobipy import GRB
import gurobipy_pandas as gppd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gurobipy import min_, max_
from scipy.stats import multivariate_normal, norm
import pickle
import os
import glob
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from datetime import datetime
import matplotlib.pyplot as plt
import itertools
import math
import seaborn as sns


# Gurobi Web License Service (WLS) credentials.
# SECURITY: these values used to be hard-coded in this cell. Any key that was
# committed or shared that way should be treated as leaked and rotated.
# They are now read from environment variables, so the notebook itself holds
# no secrets. Set GRB_WLSACCESSID, GRB_WLSSECRET and GRB_LICENSEID before
# starting the kernel; a missing variable raises KeyError here.
params = {
    "WLSACCESSID": os.environ["GRB_WLSACCESSID"],
    "WLSSECRET": os.environ["GRB_WLSSECRET"],
    # The original cell passed the license id as an integer, so convert it.
    "LICENSEID": int(os.environ["GRB_LICENSEID"]),
}

env = gp.Env(params=params)
model = gp.Model(env=env)

Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2563044
Academic license 2563044 - for non-commercial use only - registered to 11___@g.nccu.edu.tw


# Settings


In [2]:
#####################

# Economic parameters
price = 1000
cost = 300
salvage_value = 0
holding_cost = 0

# Name prefix that embeds the holding cost
model_prefix = f"med_with_holding_cost_{holding_cost}"

#####################

# Data sizing
CHUNK_SIZE = 100
data_size = 3 * CHUNK_SIZE
train_size = 0.5
testing_size = 0.5

# Problem constants
T = 10
service_level = 0.95  # service level
M = 5_000_000
LASSO_BETA = 100

# Candidate grids
ASSIGNED_FS = np.arange(0.1, 1.0, 0.1)
ASSIGNED_TS = [t for t in range(2, T)]  # 2 through T-1

# Fix NumPy's global random state
np.random.seed(0)

# Gurobi Model Constants
THREADS = 12
TIME_LIMIT = 20_000
MIPGAP = 0.01

# Minute-resolution timestamp (YYYYMMDDHHMM) taken when this cell runs
CURRENT_TIMESTAMP = int(datetime.now().strftime("%Y%m%d%H%M"))

# Utils


## Models' Parameters

In [480]:
def save_model_parameters(
    name: str,
    alpha_values=None,
    beta_values=None,
    f_values=None,
    tau_values=None,
    data_size=data_size,
    current_timestamp=CURRENT_TIMESTAMP,
):
    """Pickle the supplied model parameters into the ``models`` directory.

    Only arguments that are not ``None`` are stored. They are written as a
    dict with the keys ``"alpha"``, ``"beta"``, ``"f_values"`` and
    ``"tau_values"`` to ``models/{name}_{data_size}_{current_timestamp}.pkl``.
    If every value is ``None``, no file is written and a notice is printed.

    Note: the defaults of ``data_size`` and ``current_timestamp`` are bound
    when this ``def`` is executed, not when the function is called.
    """
    os.makedirs("models", exist_ok=True)

    candidates = (
        ("alpha", alpha_values),
        ("beta", beta_values),
        ("f_values", f_values),
        ("tau_values", tau_values),
    )
    payload = {key: value for key, value in candidates if value is not None}

    # Nothing to store: report it and write no file.
    if not payload:
        print("No parameters provided to save.")
        return

    target = f"models/{name}_{data_size}_{current_timestamp}.pkl"
    with open(target, "wb") as handle:
        pickle.dump(payload, handle)
    print(
        f"Model parameters saved as models/{name}_{data_size}_{current_timestamp}.pkl"
    )

In [481]:
def delete_model_parameters(name: str, data_size: int, current_timestamp=None):
    """Delete the pickle file ``models/{name}_{data_size}_{current_timestamp}.pkl``.

    :param name: model name used in the file name
    :param data_size: data size used in the file name
    :param current_timestamp: timestamp used in the file name. When omitted,
        the notebook-level ``CURRENT_TIMESTAMP`` is used, which is what this
        function always did before. Passing it explicitly allows files from
        an earlier run to be removed.
    :return: None. Prints whether the file was deleted or did not exist.
    """
    if current_timestamp is None:
        current_timestamp = CURRENT_TIMESTAMP

    # Build the path of the file
    file_path = f"models/{name}_{data_size}_{current_timestamp}.pkl"

    # Remove directly and handle the missing-file case, instead of checking
    # for existence first: this avoids a race between the check and the removal.
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print(f"File '{file_path}' does not exist.")
    else:
        print(f"Model parameters file '{file_path}' has been deleted.")

In [482]:
def show_models(model_prefix):
    """Print the contents of every ``models/{model_prefix}_*.pkl`` file.

    Files are visited in sorted path order. For each one the path, the
    unpickled object and a blank line are printed.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so this should
    only be pointed at files you created yourself.
    """
    matches = glob.glob(f"models/{model_prefix}_*.pkl")
    matches.sort()

    # Read each file and print what it holds
    for file_path in matches:
        with open(file_path, "rb") as handle:
            stored = pickle.load(handle)
        print(f"Contents of {file_path}:")
        print(stored)
        print()  # blank line between files

In [485]:
def plot_strategies_profits_scatter(save_type, dfs: dict):
    """Draw pairwise scatter plots of the ``"profits"`` column of several DataFrames.

    Every unordered pair of entries in ``dfs`` gets one subplot in a square
    grid. A dashed diagonal spanning the joint min/max of the two series is
    drawn in each subplot. The figure is saved to
    ``plots/plot_strategies_profits_scatter_{save_type}.png`` and then shown.

    A pair is skipped (its subplot stays blank) when either DataFrame is
    ``None`` or empty, or when the two have different lengths.

    :param save_type: label embedded in the saved file name
    :param dfs: mapping from strategy name to a DataFrame with a ``"profits"`` column
    :return: None. Prints a notice and returns early when fewer than two
        DataFrames are supplied.
    """
    names = list(dfs.keys())
    df_list = [dfs[name] for name in names]

    if len(df_list) <= 1:
        print("No dataframes to plot.")
        return

    pairs = list(itertools.combinations(range(len(df_list)), 2))
    num_pairs = len(pairs)
    grid_size = math.ceil(math.sqrt(num_pairs))
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid. Without it,
    # exactly two DataFrames (one pair) gives a bare Axes object and the
    # `axes[row, col]` indexing below raises.
    fig, axes = plt.subplots(
        grid_size, grid_size, figsize=(15, 15), squeeze=False
    )
    fig.suptitle("Scatter Plots of Profits (Matrix View)")

    for idx, (i, j) in enumerate(pairs):
        row, col = divmod(idx, grid_size)
        df_i, df_j = df_list[i], df_list[j]

        if df_i is None or df_j is None or df_i.empty or df_j.empty:
            continue
        if len(df_i) != len(df_j):
            continue

        # Joint range of both series, used for the dashed diagonal.
        low = min(df_i["profits"].min(), df_j["profits"].min())
        high = max(df_i["profits"].max(), df_j["profits"].max())

        ax = axes[row, col]
        ax.scatter(df_i["profits"], df_j["profits"], alpha=0.6)
        ax.plot([low, high], [low, high], "k--", linewidth=1)
        ax.set_xlabel(names[i])
        ax.set_ylabel(names[j])
        ax.set_title(f"{names[i]} vs {names[j]}")

    # Remove the grid cells that have no pair assigned to them
    for idx in range(num_pairs, grid_size * grid_size):
        row, col = divmod(idx, grid_size)
        fig.delaxes(axes[row, col])

    plt.tight_layout(rect=[0, 0, 1, 0.95])

    os.makedirs("plots", exist_ok=True)
    save_path = f"plots/plot_strategies_profits_scatter_{save_type}.png"
    plt.savefig(save_path, bbox_inches="tight")
    print(f"Plot saved as {save_path}")

    plt.show()
    plt.close(fig)

In [486]:
def plot_relative_profit_deviation(save_type, baseline_profit, max_profits):
    """
    Plot each strategy's profit deviation relative to a baseline as a bar chart.

    :param save_type: label embedded in the saved file name
    :param baseline_profit: baseline profit value
    :param max_profits: list of each strategy's max profit; entries that are
        None or -1 are treated as invalid and skipped
    """
    print(f"Baseline is: {baseline_profit}")
    for i, profit in enumerate(max_profits):
        print(f"S{i+1}'s profit: {profit}")

    # Relative deviation per valid strategy, keyed "S1", "S2", ...
    ratios = {}
    for idx, max_profit in enumerate(max_profits, start=1):
        if max_profit is None or max_profit == -1:
            continue
        if baseline_profit == 0:
            # With a zero baseline the raw profit is recorded instead
            ratios[f"S{idx}"] = max_profit
        else:
            ratios[f"S{idx}"] = (max_profit - baseline_profit) / abs(baseline_profit)

    # y-axis range: 0.1 of padding around the data, or a small default window
    y_min, y_max = -0.1, 0.1
    if ratios:
        y_min = min(ratios.values()) - 0.1
        y_max = max(ratios.values()) + 0.1

    plt.figure(figsize=(12, 8))

    if ratios:
        bars = plt.bar(
            ratios.keys(), ratios.values(), color=plt.cm.tab10(range(len(ratios)))
        )

        # Write each bar's value at its top
        for bar in bars:
            height = bar.get_height()
            x_center = bar.get_x() + bar.get_width() / 2
            plt.text(x_center, height, f"{height:.4f}", ha="center", va="bottom")

    # Dashed reference line at zero deviation
    plt.axhline(y=0, color="gray", linestyle="--", label="Baseline (No Opt)")

    # Title, axis labels, limits and legend
    plt.title("Relative Avg Profit Deviation from Baseline (1)")
    plt.xlabel("Strategies")
    plt.ylabel("Deviation from Baseline (1)")
    plt.ylim(y_min, y_max)
    plt.legend()

    os.makedirs("plots", exist_ok=True)
    name = "plot_relative_profit_deviation"
    save_path = f"plots/{name}_{save_type}_{data_size}_{CURRENT_TIMESTAMP}.png"

    plt.savefig(save_path, format="png", bbox_inches="tight")
    print(f"Plot saved as {save_path}")

    # Display, then release the figure
    plt.show()
    plt.close()

In [487]:
def _relative_profit_ratio(profit, baseline):
    """Return ``profit``'s deviation relative to ``baseline``, or None if invalid.

    ``None`` and ``-1`` mark an invalid profit. With a zero baseline the raw
    profit is returned instead of a ratio.
    """
    if profit is None or profit == -1:
        return None
    if baseline != 0:
        return (profit - baseline) / abs(baseline)
    return profit


def plot_relative_profit_comparison(
    save_type,
    train_baseline_profit,
    test_baseline_profit,
    test_max_profits,
    train_max_profits,
):
    """Plot train vs. test relative profit deviation per strategy as grouped bars.

    Strategies are labelled ``S1``, ``S2``, ... in input order. For each one
    the deviation ``(profit - baseline) / abs(baseline)`` is computed
    separately for train and test. The figure is saved to
    ``plots/plot_relative_profit_comparison_{save_type}.png`` and then shown.

    :param save_type: label embedded in the saved file name
    :param train_baseline_profit: baseline profit for the train data
    :param test_baseline_profit: baseline profit for the test data
    :param test_max_profits: per-strategy profits on the test data
        (None or -1 marks an invalid entry)
    :param train_max_profits: per-strategy profits on the train data
        (None or -1 marks an invalid entry)
    """
    # Relative deviations from the baseline for train and test data
    test_ratios, train_ratios = {}, {}
    labels = []
    for idx, (test_profit, train_profit) in enumerate(
        zip(test_max_profits, train_max_profits), start=1
    ):
        label = f"S{idx}"
        test_ratio = _relative_profit_ratio(test_profit, test_baseline_profit)
        train_ratio = _relative_profit_ratio(train_profit, train_baseline_profit)
        if test_ratio is not None:
            test_ratios[label] = test_ratio
        if train_ratio is not None:
            train_ratios[label] = train_ratio
        if test_ratio is not None or train_ratio is not None:
            labels.append(label)

    # Each strategy keeps its own x slot even when only one of its two values
    # is valid. Previously both series were plotted against the train count,
    # which failed when the counts differed.
    slot = {label: position for position, label in enumerate(labels)}
    train_x = np.array([slot[label] for label in train_ratios], dtype=float)
    test_x = np.array([slot[label] for label in test_ratios], dtype=float)

    # Symmetric y-range based on the largest absolute deviation. Using the
    # largest signed value made y_max negative (inverted axis, empty tick
    # array) whenever every strategy was more than 0.1 below the baseline.
    magnitude = max(
        (abs(value) for value in (*train_ratios.values(), *test_ratios.values())),
        default=0.0,
    )
    y_max = min(magnitude + 0.1, 1.0)  # Limit max y to 1.0
    y_min = -y_max  # Keep symmetric scaling

    # Tick marks at intervals of 0.05
    y_ticks = np.arange(y_min, y_max + 0.05, 0.05)

    plt.figure(figsize=(14, 8))
    bar_width = 0.35

    # Train bar on the left and test bar on the right of each slot
    train_bars = plt.bar(
        train_x - bar_width / 2,
        list(train_ratios.values()),
        bar_width,
        label="Train Data",
        color="salmon",
    )
    test_bars = plt.bar(
        test_x + bar_width / 2,
        list(test_ratios.values()),
        bar_width,
        label="Test Data",
        color="skyblue",
    )

    # Baseline reference line
    plt.axhline(y=0, color="gray", linestyle="--", label="Baseline (No Opt)")

    # Value label on every bar; negative bars are labelled below their tip so
    # the text does not overlap the bar.
    for bar in (*train_bars, *test_bars):
        yval = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            yval,
            f"{yval:.2f}",
            ha="center",
            va="bottom" if yval >= 0 else "top",
        )

    # Labels and title
    plt.xlabel("Strategies")
    plt.ylabel("Deviation from Baseline")
    plt.title("Relative Profit Deviation Comparison between Train and Test Data")
    plt.xticks(np.arange(len(labels)), labels)

    # Fixed y-axis range and ticks
    plt.ylim(y_min, y_max)
    plt.yticks(y_ticks)

    plt.legend()

    name = "plot_relative_profit_comparison"

    os.makedirs("plots", exist_ok=True)
    save_path = f"plots/{name}_{save_type}.png"

    plt.savefig(save_path, format="png", bbox_inches="tight")
    print(f"Plot saved as {save_path}")

    # Show plot
    plt.show()
    plt.close()

In [488]:
def plot_Q0_Q1_distribution(save_type, stimulation_results_dfs):
    """Plot overlaid histograms of the ``Q0`` and ``Q1`` columns of each DataFrame.

    One figure per non-empty DataFrame is saved to
    ``plots/plot_Q0_Q1_distribution_{save_type}_{data_size}_S{idx}_{CURRENT_TIMESTAMP}.png``
    (``idx`` starts at 1) and then shown. ``None`` and empty entries are skipped.

    The input DataFrames are not modified: the numeric conversion and the
    removal of rows with non-numeric ``Q0``/``Q1`` happen on a copy. The
    previous version overwrote the columns and dropped rows in the caller's
    DataFrame.

    :param save_type: label embedded in the saved file name
    :param stimulation_results_dfs: iterable of DataFrames (or None) with
        ``Q0`` and ``Q1`` columns
    """
    for idx, df in enumerate(stimulation_results_dfs, start=1):
        if df is None or len(df) == 0:
            continue

        # Coerce to numbers on a copy; rows where either value is not
        # numeric are left out of the plot.
        quantities = (
            df[["Q0", "Q1"]].apply(pd.to_numeric, errors="coerce").dropna()
        )

        plt.figure(figsize=(10, 6))
        plt.hist(quantities["Q0"], bins=20, alpha=0.6, label="Q0", edgecolor="black")
        plt.hist(quantities["Q1"], bins=20, alpha=0.6, label="Q1", edgecolor="black")
        plt.title(f"Histogram of Q0 and Q1 for stimulation_results_df_{idx}")
        plt.xlabel("Value")
        plt.ylabel("Count")
        plt.legend()
        plt.grid(True)

        name = "plot_Q0_Q1_distribution"

        os.makedirs("plots", exist_ok=True)
        save_path = (
            f"plots/{name}_{save_type}_{data_size}_S{idx}_{CURRENT_TIMESTAMP}.png"
        )

        plt.savefig(save_path, format="png", bbox_inches="tight")
        print(f"Plot saved as {save_path}")

        plt.show()
        # Release the figure; one is created per DataFrame in this loop.
        plt.close()

In [None]:


def plot_profits_deviation_box_plot(
    save_type, stimulation_results_dfs, baseline_avg_profits
):
    """Draw a box plot of ``profits - baseline_avg_profits`` for each DataFrame.

    One figure per usable DataFrame is saved to
    ``plots/plot_profits_deviation_box_plot_{save_type}_{data_size}_S{idx}_{CURRENT_TIMESTAMP}.png``
    (``idx`` starts at 1) and then shown. Entries that are ``None`` or have no
    ``"profits"`` column are skipped with a printed notice.

    The input DataFrames are not modified. The previous version converted
    ``"profits"`` in place, dropped rows and added a ``"Deviation"`` column to
    the caller's DataFrame.

    :param save_type: label embedded in the saved file name
    :param stimulation_results_dfs: iterable of DataFrames (or None)
    :param baseline_avg_profits: value subtracted from every profit
    """
    for idx, df in enumerate(stimulation_results_dfs, start=1):
        if df is None or "profits" not in df.columns:
            print(f"Skipping stimulation_results_df_{idx}: Missing 'profits' column.")
            continue

        # Numeric profits only; non-numeric entries are left out of the plot.
        profits = pd.to_numeric(df["profits"], errors="coerce").dropna()

        # Deviation from the baseline
        deviation = (profits - baseline_avg_profits).rename("Deviation")

        plt.figure(figsize=(8, 6))
        sns.boxplot(y=deviation)
        plt.axhline(0, color="red", linestyle="--", label="Baseline")
        plt.title(
            f"Boxplot of Deviation of Profits from Baseline for stimulation_results_df_{idx}"
        )
        plt.ylabel("Deviation")
        plt.legend()
        plt.grid(True, axis="y")

        name = "plot_profits_deviation_box_plot"

        os.makedirs("plots", exist_ok=True)
        save_path = (
            f"plots/{name}_{save_type}_{data_size}_S{idx}_{CURRENT_TIMESTAMP}.png"
        )

        plt.savefig(save_path, format="png", bbox_inches="tight")
        print(f"Plot saved as {save_path}")

        plt.show()
        # Release the figure; one is created per DataFrame in this loop.
        plt.close()

## Others

In [490]:
def replace_negative_with_zero(df):
    """Return a copy of ``df`` in which every negative value is replaced by 0.

    Uses the vectorised ``DataFrame.clip(lower=0)`` instead of an element-wise
    ``applymap``, which is deprecated in recent pandas releases. Missing
    values stay missing and the input DataFrame is not modified.
    """
    return df.clip(lower=0)

In [491]:
def check_values(
    Q1_vars,
    Q_hat_adjusteds,
    Q0_vars,
    Sold_0s,
    total_demand_up_to_k_minus_1_vars,
    Sold_1s,
    total_demand_from_k_to_T_vars,
    Q1_plus_lefts,
    Left_0s,
    Lost_0s,
    Left_1s,
    Lost_1s,
    tol=0.0,
):
    """Compare solved Sold/Left/Lost values against values recomputed from Q0, Q1 and demand.

    Every argument except ``tol`` is an indexable collection whose items
    expose the solved value as ``.X``. For each index ``i`` the expected
    values are recomputed as::

        sold_0 = min(demand_0, Q0)
        left_0 = max(Q0 - sold_0, 0)
        sold_1 = min(demand_1, Q1 + left_0)
        left_1 = max(Q1 + left_0 - sold_1, 0)
        lost_0 = max(demand_0 - Q0, 0)
        lost_1 = max(demand_1 - (Q1 + left_0), 0)

    and compared with the corresponding solved value. A mismatch is recorded
    with its absolute error and its error as a percentage of the expected
    value (0 when the expected value is 0).

    ``Q_hat_adjusteds`` and ``Q1_plus_lefts`` are accepted for interface
    compatibility but are not used by any check.

    :param tol: absolute tolerance below which a difference is not counted as
        a mismatch. The default ``0.0`` keeps the original exact-equality
        behaviour; for solver output a small positive value (for example
        ``1e-6``) filters out floating-point noise.
    :return: DataFrame with one row per condition that had at least one
        mismatch, with columns ``Condition``, ``Average_Error_Percentage``,
        ``Max_Error_Percentage``, ``Min_Error_Percentage``, ``Max_Error`` and
        ``Min_Error`` (all statistics rounded to 3 decimals).
    """
    # Conditions in reporting order. "Q1_vars" is kept from the original
    # bookkeeping although no check feeds it, so it never produces a row.
    tracked = (
        "Q1_vars",
        "Sold_0s",
        "Sold_1s",
        "Left_0s",
        "Left_1s",
        "Lost_0s",
        "Lost_1s",
    )
    conditions_errors = {condition: [] for condition in tracked}
    conditions_error_percentage = {condition: [] for condition in tracked}

    for i in range(len(Q1_vars)):
        # Solved inputs from which the expected values are derived
        Q1 = Q1_vars[i].X
        Q0 = Q0_vars[i].X
        total_demand_up_to_k_minus_1 = total_demand_up_to_k_minus_1_vars[i].X
        total_demand_from_k_to_T = total_demand_from_k_to_T_vars[i].X

        # Expected ("theoretical") values
        theoretical_sold_0 = min(total_demand_up_to_k_minus_1, Q0)
        theoretical_left_0 = max(Q0 - theoretical_sold_0, 0)
        theoretical_Q1_plus_left = Q1 + theoretical_left_0
        theoretical_sold_1 = min(total_demand_from_k_to_T, theoretical_Q1_plus_left)
        theoretical_left_1 = max(theoretical_Q1_plus_left - theoretical_sold_1, 0)
        theoretical_lost_0 = max(total_demand_up_to_k_minus_1 - Q0, 0)
        theoretical_lost_1 = max(total_demand_from_k_to_T - theoretical_Q1_plus_left, 0)

        checks = (
            ("Sold_0s", Sold_0s[i].X, theoretical_sold_0),
            ("Sold_1s", Sold_1s[i].X, theoretical_sold_1),
            ("Left_0s", Left_0s[i].X, theoretical_left_0),
            ("Left_1s", Left_1s[i].X, theoretical_left_1),
            ("Lost_0s", Lost_0s[i].X, theoretical_lost_0),
            ("Lost_1s", Lost_1s[i].X, theoretical_lost_1),
        )
        for condition, actual, expected in checks:
            error = abs(actual - expected)
            # Exact equality always passes; otherwise the difference must
            # exceed the tolerance to count as a mismatch.
            if actual == expected or error <= tol:
                continue
            conditions_errors[condition].append(error)
            conditions_error_percentage[condition].append(
                (error / expected) * 100 if expected != 0 else 0
            )

    # One summary row per condition that had at least one mismatch
    columns = [
        "Condition",
        "Average_Error_Percentage",
        "Max_Error_Percentage",
        "Min_Error_Percentage",
        "Max_Error",
        "Min_Error",
    ]
    rows = []
    for condition in tracked:
        errors = conditions_errors[condition]
        if not errors:
            continue
        error_percentages = conditions_error_percentage[condition]
        rows.append(
            {
                "Condition": condition,
                "Average_Error_Percentage": round(
                    sum(error_percentages) / len(error_percentages), 3
                ),
                "Max_Error_Percentage": round(max(error_percentages), 3),
                "Min_Error_Percentage": round(min(error_percentages), 3),
                "Max_Error": round(max(errors), 3),
                "Min_Error": round(min(errors), 3),
            }
        )

    return pd.DataFrame(rows, columns=columns)

In [492]:
def calculate_service_level(*, salvage_value, cost, price):
    """Return ``cu / (co + cu)`` with ``cu = price - cost`` and ``co = cost - salvage_value``.

    All arguments are keyword-only.
    """
    underage_cost = price - cost
    overage_cost = cost - salvage_value
    return underage_cost / (overage_cost + underage_cost)

In [493]:
def make_s3_related_strtegies_result(
    *,
    all_Rs,
    losses,
    lefts,
    profits,
    operation_profits,
    alpha_values,
    beta_values,
    F_vars,
    Q0_vars,
    Q1_vars,
    f_values,
    tau_values,
    holding_costs_0s,
    holding_costs_1s,
    all_left0s,
    all_left1s,
    all_lost0s,
    all_lost1s,
    gamma_values=None,
):
    """Package one strategy's outcome into a summary frame and a per-run frame.

    All arguments are keyword-only.

    :return: a tuple ``(summary, per_run)``.
        ``summary`` is a single-row DataFrame holding the averages of
        ``profits``, ``losses``, ``lefts`` and ``operation_profits`` (0 for an
        empty input) plus the ``alpha``/``beta``/``tau``/``gamma`` values, each
        stored as one cell. ``per_run`` is a DataFrame with one column per
        supplied sequence; its ``"R"`` column is ``all_Rs`` shifted down by 2.
    """

    def _mean_or_zero(values):
        # A falsy (empty) input averages to 0 instead of dividing by zero.
        return sum(values) / len(values) if values else 0

    summary = pd.DataFrame(
        {
            "average_profits": [_mean_or_zero(profits)],
            "average_losses": [_mean_or_zero(losses)],
            "average_lefts": [_mean_or_zero(lefts)],
            "average_operation_profits": [_mean_or_zero(operation_profits)],
            "alpha_values": [alpha_values],
            "beta_values": [beta_values],
            "tau_values": [tau_values],
            "gamma_values": [gamma_values],
        }
    )
    summary = summary.sort_values(by="average_profits", ascending=False)

    shifted_Rs = [r - 2 for r in all_Rs]
    per_run = pd.DataFrame(
        {
            "R(T)": all_Rs,
            "R": shifted_Rs,
            "F": F_vars,
            "f_values": f_values,
            "profits": profits,
            "losses": losses,
            "lefts": lefts,
            "operation_profits": operation_profits,
            "Q0": Q0_vars,
            "Q1": Q1_vars,
            "hc0": holding_costs_0s,
            "hc1": holding_costs_1s,
            "Left0s": all_left0s,
            "Left1s": all_left1s,
            "lost0s": all_lost0s,
            "lost1s": all_lost1s,
        }
    )

    return summary, per_run

# Analysis

In [3]:
# 列出所有 csv
import os

# Directory that holds the result files
directory = "/Users/hanyuan/Github/Two-Phase-Newsvendor/results"

# Keep only the entries whose name ends with .csv
csv_files = list(filter(lambda name: name.endswith(".csv"), os.listdir(directory)))

# Print the file names, one per line
for file in csv_files:
    print(file)

test_chunk100_lasso10_fold10.csv
test_chunk70_lasso10_fold10.csv
train_chunk50_lasso100_fold10.csv
test_chunk50_lasso10_fold10.csv
test_chunk70_lasso100_fold10.csv
test_chunk100_lasso100_fold3.csv
test_chunk70_lasso10_fold3.csv
test_chunk70_lasso1000_fold5.csv
test_chunk50_lasso100_fold5.csv
test_chunk100_lasso10_fold3.csv
train_chunk50_lasso10_fold5.csv
test_chunk70_lasso100_fold5.csv
test_chunk70_lasso1000_fold10.csv
train_chunk50_lasso1000_fold5.csv
train_chunk70_lasso1000_fold10.csv
test_chunk70_lasso10_fold5.csv
test_chunk100_lasso100_fold5.csv
test_chunk70_lasso1000_fold3.csv
test_chunk100_lasso100_fold10.csv
test_chunk50_lasso100_fold3.csv
test_chunk100_lasso10_fold5.csv
train_chunk100_lasso100_fold10.csv
train_chunk50_lasso10_fold3.csv
test_chunk50_lasso1000_fold10.csv
test_chunk70_lasso100_fold3.csv
train_chunk50_lasso1000_fold10.csv
train_chunk50_lasso1000_fold3.csv
train_chunk70_lasso100_fold10.csv
train_chunk50_lasso100_fold3.csv
train_chunk100_lasso10_fold5.csv
test_chunk5

In [16]:
import os

# Folder that holds the result files
directory = "/Users/hanyuan/Github/Two-Phase-Newsvendor/results_0327"

# Names of all .csv files in that folder
csv_files = [f for f in os.listdir(directory) if f.endswith(".csv")]

# Combination keys: the file name with its train_/test_ prefix removed
train_keys = set()
test_keys = set()

# Classify every file by its prefix.
# The prefix is sliced off instead of using str.replace(), which would also
# remove any later occurrence of "train_" / "test_" inside the file name and
# could make two different files collapse onto the same key.
for filename in csv_files:
    if filename.startswith("train_"):
        key = filename[len("train_"):]
        train_keys.add(key)
    elif filename.startswith("test_"):
        key = filename[len("test_"):]
        test_keys.add(key)

# Combinations that exist only for train
train_only = train_keys - test_keys

# Combinations that exist only for test
test_only = test_keys - train_keys

# Combinations that exist for both train and test
matched = train_keys & test_keys

# Report
print("✅ Matched train-test pairs:")
for key in sorted(matched):
    print(f" - {key}")

print("\n❌ Only in train:")
for key in sorted(train_only):
    print(f" - {key}")

print("\n❌ Only in test:")
for key in sorted(test_only):
    print(f" - {key}")

✅ Matched train-test pairs:
 - chunk100_lasso1_fold10.csv
 - chunk50_lasso1_fold10.csv
 - chunk70_lasso1_fold10.csv

❌ Only in train:

❌ Only in test:


In [28]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn style
sns.set(style="whitegrid")

# Parameters
input_dir = "/Users/hanyuan/Github/Two-Phase-Newsvendor/results_0327"
output_dir = os.path.join(input_dir, "plots")
os.makedirs(output_dir, exist_ok=True)  # create the plots folder if it is missing

# Collect all train/test files, keyed by the file name without its prefix.
# The prefix is sliced off instead of using str.replace(), which would also
# remove any later occurrence of "train_" / "test_" inside the file name.
files = [f for f in os.listdir(input_dir) if f.endswith(".csv")]
train_files = {f[len("train_"):]: f for f in files if f.startswith("train_")}
test_files = {f[len("test_"):]: f for f in files if f.startswith("test_")}

# Keys that have both a train and a test file
common_keys = sorted(set(train_files.keys()) & set(test_files.keys()))

print(f"Found {len(common_keys)} matched pairs.")

# Analyse and plot every matched pair
for key in common_keys:
    print("-----------------------------------")
    print(f"key: {key}")
    print(f"train_files[key]: {train_files[key]}")
    print(f"test_files[key]: {test_files[key]}")

    train_path = os.path.join(input_dir, train_files[key])
    test_path = os.path.join(input_dir, test_files[key])

    # Read the CSV files
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    train_df = train_df.drop(columns=["S12", "S15"])

    # Column means. The test means are aligned to the train methods by NAME.
    # Previously the two were paired by position, so a different column order
    # or an extra column in the test file (S12/S15 are dropped from train
    # only) silently mislabelled values or raised a length-mismatch error.
    train_means = train_df.mean()
    test_means = test_df.mean().reindex(train_means.index)

    # Reference values: the "baseline" column and the "S14" column
    # NOTE(review): S14 is labelled "theory best" in the plot title; confirm.
    baseline_train = train_means["baseline"]
    baseline_test = test_means["baseline"]
    theory_best_train = train_means["S14"]
    theory_best_test = test_means["S14"]

    # Percentage change relative to each reference
    train_pct_base = (train_means - baseline_train) / baseline_train * 100
    test_pct_base = (test_means - baseline_test) / baseline_test * 100
    train_pct_theory = (train_means - theory_best_train) / theory_best_train * 100
    test_pct_theory = (test_means - theory_best_test) / theory_best_test * 100

    # Collect everything in one DataFrame, one row per method
    avg_df = pd.DataFrame(
        {
            "Method": train_means.index,
            "Train": train_means.values,
            "Test": test_means.values,
            "Train_%_Base": train_pct_base.values,
            "Test_%_Base": test_pct_base.values,
            "Train_%_Theory": train_pct_theory.values,
            "Test_%_Theory": test_pct_theory.values,
        }
    )

    # Long format: one row per (method, dataset)
    avg_df_melted = avg_df.melt(
        id_vars=[
            "Method",
            "Train_%_Base",
            "Test_%_Base",
            "Train_%_Theory",
            "Test_%_Theory",
        ],
        value_vars=["Train", "Test"],
        var_name="Dataset",
        value_name="Average Profit",
    )

    # Draw the grouped bar chart
    plt.figure(figsize=(15, 9))
    ax = sns.barplot(x="Method", y="Average Profit", hue="Dataset", data=avg_df_melted)

    # Annotate every bar with its % change vs baseline and (in brackets) vs S14.
    # Bars are paired with the melted rows by position, as before.
    pct_lookup = avg_df.set_index("Method")
    for patch, (method, ds) in zip(
        ax.patches, zip(avg_df_melted["Method"], avg_df_melted["Dataset"])
    ):
        # ds is "Train" or "Test", matching the prefix of the percentage columns
        pct_base = pct_lookup.at[method, f"{ds}_%_Base"]
        pct_theory = pct_lookup.at[method, f"{ds}_%_Theory"]

        ax.annotate(
            f"{pct_base:.1f}%\n({pct_theory:.1f}%)",
            (patch.get_x() + patch.get_width() / 2, patch.get_height()),
            ha="center",
            va="bottom",
            fontsize=9,
            xytext=(0, 5),
            textcoords="offset points",
        )

    plt.title(f"Average Profit: {key}\n% Change vs Baseline / Theory Best")
    plt.ylabel("Average Profit")
    plt.xlabel("Method")
    plt.xticks(rotation=30)
    plt.legend(title="Dataset", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()

    # Save the figure; only the trailing extension is removed from the key
    plot_filename = os.path.join(
        output_dir, f"avg_profit_{os.path.splitext(key)[0]}.png"
    )
    plt.savefig(plot_filename)
    plt.close()

    print(f"✅ Saved plot for: {key}")

print(f"\n🎉 All plots saved in: {output_dir}")

Found 11 matched pairs.
-----------------------------------
key: chunk100_lasso1_fold10.csv
train_files[key]: train_chunk100_lasso1_fold10.csv
test_files[key]: test_chunk100_lasso1_fold10.csv
✅ Saved plot for: chunk100_lasso1_fold10.csv
-----------------------------------
key: chunk200_lasso1_fold10.csv
train_files[key]: train_chunk200_lasso1_fold10.csv
test_files[key]: test_chunk200_lasso1_fold10.csv
✅ Saved plot for: chunk200_lasso1_fold10.csv
-----------------------------------
key: chunk20_lasso1_fold10.csv
train_files[key]: train_chunk20_lasso1_fold10.csv
test_files[key]: test_chunk20_lasso1_fold10.csv
✅ Saved plot for: chunk20_lasso1_fold10.csv
-----------------------------------
key: chunk300_lasso1_fold10.csv
train_files[key]: train_chunk300_lasso1_fold10.csv
test_files[key]: test_chunk300_lasso1_fold10.csv
✅ Saved plot for: chunk300_lasso1_fold10.csv
-----------------------------------
key: chunk400_lasso1_fold10.csv
train_files[key]: train_chunk400_lasso1_fold10.csv
test_file

In [29]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style for every figure drawn by this cell.
sns.set(style="whitegrid")

# 🔧 Parameters
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
input_dir = "/Users/hanyuan/Github/Two-Phase-Newsvendor/results_0327"
output_dir = os.path.join(input_dir, "plots")
os.makedirs(output_dir, exist_ok=True)  # create the plots folder if it is missing

train_prefix = "train_"
test_prefix = "test_"
# Method columns left out of every plot. They are removed from BOTH the train
# and the test frame; the earlier version removed them from the train frame
# only, which left Test-only series for these methods in the figures.
excluded_methods = ["S12", "S15"]
merge_keys = ["Fold", "Method", "Dataset"]


def _relative_long(profit_df, reference_col, value_name, dataset):
    """Express every column as a % difference from ``reference_col``, in long form.

    Parameters
    ----------
    profit_df : pd.DataFrame
        Numeric frame with one column per method; ``reference_col`` must be one
        of its columns. The row index + 1 is used as the fold number.
    reference_col : str
        Column whose per-row value is the reference (denominator).
    value_name : str
        Name given to the melted value column.
    dataset : str
        Label written to the ``Dataset`` column (e.g. "Train" / "Test").

    Returns
    -------
    pd.DataFrame
        Columns ``Fold``, ``Method``, ``value_name`` and ``Dataset``.
    """
    # Column vector so the subtraction/division broadcasts row-wise.
    reference = profit_df[reference_col].to_numpy().reshape(-1, 1)
    relative = pd.DataFrame(
        (profit_df.to_numpy() - reference) / reference * 100,
        columns=profit_df.columns,
        index=profit_df.index,
    )
    relative["Fold"] = relative.index + 1
    long_df = relative.melt(id_vars="Fold", var_name="Method", value_name=value_name)
    long_df["Dataset"] = dataset
    return long_df


# 🔍 Collect every train/test CSV in the results folder.
# Only the leading prefix is stripped to build the pairing key; str.replace
# would also remove any later occurrence of "train_" / "test_" in the name.
files = [f for f in os.listdir(input_dir) if f.endswith(".csv")]
train_files = {
    f[len(train_prefix):]: f for f in files if f.startswith(train_prefix)
}
test_files = {f[len(test_prefix):]: f for f in files if f.startswith(test_prefix)}

# 🔗 Keys that have both a train and a test file.
common_keys = sorted(set(train_files.keys()) & set(test_files.keys()))

print(f"Found {len(common_keys)} matched pairs.")

# 🎯 Analyse and plot each matched train/test pair.
for key in common_keys:
    print("-----------------------------------")
    print(f"key: {key}")
    print(f"train_files[key]: {train_files[key]}")
    print(f"test_files[key]: {test_files[key]}")

    train_path = os.path.join(input_dir, train_files[key])
    test_path = os.path.join(input_dir, test_files[key])

    # errors="ignore": a file that lacks the excluded columns is simply left as is.
    train_df = pd.read_csv(train_path).drop(columns=excluded_methods, errors="ignore")
    test_df = pd.read_csv(test_path).drop(columns=excluded_methods, errors="ignore")

    # Base name shared by the three image files written for this pair.
    stem = key.replace(".csv", "")

    # ========== 1️⃣ Relative (%) vs Baseline ==========
    fold_long = pd.concat(
        [
            _relative_long(train_df, "baseline", "Relative Profit (%)", "Train"),
            _relative_long(test_df, "baseline", "Relative Profit (%)", "Test"),
        ],
        axis=0,
    ).reset_index(drop=True)

    # 🎨 1. Line plot: one line per method, marker style per dataset.
    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=fold_long,
        x="Fold",
        y="Relative Profit (%)",
        hue="Method",
        style="Dataset",
        markers=True,
        dashes=False,
    )
    plt.axhline(0, color="gray", linestyle="--", linewidth=1)  # 0 % = baseline
    plt.title(f"[{key}] Strategy Performance Across Folds (Relative to Baseline)")
    plt.legend(title="Method & Dataset", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"line_{stem}.png"))
    plt.close()

    # 🎨 2. Box plot: spread of each method across folds.
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=fold_long, x="Method", y="Relative Profit (%)", hue="Dataset")
    plt.axhline(0, color="gray", linestyle="--", linewidth=1)
    plt.title(f"[{key}] Strategy Performance Distribution Across Folds")
    plt.xticks(rotation=30)
    plt.legend(title="Dataset", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"box_{stem}.png"))
    plt.close()

    # ========== 2️⃣ Heatmap: Relative vs Theory ==========
    # The per-fold "S14" column is used as the theory-best reference.
    theory_long = pd.concat(
        [
            _relative_long(train_df, "S14", "Relative vs Theory (%)", "Train"),
            _relative_long(test_df, "S14", "Relative vs Theory (%)", "Test"),
        ],
        axis=0,
    )

    merged = fold_long.merge(theory_long, on=merge_keys)

    # Cell colour = % vs baseline; cell text = "% vs baseline\n(% vs theory%)".
    heatmap_data = merged.pivot(
        index="Fold", columns=["Method", "Dataset"], values="Relative Profit (%)"
    )
    annot = merged.assign(
        annot=merged["Relative Profit (%)"].round(1).astype(str)
        + "\n("
        + merged["Relative vs Theory (%)"].round(1).astype(str)
        + "%)"
    ).pivot(index="Fold", columns=["Method", "Dataset"], values="annot")

    # 🎨 3. Heatmap (fmt="" because the annotations are pre-formatted strings).
    plt.figure(figsize=(14, 8))
    sns.heatmap(heatmap_data, annot=annot, fmt="", cmap="coolwarm", linewidths=0.5)
    plt.title(
        f"[{key}] Relative Profit (%) Across Folds\n(vs Baseline / (vs Theory Best))"
    )
    plt.ylabel("Fold")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"heatmap_{stem}.png"))
    plt.close()

    print(f"✅ Plots saved for: {key}")

print(f"\n🎉 All plots saved to: {output_dir}")

Found 11 matched pairs.
-----------------------------------
key: chunk100_lasso1_fold10.csv
train_files[key]: train_chunk100_lasso1_fold10.csv
test_files[key]: test_chunk100_lasso1_fold10.csv
✅ Plots saved for: chunk100_lasso1_fold10.csv
-----------------------------------
key: chunk200_lasso1_fold10.csv
train_files[key]: train_chunk200_lasso1_fold10.csv
test_files[key]: test_chunk200_lasso1_fold10.csv
✅ Plots saved for: chunk200_lasso1_fold10.csv
-----------------------------------
key: chunk20_lasso1_fold10.csv
train_files[key]: train_chunk20_lasso1_fold10.csv
test_files[key]: test_chunk20_lasso1_fold10.csv
✅ Plots saved for: chunk20_lasso1_fold10.csv
-----------------------------------
key: chunk300_lasso1_fold10.csv
train_files[key]: train_chunk300_lasso1_fold10.csv
test_files[key]: test_chunk300_lasso1_fold10.csv
✅ Plots saved for: chunk300_lasso1_fold10.csv
-----------------------------------
key: chunk400_lasso1_fold10.csv
train_files[key]: train_chunk400_lasso1_fold10.csv
test_

In [None]:
# NOTE(review): this whole cell is commented out (disabled). It reads
# `train_all_fold_profit_df` / `test_all_fold_profit_df`, which are not defined
# in this cell, and shows the figure with plt.show() instead of saving it.
# Delete it or re-enable it deliberately.

# # 1️⃣ Compute the average profit
# train_means = train_all_fold_profit_df.mean()
# test_means = test_all_fold_profit_df.mean()

# # 2️⃣ Define baseline & theory best
# baseline_train = train_means["baseline"]
# baseline_test = test_means["baseline"]
# theory_best_train = train_means["S14"]
# theory_best_test = test_means["S14"]

# # 3️⃣ Compute the percentage change: baseline & theory
# train_pct_base = (train_means - baseline_train) / baseline_train * 100
# test_pct_base = (test_means - baseline_test) / baseline_test * 100
# train_pct_theory = (train_means - theory_best_train) / theory_best_train * 100
# test_pct_theory = (test_means - theory_best_test) / theory_best_test * 100

# # 4️⃣ Build the DataFrame
# avg_df = pd.DataFrame(
#     {
#         "Method": train_means.index,
#         "Train": train_means.values,
#         "Test": test_means.values,
#         "Train_%_Base": train_pct_base.values,
#         "Test_%_Base": test_pct_base.values,
#         "Train_%_Theory": train_pct_theory.values,
#         "Test_%_Theory": test_pct_theory.values,
#     }
# )

# avg_df_melted = avg_df.melt(
#     id_vars=[
#         "Method",
#         "Train_%_Base",
#         "Test_%_Base",
#         "Train_%_Theory",
#         "Test_%_Theory",
#     ],
#     value_vars=["Train", "Test"],
#     var_name="Dataset",
#     value_name="Average Profit",
# )

# # 5️⃣ Plot
# plt.figure(figsize=(15, 9))
# ax = sns.barplot(x="Method", y="Average Profit", hue="Dataset", data=avg_df_melted)

# # 6️⃣ Annotate: baseline (%) on the first line, theory (%) in parentheses on the second line
# for patch, (method, ds) in zip(
#     ax.patches, zip(avg_df_melted["Method"], avg_df_melted["Dataset"])
# ):
#     if ds == "Train":
#         pct_base = avg_df.loc[avg_df.Method == method, "Train_%_Base"].values[0]
#         pct_theory = avg_df.loc[avg_df.Method == method, "Train_%_Theory"].values[0]
#     else:
#         pct_base = avg_df.loc[avg_df.Method == method, "Test_%_Base"].values[0]
#         pct_theory = avg_df.loc[avg_df.Method == method, "Test_%_Theory"].values[0]

#     ax.annotate(
#         f"{pct_base:.1f}%\n({pct_theory:.1f}%)",
#         (patch.get_x() + patch.get_width() / 2, patch.get_height()),
#         ha="center",
#         va="bottom",
#         fontsize=9,
#         xytext=(0, 5),
#         textcoords="offset points",
#     )

# plt.title("Average Profit (Train vs Test) — % Change vs Baseline / Theory Best")
# plt.ylabel("Average Profit")
# plt.xlabel("Method")
# plt.xticks(rotation=30)
# plt.legend(title="Dataset", bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.tight_layout()
# plt.show()

In [2]:
# NOTE(review): this whole cell is commented out (disabled). It builds line,
# box and heatmap plots from `train_all_fold_profit_df` /
# `test_all_fold_profit_df`, which are not defined in this cell, and displays
# them with plt.show(). Delete it or re-enable it deliberately.

# # Compute the baseline (train and test)
# baseline_train = train_all_fold_profit_df["baseline"].to_numpy().reshape(-1, 1)
# baseline_test = test_all_fold_profit_df["baseline"].to_numpy().reshape(-1, 1)

# # Compute the percentage change
# train_relative = (
#     (train_all_fold_profit_df.to_numpy() - baseline_train) / baseline_train * 100
# )
# test_relative = (
#     (test_all_fold_profit_df.to_numpy() - baseline_test) / baseline_test * 100
# )

# # Convert back to DataFrames, keeping the column names
# train_relative = pd.DataFrame(
#     train_relative,
#     columns=train_all_fold_profit_df.columns,
#     index=train_all_fold_profit_df.index,
# )
# test_relative = pd.DataFrame(
#     test_relative,
#     columns=test_all_fold_profit_df.columns,
#     index=test_all_fold_profit_df.index,
# )

# # Add the fold number
# train_relative["Fold"] = train_relative.index + 1
# test_relative["Fold"] = test_relative.index + 1

# # Convert to long format
# train_long = train_relative.melt(
#     id_vars="Fold", var_name="Method", value_name="Relative Profit (%)"
# )
# train_long["Dataset"] = "Train"

# test_long = test_relative.melt(
#     id_vars="Fold", var_name="Method", value_name="Relative Profit (%)"
# )
# test_long["Dataset"] = "Test"

# # Combine the data
# fold_long = pd.concat([train_long, test_long], axis=0)

# # === 1. Line plot: observe the trend across folds ===
# plt.figure(figsize=(12, 6))
# sns.lineplot(
#     data=fold_long,
#     x="Fold",
#     y="Relative Profit (%)",
#     hue="Method",
#     style="Dataset",
#     markers=True,
#     dashes=False,
# )
# plt.axhline(0, color="gray", linestyle="--", linewidth=1)  # baseline reference line
# plt.title("Strategy Performance Across Folds (Relative to Baseline)")
# plt.legend(title="Method & Dataset", bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.show()

# # === 2. Box plot: inspect the stability of each strategy ===
# plt.figure(figsize=(12, 6))
# sns.boxplot(data=fold_long, x="Method", y="Relative Profit (%)", hue="Dataset")
# plt.axhline(0, color="gray", linestyle="--", linewidth=1)
# plt.title("Strategy Performance Distribution Across Folds")
# plt.xticks(rotation=30)
# plt.legend(title="Dataset", bbox_to_anchor=(1.05, 1), loc="upper left")
# plt.show()

# # 3️⃣ Heatmap: show both vs Baseline & vs Theory Best (each fold's S14)
# theory_best_train = train_all_fold_profit_df["S14"].to_numpy().reshape(-1, 1)
# theory_best_test = test_all_fold_profit_df["S14"].to_numpy().reshape(-1, 1)

# # Compute vs Theory (%)
# train_theory_rel = (
#     (train_all_fold_profit_df.to_numpy() - theory_best_train) / theory_best_train * 100
# )
# test_theory_rel = (
#     (test_all_fold_profit_df.to_numpy() - theory_best_test) / theory_best_test * 100
# )

# # Back to DataFrames, then melt
# train_theory_rel = pd.DataFrame(
#     train_theory_rel,
#     columns=train_all_fold_profit_df.columns,
#     index=train_all_fold_profit_df.index,
# )
# train_theory_rel["Fold"] = train_theory_rel.index + 1
# train_theory_long = train_theory_rel.melt(
#     id_vars="Fold", var_name="Method", value_name="Relative vs Theory (%)"
# )
# train_theory_long["Dataset"] = "Train"

# test_theory_rel = pd.DataFrame(
#     test_theory_rel,
#     columns=test_all_fold_profit_df.columns,
#     index=test_all_fold_profit_df.index,
# )
# test_theory_rel["Fold"] = test_theory_rel.index + 1
# test_theory_long = test_theory_rel.melt(
#     id_vars="Fold", var_name="Method", value_name="Relative vs Theory (%)"
# )
# test_theory_long["Dataset"] = "Test"

# # Merge the baseline (%) and theory (%) data
# merged = fold_long.merge(
#     pd.concat([train_theory_long, test_theory_long], axis=0),
#     on=["Fold", "Method", "Dataset"],
# )

# # Pivot heatmap values + annotations
# heatmap_data = merged.pivot(
#     index="Fold", columns=["Method", "Dataset"], values="Relative Profit (%)"
# )
# annot = merged.assign(
#     annot=merged["Relative Profit (%)"].round(1).astype(str)
#     + "\n("
#     + merged["Relative vs Theory (%)"].round(1).astype(str)
#     + "%)"
# ).pivot(index="Fold", columns=["Method", "Dataset"], values="annot")

# plt.figure(figsize=(14, 8))
# sns.heatmap(heatmap_data, annot=annot, fmt="", cmap="coolwarm", linewidths=0.5)
# plt.title("Relative Profit (%) Across Folds\n(vs Baseline / (vs Theory Best))")
# plt.ylabel("Fold")
# plt.tight_layout()
# plt.show()

In [3]:
# NOTE(review): this whole cell is commented out (disabled). It depends on
# `train_all_fold_stimulation_results`, `plot_strategies_profits_scatter`,
# `status` and `model_prefix`, none of which are defined in this cell.

# # Training-phase distribution

# baseline_data = []
# S1_data = []
# S2_data = []
# S12_data = []
# S14_data = []
# S15_data = []

# for result in train_all_fold_stimulation_results:
#     baseline_data.append(result["baseline"])
#     S1_data.append(result["S1"])
#     S2_data.append(result["S2"])
#     S12_data.append(result["S12"])
#     S14_data.append(result["S14"])
#     S15_data.append(result["S15"])

# # Combine the per-fold data of each strategy
# baseline_df = pd.concat(baseline_data, ignore_index=True)
# S1_df = pd.concat(S1_data, ignore_index=True)
# S2_df = pd.concat(S2_data, ignore_index=True)
# S12_df = pd.concat(S12_data, ignore_index=True)
# S14_df = pd.concat(S14_data, ignore_index=True)
# S15_df = pd.concat(S15_data, ignore_index=True)


# dfs = {
#     "baseline": baseline_df,
#     "S1": S1_df,
#     "S2": S2_df,
#     "S12": S12_df,
#     "S15": S15_df,
#     "S14": S14_df,
# }

# # Call the plotting function
# plot_strategies_profits_scatter(f"{status}_{model_prefix}", dfs)

In [4]:
# NOTE(review): this whole cell is commented out (disabled). It depends on
# `test_all_fold_stimulation_results`, `plot_strategies_profits_scatter`,
# `status` and `model_prefix`, none of which are defined in this cell.

# # Test-result distribution plot

# baseline_data = []
# S1_data = []
# S2_data = []
# S12_data = []
# S14_data = []
# S15_data = []

# for result in test_all_fold_stimulation_results:
#     baseline_data.append(result["baseline"])
#     S1_data.append(result["S1"])
#     S2_data.append(result["S2"])
#     S12_data.append(result["S12"])
#     S14_data.append(result["S14"])
#     S15_data.append(result["S15"])
# # Combine the per-fold data of each strategy
# baseline_df = pd.concat(baseline_data, ignore_index=True)
# S1_df = pd.concat(S1_data, ignore_index=True)
# S2_df = pd.concat(S2_data, ignore_index=True)
# S12_df = pd.concat(S12_data, ignore_index=True)
# S14_df = pd.concat(S14_data, ignore_index=True)
# S15_df = pd.concat(S15_data, ignore_index=True)

# dfs = {
#     "baseline": baseline_df,
#     "S1": S1_df,
#     "S2": S2_df,
#     "S12": S12_df,
#     "S15": S15_df,
#     "S14": S14_df,
# }

# # Call the plotting function
# plot_strategies_profits_scatter(f"{status}_{model_prefix}", dfs)