In [None]:
# standard scaler 
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

# Path configuration: folders searched for recorded episode CSV files.
csv_folders = [
    Path("C:/Users/Developer/TCLab/csv/data_back3"),
    Path("C:/Users/Developer/TCLab/csv/data_back4"),
    Path("C:/Users/Developer/TCLab/csv/data_back5"),
    Path("C:/Users/Developer/TCLab/csv/data_back6"),
]

# Transition buffers, accumulated across every episode file.
all_observations = []
all_actions = []
all_next_observations = []
raw_rewards = []
all_dones = []

# Parameters of a previously tried, currently disabled reward variant:
#   -sqrt(error1**2 + error2**2) - alpha * (i * dt), with the errors taken
#   from the *next* row. They are kept so that variant can be restored.
alpha = 0.05  # time-penalty coefficient
dt = 5.0      # sample interval (seconds)

# Columns that make up one observation / one action, in storage order.
STATE_COLUMNS = ["T1", "T2", "TSP1", "TSP2"]
ACTION_COLUMNS = ["Q1", "Q2"]


def episode_to_transitions(df):
    """Convert one episode table into (s, a, s', r, done) transition lists.

    Row ``i`` supplies the state and action, row ``i + 1`` the next state.
    The reward is the negative Euclidean norm of the setpoint error of the
    *current* row: ``-sqrt((TSP1 - T1)**2 + (TSP2 - T2)**2)``.  Only the last
    transition of the episode is flagged as terminal.

    Args:
        df: DataFrame containing the columns in ``STATE_COLUMNS`` and
            ``ACTION_COLUMNS``.

    Returns:
        A 5-tuple ``(observations, actions, next_observations, rewards,
        dones)`` of plain Python lists, each of length ``len(df) - 1``.
        All five lists are empty when the frame has fewer than two rows.
    """
    n_transitions = len(df) - 1
    if n_transitions < 1:
        # A transition needs a current and a next row.
        return [], [], [], [], []

    # Pull the columns out once instead of building a Series per row.
    states = df[STATE_COLUMNS].to_numpy(dtype=np.float64)
    controls = df[ACTION_COLUMNS].to_numpy(dtype=np.float64)

    current = states[:-1]
    error1 = current[:, 2] - current[:, 0]  # TSP1 - T1
    error2 = current[:, 3] - current[:, 1]  # TSP2 - T2
    rewards = -np.sqrt(error1**2 + error2**2)

    dones = [False] * (n_transitions - 1) + [True]

    return (
        current.tolist(),
        controls[:-1].tolist(),
        states[1:].tolist(),
        rewards.tolist(),
        dones,
    )


for folder in csv_folders:
    # Sorted so that episodes are appended in a reproducible order.
    csv_files = sorted(folder.glob("mpc_episode_*_data.csv"))
    for file in csv_files:
        df = pd.read_csv(file)
        ep_obs, ep_actions, ep_next_obs, ep_rewards, ep_dones = (
            episode_to_transitions(df)
        )

        all_observations.extend(ep_obs)
        all_actions.extend(ep_actions)
        all_next_observations.extend(ep_next_obs)
        raw_rewards.extend(ep_rewards)
        all_dones.extend(ep_dones)

# Standardize the rewards with a StandardScaler fitted on every collected
# reward. The scaler expects a 2-D column, so reshape before fitting and
# flatten back to 1-D afterwards.
import joblib

reward_column = np.array(raw_rewards).reshape(-1, 1)
scaler = StandardScaler()
scaled_rewards = scaler.fit_transform(reward_column).flatten()

# Persist the fitted scaler (path is relative to the working directory).
joblib.dump(scaler, "first_reward.pkl")

# Assemble the dataset: float32 arrays for the numeric fields and a boolean
# array for the episode-termination flags.
dataset = dict(
    observations=np.array(all_observations, dtype=np.float32),
    actions=np.array(all_actions, dtype=np.float32),
    next_observations=np.array(all_next_observations, dtype=np.float32),
    rewards=scaled_rewards.astype(np.float32),
    terminals=np.array(all_dones, dtype=bool),
)
# Extra debugging output: value range plus mean / std of each dataset field.
acts = dataset['actions']
print("[Debug] Action range in dataset:", acts.min(), "~", acts.max())
print("[Debug] Action mean:", acts.mean(axis=0), "std:", acts.std(axis=0))

# Read the rewards from the dataset dict. `scaled_rewards` is a NumPy array,
# so indexing it with the string 'rewards' raised IndexError and aborted the
# cell before the remaining statistics were printed.
rewards = dataset['rewards']
print("[Debug] rewards range in dataset:", rewards.min(), "~", rewards.max())
print("[Debug] rewards mean:", rewards.mean(axis=0), "std:", rewards.std(axis=0))

observations = dataset['observations']
print("[Debug] observations range in dataset:", observations.min(), "~", observations.max())
print("[Debug] observations mean:", observations.mean(axis=0), "std:", observations.std(axis=0))
# output_path = "C:/Users/Developer/TCLab/Data/MPC/first_reward.npz"
# np.savez(output_path, **dataset)
# print(f"✅ 저장 완료: {output_path}")
