In [1]:
import sys
import subprocess
import warnings

# ==============================================================================
# -1. 필수 라이브러리 설치 (실행 환경 초기화 대비)
# ==============================================================================
try:
    print("--- 필수 라이브러리 설치 시작 ---")
    python_executable = sys.executable
    subprocess.check_call([
        python_executable, "-m", "pip", "install",
        "d3rlpy[torch]", "pandas", "scikit-learn"
    ])
    print("✅ 라이브러리 설치 완료!")
except Exception as e:
    print(f"❌ 오류: 라이브러리 설치에 실패했습니다: {e}")
    sys.exit(1)

# Gym 관련 경고 숨기기 (기능 영향 없음)
warnings.filterwarnings("ignore", category=UserWarning, module="gym")

--- 필수 라이브러리 설치 시작 ---
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: http://repo.ai.gato/registry/repository/pypi-proxy/simple
Collecting gymnasium==1.0.0 (from d3rlpy[torch])
  Using cached http://repo.ai.gato/registry/repository/pypi-proxy/packages/gymnasium/1.0.0/gymnasium-1.0.0-py3-none-any.whl (958 kB)


[0m

Installing collected packages: gymnasium
Successfully installed gymnasium-1.0.0



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


✅ 라이브러리 설치 완료!


In [2]:
import pandas as pd
import numpy as np
import d3rlpy
import torch
from d3rlpy.dataset import MDPDataset
from d3rlpy.metrics import TDErrorEvaluator          # TD 오차 평가자
from d3rlpy.algos import DiscreteCQLConfig           # ✅ 이산 CQL
from d3rlpy.constants import ActionSpace             # ✅ 이산 액션 명시
from sklearn.model_selection import train_test_split
import ast
from pathlib import Path
import random

# 재현성
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


<torch._C.Generator at 0x7f53480c7930>

In [3]:
# ==============================================================================
# 0. 파일 경로 설정
# ==============================================================================
DATA_FILE = "rl_dataset.csv"
MODEL_SAVE_FILE = "cql_model.pt"

print("\n--- 최종 모듈: CQL(Discrete) 모델 학습 및 전략 시뮬레이션 시작 ---")


--- 최종 모듈: CQL(Discrete) 모델 학습 및 전략 시뮬레이션 시작 ---


In [4]:
# ==============================================================================
# 1. 데이터셋 불러오기 및 전처리
# ==============================================================================
try:
    df = pd.read_csv(DATA_FILE)
    print(f"✅ Step 1.1: '{DATA_FILE}' 로딩 성공! rows={len(df)}")
except FileNotFoundError:
    print(f"❌ 오류: '{DATA_FILE}' 파일을 찾을 수 없습니다.")
    sys.exit(1)

def safe_literal_eval(s):
    if isinstance(s, str):
        s = s.replace('nan', 'None').replace('inf', 'None').replace('-inf', 'None')
        try:
            return ast.literal_eval(s)
        except (ValueError, SyntaxError):
            return None
    return s

# 문자열 → 리스트/배열 복구
df['STATE'] = df['STATE'].apply(safe_literal_eval)
df['NEXT_STATE'] = df['NEXT_STATE'].apply(safe_literal_eval)

# 결측 제거
before = len(df)
df.dropna(subset=['STATE', 'NEXT_STATE', 'ACTION', 'REWARD'], inplace=True)
after = len(df)
print(f"✅ Step 1.2: 데이터 정제 완료! (삭제 {before - after}행, 잔여 {after}행)")

# 액션 정수 리맵핑: 0..N-1
unique_actions = np.sort(pd.unique(df['ACTION']))
action_to_idx = {a: i for i, a in enumerate(unique_actions)}
df['ACTION_IDX'] = df['ACTION'].map(action_to_idx).astype(np.int32)
num_actions = len(unique_actions)
print(f"ℹ️ 고유 액션: {list(unique_actions)} → 인덱스 매핑 0..{num_actions-1}")

# 보상/상태 타입
df['REWARD'] = df['REWARD'].astype(np.float32)

# 상태를 float32 ndarray로 보장
def to_f32_array(x):
    arr = np.array(x, dtype=np.float32)
    return arr

df['STATE'] = df['STATE'].apply(to_f32_array)
df['NEXT_STATE'] = df['NEXT_STATE'].apply(to_f32_array)

# 모든 상태 벡터 길이가 동일한지 확인
state_dims = {s.shape for s in df['STATE']}
if len(state_dims) != 1:
    raise ValueError(f"STATE 차원이 일치하지 않습니다: {state_dims}")
state_dim = list(state_dims)[0][0]
print(f"ℹ️ 상태 차원: {state_dim}, 액션 개수: {num_actions}")

✅ Step 1.1: 'rl_dataset.csv' 로딩 성공! rows=72313
✅ Step 1.2: 데이터 정제 완료! (삭제 0행, 잔여 72313행)
ℹ️ 고유 액션: [0, 1, 2, 3] → 인덱스 매핑 0..3
ℹ️ 상태 차원: 21, 액션 개수: 4


In [5]:
# ==============================================================================
# 2. d3rlpy 데이터셋 생성
# ==============================================================================
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=False, random_state=SEED)
print("✅ Step 2.1: 데이터셋 분리 완료! "
      f"(train={len(train_df)}, test={len(test_df)})")

# 에피소드 경계 표시(단일 에피소드 가정: 마지막만 terminal=1.0)
train_terminals = np.zeros(len(train_df), dtype=np.float32)
train_terminals[-1] = 1.0
test_terminals = np.zeros(len(test_df), dtype=np.float32)
test_terminals[-1] = 1.0

train_obs = np.stack(train_df['STATE'].to_list()).astype(np.float32)
test_obs  = np.stack(test_df['STATE'].to_list()).astype(np.float32)

train_actions = train_df['ACTION_IDX'].values.astype(np.int32)
test_actions  = test_df['ACTION_IDX'].values.astype(np.int32)

train_rewards = train_df['REWARD'].values.astype(np.float32)
test_rewards  = test_df['REWARD'].values.astype(np.float32)

# ✅ 이산 액션 공간/크기 명시
train_dataset = MDPDataset(
    observations=train_obs,
    actions=train_actions,
    rewards=train_rewards,
    terminals=train_terminals,
    action_space=ActionSpace.DISCRETE,
    action_size=num_actions
)
print("✅ Step 2.2: 학습용 d3rlpy 데이터셋 생성 완료!")

test_dataset = MDPDataset(
    observations=test_obs,
    actions=test_actions,
    rewards=test_rewards,
    terminals=test_terminals,
    action_space=ActionSpace.DISCRETE,
    action_size=num_actions
)
print("✅ Step 2.3: 테스트용 d3rlpy 데이터셋 생성 완료!")

✅ Step 2.1: 데이터셋 분리 완료! (train=57850, test=14463)
[2m2025-10-01 10:34.34[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(21,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
✅ Step 2.2: 학습용 d3rlpy 데이터셋 생성 완료!
[2m2025-10-01 10:34.34[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(21,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
✅ Step 2.3: 테스트용 d3rlpy 데이터셋 생성 완료!


In [7]:
# ==============================================================================
# 3. CQL(Discrete) 모델 정의 및 학습
# ==============================================================================
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"\n--- 사용할 연산 장치: {device.upper()} ---")

# ✅ 이산 CQL 구성 (필요 시 하이퍼파라미터 조정)
cql = DiscreteCQLConfig().create(device=device)

# ✅ 최신 방식: Evaluator 객체 사용
td_evaluator = TDErrorEvaluator()  # episodes 미지정 → 학습 데이터셋 기준 평가

print("--- Step 3: CQL(Discrete) 모델 학습 시작 ---")
cql.fit(
    train_dataset,
    n_steps=10_000,
    n_steps_per_epoch=1_000,
    evaluators={'td_error': td_evaluator},
    with_timestamp=False
)  # ← random_state 제거
print("\n✅ Step 3: CQL 모델 학습 완료!")


--- 사용할 연산 장치: CUDA:0 ---
--- Step 3: CQL(Discrete) 모델 학습 시작 ---
[2m2025-10-01 10:35.50[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(21,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=4)[0m
[2m2025-10-01 10:35.50[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2025-10-01 10:35.51[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2025-10-01 10:35.51[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DiscreteCQL[0m
[2m2025-10-01 10:35.51[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [21], 'action_size': 4, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 32, 'gamma': 0.99, 'obse

Epoch 1/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:36.02[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=1 step=1000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000843367338180542, 'time_algorithm_update': 0.00927007532119751, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.010192923784255981, 'td_error': nan}[0m [36mstep[0m=[35m1000[0m
[2m2025-10-01 10:36.02[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_1000.d3[0m


Epoch 2/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:36.10[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=2 step=2000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006532819271087647, 'time_algorithm_update': 0.0063336389064788815, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.00705573296546936, 'td_error': nan}[0m [36mstep[0m=[35m2000[0m
[2m2025-10-01 10:36.10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_2000.d3[0m


Epoch 3/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:36.17[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=3 step=3000[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00061128568649292, 'time_algorithm_update': 0.0056067447662353515, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.006281739234924317, 'td_error': nan}[0m [36mstep[0m=[35m3000[0m
[2m2025-10-01 10:36.17[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_3000.d3[0m


Epoch 4/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:36.26[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=4 step=4000[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006609346866607666, 'time_algorithm_update': 0.007103270769119262, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.007817338943481446, 'td_error': nan}[0m [36mstep[0m=[35m4000[0m
[2m2025-10-01 10:36.26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_4000.d3[0m


Epoch 5/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:36.34[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=5 step=5000[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006096618175506592, 'time_algorithm_update': 0.006669107675552368, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.007326644659042358, 'td_error': nan}[0m [36mstep[0m=[35m5000[0m
[2m2025-10-01 10:36.34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_5000.d3[0m


Epoch 6/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:36.43[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=6 step=6000[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006359171867370606, 'time_algorithm_update': 0.006754778623580933, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.0074475986957550045, 'td_error': nan}[0m [36mstep[0m=[35m6000[0m
[2m2025-10-01 10:36.43[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_6000.d3[0m


Epoch 7/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:36.50[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=7 step=7000[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006105198860168457, 'time_algorithm_update': 0.005802124738693237, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.00646203351020813, 'td_error': nan}[0m [36mstep[0m=[35m7000[0m
[2m2025-10-01 10:36.50[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_7000.d3[0m


Epoch 8/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:36.59[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=8 step=8000[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006413362026214599, 'time_algorithm_update': 0.0067825338840484615, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.0074730064868927006, 'td_error': nan}[0m [36mstep[0m=[35m8000[0m
[2m2025-10-01 10:36.59[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_8000.d3[0m


Epoch 9/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:37.07[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=9 step=9000[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006965227127075195, 'time_algorithm_update': 0.006552992820739746, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.0072979569435119625, 'td_error': nan}[0m [36mstep[0m=[35m9000[0m
[2m2025-10-01 10:37.07[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_9000.d3[0m


Epoch 10/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-10-01 10:37.15[0m [[32m[1minfo     [0m] [1mDiscreteCQL: epoch=10 step=10000[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006782474517822265, 'time_algorithm_update': 0.006672014713287353, 'loss': nan, 'td_loss': nan, 'conservative_loss': nan, 'time_step': 0.007398725986480713, 'td_error': nan}[0m [36mstep[0m=[35m10000[0m
[2m2025-10-01 10:37.15[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL/model_10000.d3[0m

✅ Step 3: CQL 모델 학습 완료!


In [10]:
# ==============================================================================
# 4. 학습된 AI를 이용한 전략 추천
# ==============================================================================
sample_state = train_obs[0]
best_action_idx = int(cql.predict(np.expand_dims(sample_state, axis=0))[0])  # ← 배치 차원 추가

# 원 액션 라벨로 역매핑(가독성)
idx_to_action = {v: k for k, v in action_to_idx.items()}
best_action_label = idx_to_action.get(best_action_idx, best_action_idx)

action_map = {
    0: "전략 0 (배달 축소 & 신규 고객 유치 감소)",
    1: "전략 1 (배달 축소 & 신규 고객 유치 강화)",
    2: "전략 2 (배달 강화 & 신규 고객 유치 감소)",
    3: "전략 3 (배달 강화 & 신규 고객 유치 강화)"
}
readable = action_map.get(best_action_idx, f"액션 {best_action_idx}")

print("\n--- 🤖 AI 전략 추천 ---")
print(f"현재 가게 상태(분석 결과): {np.round(sample_state, 2)}")
print(f"예측 액션 인덱스: {best_action_idx} (원본 라벨: {best_action_label})")
print("\n" + "="*40)
print(f"AI 추천 최적 전략: {readable}")
print("="*40)


--- 🤖 AI 전략 추천 ---
현재 가게 상태(분석 결과): [ 2.40e-01 -1.70e-01 -2.00e-01 -3.20e-01 -1.00e-01 -3.00e-02  3.00e-02
  6.40e-01 -1.00e-01  1.07e+00  9.00e-02  5.60e-01  4.20e-01  2.50e-01
 -1.30e-01  1.70e-01       nan  0.00e+00  1.00e+02  6.00e+00  0.00e+00]
예측 액션 인덱스: 0 (원본 라벨: 0)

AI 추천 최적 전략: 전략 0 (배달 축소 & 신규 고객 유치 감소)


In [11]:
# ==============================================================================
# 5. 최종 모델 저장
# ==============================================================================
cql.save_model(MODEL_SAVE_FILE)
print(f"\n✅ 최종 학습된 AI 모델이 '{MODEL_SAVE_FILE}' 파일로 저장되었습니다.")
print("\n🎉 모든 핵심 AI 개발 과정을 완료했습니다! 🎉")


✅ 최종 학습된 AI 모델이 'cql_model.pt' 파일로 저장되었습니다.

🎉 모든 핵심 AI 개발 과정을 완료했습니다! 🎉
