In [1]:
import os
import torch
from datetime import datetime
import numpy as np
from multiprocessing import Pool, cpu_count
from bandit_task import TwoArmedBandit
from model import RNNActorCritic
from episode import collect_bandit_trajectory
from reptile_bandit_training import *

# Candidate locations of the LearningToLearn directory, one per platform
# (as indicated by the variable names).
base_unix = "/storage1/fs1/shinung/Active/jackosvky/LearningToLearn"
base_windows = "//storage1.ris.wustl.edu/shinung/Active/jackosvky/LearningToLearn"
base_mac = "/Volumes/shinung/Active/jackosvky/LearningToLearn"

# Use the first candidate that exists; precedence is unix, then windows, then mac.
base = next(
    (path for path in (base_unix, base_windows, base_mac) if os.path.exists(path)),
    None,
)
if base is None:
    # No candidate is reachable: fall back to wherever the notebook was started.
    base = os.getcwd()
    print(f"None of the base paths are available. Falling back to the current directory: {base}")

# Name of the timestamped run folder to work in.
current_time = "2026-02-17_21-48-26"
new_folder = os.path.join(base, current_time)

# The folder is not created here: the next cell loads "reptile_bandit_model.pth"
# from it through a relative path, so it has to exist already. Fail early with an
# explicit message instead of a bare chdir error.
if not os.path.isdir(new_folder):
    raise FileNotFoundError(
        f"Run folder not found: {new_folder}. "
        "It must already exist and contain the saved model."
    )
os.chdir(new_folder)

In [None]:
# Rebuild the network and restore its saved weights (no training happens in this cell).
model = RNNActorCritic(input_size=3, hidden_size=32, action_size=2)  # input = 2 (action) + 1 (reward)
# map_location="cpu" allows a checkpoint that was saved from a GPU to be read on a
# machine without one; load_state_dict then copies the tensors into the model's
# existing parameters.
model.load_state_dict(
    torch.load("reptile_bandit_model.pth", map_location="cpu", weights_only=True)
)

# Evaluate the restored model on a range of bandit reward probabilities.
test_p_values = [0.1, 0.3, 0.5, 0.7, 0.9]
test_results = test_model_performance(
    model,
    p_values=test_p_values,
    n_test_episodes=10,
    n_rounds=50,
    inner_steps=0,  # Test zero-shot performance (set to 5 for few-shot adaptation)
    inner_lr=0.02,
    return_trajectories=True
)



TESTING MODEL PERFORMANCE

p = 0.10 (Optimal arm: 1, Expected reward: 0.90)
  Avg Reward: 5.50 ± 2.06 (out of 50)
  Avg Optimal Arm Selection: 1.8% ± 1.9%
  Efficiency: 12.2%

p = 0.30 (Optimal arm: 1, Expected reward: 0.70)
  Avg Reward: 15.80 ± 3.97 (out of 50)
  Avg Optimal Arm Selection: 1.4% ± 1.3%
  Efficiency: 45.1%

p = 0.50 (Optimal arm: 0, Expected reward: 0.50)
  Avg Reward: 24.20 ± 4.19 (out of 50)
  Avg Optimal Arm Selection: 97.6% ± 2.0%
  Efficiency: 96.8%

p = 0.70 (Optimal arm: 0, Expected reward: 0.70)
  Avg Reward: 35.40 ± 4.20 (out of 50)
  Avg Optimal Arm Selection: 99.4% ± 0.9%
  Efficiency: 101.1%

p = 0.90 (Optimal arm: 0, Expected reward: 0.90)
  Avg Reward: 44.00 ± 2.00 (out of 50)
  Avg Optimal Arm Selection: 98.8% ± 1.3%
  Efficiency: 97.8%

SUMMARY
Overall Average Reward: 24.98
Overall Optimal Arm Selection: 59.8%

