This Jupyter notebook shows you how to interact with real neuronal networks (static state function).

Before you run this code, set your group_id and password in "auth.py". Ask your mentor for your group ID and password.

In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Add parent directory to path
import sys
from pathlib import Path
current_dir = Path().resolve()
root_dir = current_dir.parent
if str(root_dir) not in sys.path:
    sys.path.insert(0,str(root_dir))

from Gyms.RealNetworkSync import RealNetworkSync
from Algorithms.MultiArmedBandit import MABAgent, MAB_STRATEGIES

In [2]:
# Configuration: sizes of the state and action spaces, and which circuit to use.
# These three values are passed to RealNetworkSync when the environment is created.
state_dim   = 4   # Dimension of the reduced state space
action_dim  = 5   # Number of stimuli in the action space
circuit_id  = 2   # Index of the circuit to use. Each group has 4 biological/simulated circuits, so this must be in {0, 1, 2, 3}

In [3]:
# Create the environment for the selected circuit and reset it to obtain the initial state.
# NOTE(review): TrainingReward is imported here but not referenced in the visible part of
# the notebook — confirm whether the import is still needed (e.g. for a side effect).
from Reward.TrainingReward import TrainingReward
env      = RealNetworkSync(action_dim=action_dim,state_dim=state_dim,circuit_id=circuit_id)
state, _ = env.reset()  # reset() returns a (state, info) pair; the info part is discarded here
env.render() # Prints the current state and reward; both are 0 right after initialization

Host/Port open and accessable
Current state: [0. 0. 0. 0.], Reward: 0


In [12]:
def run_mab_simulation(env, strategies, train_iters=21600, eval_iters=7200,
                       n_actions=125, alpha=0.1):
    """Train and then evaluate one MAB agent per strategy for a fixed number of steps.

    For every strategy the environment is reset, the agent is trained for
    ``train_iters`` steps (select action -> step -> update) and then run for
    ``eval_iters`` further steps without calling ``agent.update``. Actions in
    the evaluation phase are still chosen through ``agent.select_action()``,
    so whatever selection rule the agent implements keeps being applied.

    Args:
        env: Environment whose ``step(action)`` returns a 5-tuple with the
            reward in second position.
        strategies: Iterable of strategy names accepted by ``MABAgent``.
        train_iters: Number of training steps per strategy.
        eval_iters: Number of evaluation steps per strategy.
        n_actions: Size of the agent's action set (previously hard-coded to 125).
        alpha: Step-size passed to ``MABAgent`` (previously hard-coded to 0.1).

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(train_df, eval_df)`` with one
        column per strategy and one row per step, holding the received rewards.
    """
    # Materialise once: the collection is iterated several times below.
    strategies = list(strategies)

    # Initialize agents and storage
    agents = {s: MABAgent(strategy=s, n_actions=n_actions, alpha=alpha) for s in strategies}
    results = {
        'train': {s: [] for s in strategies},
        'eval': {s: [] for s in strategies},
    }

    # Run training and evaluation for each strategy
    for strategy in strategies:
        env.reset()  # the returned (state, info) is not needed by a bandit agent
        agent = agents[strategy]

        # Training phase: the agent learns from every received reward
        for _ in tqdm(range(train_iters), desc=f"Training {strategy}"):
            action_idx = agent.select_action()
            _obs, reward, _terminated, _truncated, _info = env.step(agent.action_map(action_idx))
            agent.update(action_idx, reward)
            results['train'][strategy].append(reward)

        # Evaluation phase (no updates)
        for _ in tqdm(range(eval_iters), desc=f"Evaluating {strategy}"):
            action_idx = agent.select_action()
            _obs, reward, _terminated, _truncated, _info = env.step(agent.action_map(action_idx))
            results['eval'][strategy].append(reward)

    return pd.DataFrame(results['train']), pd.DataFrame(results['eval'])

def pad_lists(d):
    """Pad every sequence in ``d`` with NaN so that all of them have equal length.

    Args:
        d: Mapping from key to a list (or other sequence) of values.

    Returns:
        dict: A new dict with the same keys. Each value is a new list made of
        the original items followed by as many ``np.nan`` entries as needed
        to reach the length of the longest input sequence. The input mapping
        and its lists are not modified. An empty mapping yields ``{}``.
    """
    # default=0 keeps an empty mapping from raising "max() arg is an empty sequence".
    max_len = max((len(lst) for lst in d.values()), default=0)
    return {k: list(lst) + [np.nan] * (max_len - len(lst)) for k, lst in d.items()}

def _describe_agent_params(agent):
    """Return the strategy-specific hyperparameters of ``agent`` as display strings."""
    params = []
    if agent.strategy in ['epsilon_greedy', 'hybrid']:
        params.append(f"ε={agent.epsilon:.3f}")
    if agent.strategy in ['boltzmann', 'hybrid']:
        params.append(f"T={agent.temperature:.3f}")
    if agent.strategy == 'ucb':
        params.append(f"β={agent.ucb_beta:.1f}")
    if agent.strategy == 'thompson':
        explored = np.sum(agent.action_counts > 0)
        params.append(f"Explored {explored}/{agent.n_actions}")
    return params


def _save_partial_results(results, strategy, results_dir):
    """Write all rewards collected so far to ``train_<strategy>.csv`` / ``eval_<strategy>.csv``."""
    # Path comes from the notebook's import cell. Create the directory first:
    # previously a missing directory made every periodic save fail.
    out_dir = Path(results_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(pad_lists(results['train'])).to_csv(out_dir / f"train_{strategy}.csv")
    pd.DataFrame(pad_lists(results['eval'])).to_csv(out_dir / f"eval_{strategy}.csv")


def run_mab_simulation_full(env, strategies, update_interval=1800,
                            train_steps=21600, eval_steps=7200,
                            n_actions=25, alpha=0.1, results_dir="results"):
    """Run MAB simulation with performance updates and stim_id handling.

    For each strategy a fresh ``MABAgent`` is created and the environment is
    reset. The agent then acts until the environment reports
    ``info['stim_id'] == 0`` (logged as "NETWORK RESET") or sets
    ``terminated``/``truncated``. The run length is therefore decided by the
    environment; ``train_steps + eval_steps`` is only used as the progress
    bar total.

    The phase of a step is derived from the ``stim_id`` returned by the
    environment: ``stim_id < train_steps`` is training (the agent is updated),
    anything else is evaluation (no update).

    Every time ``stim_id`` is a multiple of ``update_interval`` a summary of
    the last ``update_interval`` rewards of the current phase is written via
    ``tqdm.write`` and the results gathered so far are saved as CSV files in
    ``results_dir``. Reporting is best effort: a failure there is logged and
    the simulation continues.

    Args:
        env: Environment whose ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)`` with an integer
            ``info['stim_id']``.
        strategies: Iterable of strategy names accepted by ``MABAgent``.
        update_interval: Reporting/saving period, in units of ``stim_id``.
        train_steps: First ``stim_id`` that counts as evaluation.
        eval_steps: Expected number of evaluation steps (progress bar only).
        n_actions: Size of the agent's action set.
        alpha: Step-size passed to ``MABAgent``.
        results_dir: Directory for the periodic CSV snapshots; created if missing.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(train_df, eval_df)`` with one
        column per strategy; shorter columns are padded with NaN.
    """
    strategies = list(strategies)  # iterated more than once below
    results = {
        'train': {s: [] for s in strategies},
        'eval': {s: [] for s in strategies},
    }

    total_steps = train_steps + eval_steps  # Total steps per strategy

    for strategy in strategies:
        agent = MABAgent(strategy=strategy, n_actions=n_actions, alpha=alpha)
        env.reset()
        stim_id = 1  # dummy value so the loop is entered

        with tqdm(total=total_steps, desc=f"Running {strategy}") as pbar:
            while stim_id > 0:
                action_idx = agent.select_action()
                action = agent.action_map(action_idx)

                _state, reward, terminated, truncated, info = env.step(action)
                stim_id = info['stim_id']

                if stim_id == 0:
                    tqdm.write("NETWORK RESET")
                    break

                # Phase determination and logging
                phase = 'train' if stim_id < train_steps else 'eval'
                results[phase][strategy].append(reward)

                # Agent updates only during training phase
                if phase == 'train':
                    agent.update(action_idx, reward)

                # Periodic performance updates and saving (best effort)
                if stim_id % update_interval == 0:
                    try:
                        window = results[phase][strategy][-update_interval:]

                        # Calculate metrics
                        avg_reward = np.mean(window) if window else 0
                        cum_reward = np.sum(window) if window else 0

                        # Construct update message
                        update_msg = (
                            f"{strategy} @ {stim_id} ({phase.upper()}): "
                            f"Avg={avg_reward:.2f} | Cum={cum_reward:.0f} | "
                            f"{', '.join(_describe_agent_params(agent))}"
                        )
                        tqdm.write(update_msg)

                        # Save results to CSV files periodically
                        _save_partial_results(results, strategy, results_dir)
                    except Exception as e:
                        # Do not abort a long run because reporting failed, and
                        # do not skip the bookkeeping below either (the former
                        # `continue` bypassed the progress update and the
                        # terminated/truncated check).
                        tqdm.write(f"Progress report failed at stim_id {stim_id}: {e}")

                pbar.update(1)

                if terminated or truncated:
                    break

    return pd.DataFrame(pad_lists(results['train'])), pd.DataFrame(pad_lists(results['eval']))


In [None]:
# Run the full experiment for every strategy in MAB_STRATEGIES and store the
# per-step rewards (one column per strategy) as CSV files in the working directory.
# This is a long-running cell: the progress output shows a few iterations per second.
train_df, eval_df = run_mab_simulation_full(env, MAB_STRATEGIES)
train_df.to_csv('full_train_results.csv', index=False)
eval_df.to_csv('full_eval_results.csv', index=False)

Running epsilon_greedy:   4%|▍         | 1252/28800 [05:12<2:05:25,  3.66it/s]

epsilon_greedy @ 5400 (TRAIN): Avg=0.92 | Cum=1148 | ε=0.576


Running epsilon_greedy:  11%|█         | 3052/28800 [12:43<1:49:04,  3.93it/s]

epsilon_greedy @ 7200 (TRAIN): Avg=3.00 | Cum=5400 | ε=0.280


Running epsilon_greedy:  13%|█▎        | 3636/28800 [15:09<1:44:49,  4.00it/s]

In [None]:
# Short fixed-length run (1,000 training and 100 evaluation steps per strategy).
# Note: this rebinds train_df / eval_df, so cells further down operate on these
# results rather than on the output of run_mab_simulation_full.
train_df, eval_df = run_mab_simulation(env, MAB_STRATEGIES, train_iters=1_000, eval_iters=100)
train_df.to_csv('train_results.csv', index=False)
eval_df.to_csv('eval_results.csv', index=False)

In [None]:
def analyze_results(train_df, eval_df):
    """Summarise the per-step rewards of the training and evaluation phases.

    Args:
        train_df: DataFrame of training rewards, one column per strategy.
        eval_df: DataFrame of evaluation rewards, one column per strategy.

    Returns:
        dict with three entries:
            'training'           -- ``train_df.describe()`` transposed (one row per column)
            'evaluation'         -- ``eval_df.describe()`` transposed
            'cumulative_rewards' -- ``{'train': ..., 'eval': ...}`` holding the
                                    column-wise cumulative sums of both frames
    """
    frames_by_phase = {'train': train_df, 'eval': eval_df}
    running_totals = {
        phase: frame.cumsum() for phase, frame in frames_by_phase.items()
    }

    training_summary = train_df.describe().T
    evaluation_summary = eval_df.describe().T

    return {
        'training': training_summary,
        'evaluation': evaluation_summary,
        'cumulative_rewards': running_totals,
    }

# Summary statistics and cumulative rewards for the current train_df / eval_df.
results_analysis = analyze_results(train_df, eval_df)

In [None]:
def plot_cumulative_rewards(results_analysis):
    """Plot training and evaluation rewards with clear phase separation.

    Training curves are drawn against the index of the training frame. The
    evaluation curves are shifted so that they start right after the last
    training index, and the evaluation region is shaded.

    Args:
        results_analysis: Dict as returned by ``analyze_results``; only
            ``results_analysis['cumulative_rewards']['train']`` and
            ``['eval']`` (DataFrames with one column per strategy) are read.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Extract cumulative rewards from analysis
    train_cum = results_analysis['cumulative_rewards']['train']
    eval_cum = results_analysis['cumulative_rewards']['eval']

    fig, ax = plt.subplots(figsize=(14, 8))

    # Plot training curves
    for strategy in train_cum.columns:
        ax.plot(
            train_cum.index,
            train_cum[strategy],
            label=f'{strategy} (train)',
            alpha=0.6,
            linewidth=1.5,
        )

    # Evaluation starts one step after the last training index; fall back to 0
    # when there is no training data instead of failing on index[-1].
    eval_start = train_cum.index[-1] + 1 if len(train_cum.index) else 0

    # Plot evaluation curves
    for strategy in eval_cum.columns:
        ax.plot(
            eval_cum.index + eval_start,
            eval_cum[strategy],
            label=f'{strategy} (eval)',
            linestyle='--',
            linewidth=2.5,
        )

    # Formatting
    ax.axvline(eval_start, color='gray', linestyle=':', label='Train/Eval Boundary')
    # axvspan shades the full height of the axes. The previous fill_betweenx
    # band was limited to the y-range of the training data, so evaluation
    # curves outside that range were not covered.
    ax.axvspan(eval_start, eval_start + len(eval_cum), color='gray', alpha=0.1)

    ax.set_title('Strategy Performance Across Training and Evaluation Phases')
    ax.set_xlabel('Simulation Step (stim_id)')
    ax.set_ylabel('Cumulative Reward')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

# Plot the cumulative rewards stored in the results_analysis dict.
plot_cumulative_rewards(results_analysis)

In [None]:
# Example initialization of a MAB agent
agent = MABAgent(
    epsilon=0.9,            # Initial exploration rate (90% random actions)
    alpha=0.1,              # Learning rate (constant step-size)
    initial_q=0.0,          # Initial value estimate (presumably applied to every action; see MABAgent)
    n_actions=125,          # Number of actions, should be a power of 5
)

# Example code that stimulates the network 100 times with an action chosen by
# the agent, updates the agent with each reward, and reports the average
# reward received at the end.

total_reward = 0
action_count = 0

rewards_over_time = []

for _ in range(100):
    action_idx = agent.select_action()
    action = agent.action_map(action_idx)
    print(f"Stimulate with action: {action}")

    state, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    action_count += 1

    rewards_over_time.append(reward)

    agent.update(action_idx, reward)

    # Print step information
    print(f"Info: {info}")
    print(f"State: {state}, Reward: {reward}")

    print("-----------------------------")

# The running totals were previously accumulated but never used; report the average.
if action_count:
    print(f"Average reward over {action_count} stimulations: {total_reward / action_count:.3f}")