# TD LEARNING

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm

In [None]:
def td_leaning(alpha, stimuli, rewards, n_trials):
        """
    Temporal Difference learning 

    Args:
        alpha (float): Learning rate (0 <= alpha <= 1).
        stimuli (list): A list of positions (state coordinates) visited during the trajectory.
        rewards (float): Discount factor (0 <= gamma < 1).
        n_trials (): Number of trials
    
    Returns:
        numpy array: Updated successor representation.
    """
        
    V = np.zeros(len(stimuli))  # Initialize value array for time points
    w = np.zeros(len(stimuli))  # Initialize weight for each time point
    delta = np.zeros([n_trials, len(stimuli)])  # Initialize 2D array for trial x time
    delta_v = np.zeros(len(stimuli))  # Initialize array for delta v

    # Temporal difference learning: update predictions for each trial
    for n in tqdm(range(n_trials)):
        for t in range(1, len(stimuli)):
            # Compute the value at time t using weighted sum of previous stimuli
            V[t] = np.sum(w[:t] * stimuli[t-1::-1])  # Value computation (weighted sum of previous stimuli)

            # Compute delta_v (prediction difference)
            delta_v[t-1] = V[t] - V[t-1]

            # Compute delta (TD error) for each time step
            delta[n, t] = rewards[t] + delta_v[t]

            # Update weights using the learning rule
            for tau in range(t):
                w[tau] += learning_rate * delta[n, t] * stimuli[t - tau]

    return w, V, delta_v, delta

In [16]:
# Define parameters for the TD model
num_trials = np.arange(100)
time_steps = np.arange(300)
reward_time = 200
stimulus_time = 100
learning_rate = 0.1

v_before = np.zeros(len(time_steps)) # value array
delta_v_before = np.zeros(len(time_steps)) # prediction error array

# set stimulus to 1 at time step 100
u = np.zeros((len(time_steps), 1)) # stimuli array
u[100] = 1

# set reward at time step 200
r = np.zeros((len(time_steps), 1)) # reward array

# now we fit a gaussian curve to the reward
sigma = 5  # Standard deviation for Gaussian
r = gaussian_filter1d(r, sigma)

# Normalize to sum to 2
r *= 2 / np.sum(r)

  r *= 2 / np.sum(r)
  r *= 2 / np.sum(r)


In [17]:
# call the temporal difference learning function
W_after, V_after, delta_v_after, delta_weight_after = td_leaning(learning_rate, u, r, num_trials)

TypeError: only integer scalar arrays can be converted to a scalar index