# Environment Setup

In [None]:
# Pretty standard stuff here: create the project directory and move into it.
# NOTE(review): in a notebook, `!cd` and `!source` run in a throwaway subshell
# and do not persist; presumably these steps were typed in a terminal - confirm.

!mkdir PongReinforcementLearning
!cd PongReinforcementLearning

# Then, I set up a virtual environment (venv)
python -m venv PongReinforcementLearningVENV
!source PongReinforcementLearningVENV/bin/activate

# Make the venv recognizable to Jupyter Notebooks.
# This is the bridge that connects Jupyter to my isolated Python environment.
# The `python -m ipykernel install` command below needs the ipykernel package.
%pip install ipykernel
python -m ipykernel install --user --name=PongReinforcementLearningVENV

# Time to fire up Jupyter Notebook.
# Make sure to select the new venv as the Python interpreter.
jupyter notebook

# Finally, installing some libs. I usually do these via the console, but Jupyter's %pip magic usually works just fine.
%pip install pygame
%pip install numpy
%pip install pandas
%pip install tabulate
%pip install torch torchvision
%pip install matplotlib

# Pong

In [4]:
import pygame
import random
import numpy as np  
import pickle
import os
import math
import logging
import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

class QNetwork_00(nn.Module):
    """Small fully connected Q-network: input_dim -> 64 -> 32 -> output_dim.

    Both hidden layers are followed by ReLU; the output layer is linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        first_width, second_width = 64, 32
        self.fc1 = nn.Linear(input_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, output_dim)

    def forward(self, x):
        """Run x through the hidden layers and return the linear output."""
        for hidden_layer in (self.fc1, self.fc2):
            x = torch.relu(hidden_layer(x))
        return self.fc3(x)

    
class QNetwork_01(nn.Module):
    """Fully connected Q-network: input_dim -> 128 -> 64 -> output_dim.

    ReLU is applied after each hidden layer; the final layer is linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=output_dim)

    def forward(self, x):
        """Return the network output for input tensor x."""
        hidden_one = torch.relu(self.fc1(x))
        hidden_two = torch.relu(self.fc2(hidden_one))
        return self.fc3(hidden_two)

class QNetwork_02(nn.Module):
    """Fully connected Q-network: input_dim -> 256 -> 128 -> 64 -> output_dim.

    The three hidden layers use ReLU; the output layer is linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        widths = (256, 128, 64)
        self.fc1 = nn.Linear(input_dim, widths[0])
        self.fc2 = nn.Linear(widths[0], widths[1])
        self.fc3 = nn.Linear(widths[1], widths[2])
        self.fc4 = nn.Linear(widths[2], output_dim)

    def forward(self, x):
        """Apply the hidden stack with ReLU, then the linear output layer."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = torch.relu(hidden_layer(x))
        return self.fc4(x)
    
class QNetwork_03(nn.Module):
    """Fully connected Q-network: input_dim -> 512 -> 256 -> 128 -> 64 -> output_dim.

    The four hidden layers use ReLU; the output layer (fc5) is linear.
    """

    def __init__(self, input_dim, output_dim):
        super(QNetwork_03, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return a tensor whose last dimension is output_dim."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        # fc4 is a hidden layer; previously it was returned directly, which
        # produced 64 outputs and left fc5 unused.
        x = torch.relu(self.fc4(x))
        return self.fc5(x)
    
class QNetwork_04(nn.Module):
    """Fully connected Q-network: input_dim -> 1024 -> 512 -> 256 -> 128 -> 64 -> output_dim.

    The five hidden layers use ReLU; the output layer (fc6) is linear.
    """

    def __init__(self, input_dim, output_dim):
        super(QNetwork_04, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return a tensor whose last dimension is output_dim."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        # fc4 and fc5 are hidden layers; previously fc4 was returned directly,
        # which produced 128 outputs and left fc5/fc6 unused.
        x = torch.relu(self.fc4(x))
        x = torch.relu(self.fc5(x))
        return self.fc6(x)
    
class QNetwork_05(nn.Module):
    """Compact fully connected Q-network: input_dim -> 32 -> 16 -> output_dim.

    ReLU follows each hidden layer; the output layer is linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        wide, narrow = 32, 16
        self.fc1 = nn.Linear(input_dim, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, output_dim)

    def forward(self, x):
        """Return the network output for input tensor x."""
        activated = torch.relu(self.fc2(torch.relu(self.fc1(x))))
        return self.fc3(activated)
    
class QNetwork_06(nn.Module):
    """Tiny fully connected Q-network: input_dim -> 16 -> 8 -> output_dim.

    ReLU follows each hidden layer; the output layer (fc3) is linear.
    """

    def __init__(self, input_dim, output_dim):
        super(QNetwork_06, self).__init__()
        self.fc1 = nn.Linear(input_dim, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, output_dim)

    def forward(self, x):
        """Return a tensor whose last dimension is output_dim."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # Only three layers exist: the earlier version called a non-existent
        # self.fc4 and fed fc3's output back into fc3.
        return self.fc3(x)


# Convert an input coordinate to discrete grid space. This smaller grid space should make learning easier.
def discretize_grid(coordinate, game_board_grid_size):
    """Return the absolute value of coordinate floor-divided by game_board_grid_size."""
    cell_index = coordinate // game_board_grid_size
    return abs(cell_index)

def discretize_paddle_step(coordinate, paddle_step_size):
    """Return the absolute value of coordinate floor-divided by paddle_step_size."""
    step_index = coordinate // paddle_step_size
    return abs(step_index)

#Convert velocity into discretized space (of only 4 options!)
def discretize_velocity(velocity_x, velocity_y):
    """Map a velocity vector to one of four direction codes (0-3).

    The code is ``2 * (velocity_x < 0) + (velocity_y < 0)``:

        0: x >= 0, y >= 0
        1: x >= 0, y <  0
        2: x <  0, y >= 0
        3: x <  0, y <  0

    For non-zero components this matches the previous if/elif chain exactly.
    A zero component is now treated as non-negative, so the function always
    returns an int; previously it fell through and returned None, which
    cannot be packed into the state tensor.

    NOTE(review): the old labels called positive velocity_y "Up". pygame's
    y axis grows downwards, so positive velocity_y presumably moves the ball
    down the screen - confirm before relying on the labels.
    """
    moving_left = velocity_x < 0
    y_negative = velocity_y < 0
    return int(2 * moving_left + y_negative)
    
def calculate_reward(y_position, reward_for_hitting_ball, height):
    """Return a Gaussian-shaped reward that peaks when y_position equals height / 2.

    The reward ranges from reward_for_hitting_ball (far from the centre) up
    to 50 * reward_for_hitting_ball (exactly at the centre); the bell curve
    has a standard deviation of height / 4.
    """
    centre = height / 2
    spread = height / 4
    floor_reward = reward_for_hitting_ball
    peak_reward = 50 * reward_for_hitting_ball
    # Unnormalised Gaussian: 1.0 at the centre, falling towards 0 away from it.
    falloff = np.exp(-((y_position - centre) ** 2) / (2 * spread ** 2))
    return floor_reward + (peak_reward - floor_reward) * falloff
    
# Visualization of the neural network's current recommendation for various paddle positions
def draw_q_values_bar(window, model, game_board_grid_size, ball_pos, ball_velocity):
    """Draw a column at the left edge showing the model's preferred action per row.

    For every row of height game_board_grid_size the model is queried with a
    hypothetical paddle at that row (ball position and velocity taken from
    the arguments). The row is coloured by the arg-max action:
    red = action 0 ('U'), blue = action 1 ('D'), black = anything else ('S').
    When rows are at least 20 px tall the action letter and its Q-value are
    printed on top.

    Args:
        window: pygame surface to draw onto.
        model: callable network mapping a (1, 4) float tensor to Q-values.
        game_board_grid_size: row height in pixels, also used to discretize.
        ball_pos: [x, y] ball position in pixels.
        ball_velocity: [vx, vy] ball velocity.

    NOTE(review): play_de_game builds the paddle component of the state with
    discretize_paddle_step(..., paddle_step_size), whereas this preview uses
    discretize_grid(..., game_board_grid_size). When the two sizes differ the
    rows shown here do not line up with the states the model is trained on.
    """
    bar_x = 0  # Starting x-coordinate of the bar
    bar_width = 50  # Width of the bar
    opacity = 255  # Rows are drawn fully opaque

    # action index -> (RGB colour, label); any other action means "stay still"
    action_styles = {
        0: ([255, 0, 0], 'U'),  # Red for Move Up
        1: ([0, 0, 255], 'D'),  # Blue for Move Down
    }
    stay_style = ([0, 0, 0], 'S')  # Black for Stay Still

    discretized_ball_x = discretize_grid(ball_pos[0], game_board_grid_size)
    discretized_ball_y = discretize_grid(ball_pos[1], game_board_grid_size)
    discretized_velocity = discretize_velocity(ball_velocity[0], ball_velocity[1])

    # Build the label font once per call instead of once per row.
    show_labels = game_board_grid_size >= 20
    font = pygame.font.SysFont(None, 16) if show_labels else None

    for y in range(0, window.get_height(), game_board_grid_size):
        discretized_fake_paddle_y = discretize_grid(y, game_board_grid_size)

        # Will have to update this any time I change the definition of state
        state = (discretized_fake_paddle_y, discretized_ball_x, discretized_ball_y, discretized_velocity)
        state_tensor = torch.FloatTensor([state])

        with torch.no_grad():
            q_values_output = model(state_tensor)
        # Drop the batch dimension when the model returns one.
        q_values = q_values_output[0] if len(q_values_output.shape) > 1 else q_values_output

        # Find the action with the highest Q-value and its value
        max_action = torch.argmax(q_values).item()
        max_q_value = q_values[max_action].item()
        color, display_action = action_styles.get(max_action, stay_style)

        # Create a new surface for the rectangle and fill it exactly once.
        rect_surface = pygame.Surface((bar_width, game_board_grid_size), pygame.SRCALPHA)
        try:
            rect_surface.fill(tuple(color + [opacity]))
        except ValueError as e:
            # Report the bad colour and skip this row. The earlier version
            # repeated the fill after this handler, which re-raised anyway.
            print(f"Error: {e}")
            print(f"Debug Info - Color: {color}, Opacity: {opacity}")
            continue

        # Blit the new surface onto the window
        window.blit(rect_surface, (bar_x, y))

        # Display the strength of the q-value
        if show_labels:
            q_value_display = font.render(f"{display_action}:{max_q_value:.9g}", True, (255, 255, 255))
            window.blit(q_value_display, (5, y + game_board_grid_size // 2))

#Main Pong game function, accepts key parameters as inputs now
def play_de_game(which_neural_net, episodes_to_run, update_screen, watch_every_x_episodes, alpha, gamma, epsilon, epsilon_min, epsilon_decay, game_board_grid_size, paddle_step_size, reward_for_winning_episode, punishment_for_losing_episode, reward_for_hitting_ball, width, height, backpropagation_state_buffer_size):
    """Train the left paddle with online Q-learning against a scripted right paddle.

    Reads the module-level globals ``window``, ``paddle_width``,
    ``paddle_height``, ``ball_radius`` and ``action_space``.

    Args:
        which_neural_net: 1-5 selects QNetwork_01..QNetwork_05; any other
            value falls back to QNetwork_00.
        episodes_to_run: when > 0, the loop stops once more than this many
            episodes have finished; otherwise it runs until the window closes.
        update_screen: whether to render frames (toggled at runtime with V).
        watch_every_x_episodes: render only episodes whose index is a
            multiple of this value.
        alpha: echoed into results['config'] only; the optimizer's learning
            rate is fixed at 0.1 below.
        gamma: discount factor applied to the bootstrapped next-state value.
        epsilon, epsilon_min, epsilon_decay: exploration rate, multiplied by
            epsilon_decay after each episode while it is above epsilon_min.
        game_board_grid_size: cell size used to discretize the ball position.
        paddle_step_size: paddle movement per frame, also used to discretize
            the paddle position.
        reward_for_winning_episode, punishment_for_losing_episode,
        reward_for_hitting_ball: reward shaping values.
        width, height: playfield size in pixels.
        backpropagation_state_buffer_size: capacity of the recent-state buffers.

    Returns:
        (results, user_quit, left_agent_model) where results holds the config
        and the recorded metrics, and user_quit is True when the window was
        closed.
    """
    # Instantiate the neural net; ids without an entry use QNetwork_00.
    input_dim = 4   # state = (paddle_y, ball_x, ball_y, velocity code)
    output_dim = 3  # one Q-value per action
    network_classes = {
        1: QNetwork_01,
        2: QNetwork_02,
        3: QNetwork_03,
        4: QNetwork_04,
        5: QNetwork_05,
    }
    left_agent_model = network_classes.get(which_neural_net, QNetwork_00)(input_dim, output_dim)

    # Instantiate optimizer and loss
    left_optimizer = optim.Adam(left_agent_model.parameters(), lr=0.1)  # default lr=0.001
    loss_fn = nn.SmoothL1Loss()

    # Rolling windows of the most recent states / next states. They are kept
    # for buffer-replay experiments; the simple update below does not read
    # them. deque(maxlen=...) evicts the oldest entry in O(1).
    backpropagation_state_buffer_left = deque(maxlen=backpropagation_state_buffer_size)
    backpropagation_new_state_buffer_left = deque(maxlen=backpropagation_state_buffer_size)

    # Key Results
    KR_max_episode_length = 0
    KR_avg_episode_length_every_100_episodes = {}
    KR_avg_epsilon_every_100_episodes = {}

    # Initialize scores
    left_score = 0
    right_score = 0

    # Paddles start at a random height; after each episode they reset to the middle.
    left_paddle_pos = [50, random.randint(0, height - paddle_height)]
    right_paddle_pos = [width - 50 - paddle_width, random.randint(0, height - paddle_height)]

    # Initial ball position and velocity
    ball_pos = [width // 2, height // 2]
    ball_velocity = [random.choice([-4, 4]), random.choice([-4, 4])]

    # Initialize episode metrics
    episode_count = 0
    this_episode_length = 0
    last_100_episode_lengths = []
    last_100_epsilons = []

    # State observed at the end of the previous loop, and the last action
    state_left = (0, 0, 0, 0)
    action_left = 0

    # Init exploration vars
    exploration_direction = None
    exploration_length = 0

    run = True
    user_quit = False
    user_paused = False
    hud_font = None  # built on the first rendered frame, then reused
    while run:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                run = False
                user_quit = True

        # V toggles rendering, P toggles pause; the short delay debounces the key.
        keys = pygame.key.get_pressed()
        if keys[pygame.K_v]:
            update_screen = not update_screen
            pygame.time.delay(100)
        if keys[pygame.K_p]:
            user_paused = not user_paused
            pygame.time.delay(100)

        # If the game is paused
        if user_paused:
            continue

        # Track game loops in this episode to get a sense of how long a game lasts
        this_episode_length += 1

        # Reset reward to 0 at the beginning of each pass through the game loop
        reward_left = 0

        # Record the current state in the rolling buffer
        backpropagation_state_buffer_left.append(state_left)

        # Convert state to a tensor and get Q-values from the neural network
        state_left_tensor = torch.FloatTensor([state_left])
        with torch.no_grad():
            q_values_left = left_agent_model(state_left_tensor)

        # Epsilon-greedy with "sticky" exploration: a random action is held
        # for several frames so the paddle can actually travel somewhere.
        if np.random.rand() >= epsilon:
            action_left = torch.argmax(q_values_left).item()
            exploration_direction = None  # Reset exploration
        else:
            if exploration_direction is None or exploration_length == 0:
                # Start a new exploration
                exploration_direction = np.random.choice(action_space)
                # Random length between 5 and a number that lets the paddle traverse the grid
                exploration_length = np.random.randint(5, height // game_board_grid_size)
            else:
                # Continue existing exploration
                exploration_length -= 1
            action_left = exploration_direction

        # Scripted control of the right paddle: chase the ball
        paddle_center_offset = paddle_height / 2
        ideal_paddle_y = ball_pos[1] - paddle_center_offset
        right_paddle_center = right_paddle_pos[1] + paddle_center_offset
        if ideal_paddle_y > right_paddle_center and right_paddle_pos[1] < height - paddle_height:
            right_paddle_pos[1] += paddle_step_size  # Move Down
        elif ideal_paddle_y < right_paddle_center and right_paddle_pos[1] > 0:
            right_paddle_pos[1] -= paddle_step_size  # Move Up

        # Left AI agent moves the paddle (any other action means stay still)
        if action_left == 0 and left_paddle_pos[1] > 0:  # Move Up
            left_paddle_pos[1] -= paddle_step_size
        elif action_left == 1 and left_paddle_pos[1] < height - paddle_height:  # Move Down
            left_paddle_pos[1] += paddle_step_size

        # Update ball position
        ball_pos[0] += ball_velocity[0]
        ball_pos[1] += ball_velocity[1]

        # Collision detection with walls
        if ball_pos[1] <= 0 or ball_pos[1] >= height:
            ball_velocity[1] = -ball_velocity[1]

        # Collision detection with paddles
        collision_offset = 5  # Push the ball away from the paddle after a hit
        if (left_paddle_pos[0] <= ball_pos[0] <= left_paddle_pos[0] + paddle_width and
                left_paddle_pos[1] <= ball_pos[1] <= left_paddle_pos[1] + paddle_height):
            ball_velocity[0] = -ball_velocity[0]
            ball_pos[0] += collision_offset
            # NOTE(review): calculate_reward receives a discretized paddle
            # index together with the pixel height - confirm the intended units.
            reward_left += calculate_reward(discretize_paddle_step(left_paddle_pos[1], paddle_step_size), reward_for_hitting_ball, height)
        elif (right_paddle_pos[0] <= ball_pos[0] <= right_paddle_pos[0] + paddle_width and
              right_paddle_pos[1] <= ball_pos[1] <= right_paddle_pos[1] + paddle_height):
            ball_velocity[0] = -ball_velocity[0]
            ball_pos[0] -= collision_offset

        # Scoring and game-over condition
        episode_just_ended = False
        if ball_pos[0] < 0:
            episode_just_ended = True
            right_score += 1  # Right player scores
            reward_left += punishment_for_losing_episode  # Punishment for the left agent
        elif ball_pos[0] > width:
            episode_just_ended = True
            left_score += 1  # Left player scores
            reward_left += reward_for_winning_episode

        # All the end-of-episode stuff
        if episode_just_ended:
            # Record some key metrics to help optimize later
            if this_episode_length > KR_max_episode_length:
                KR_max_episode_length = this_episode_length
            last_100_episode_lengths.append(this_episode_length)
            if episode_count % 100 == 0:  # Only record avg episode length every 100 eps
                last_100_episode_avg = sum(last_100_episode_lengths) / len(last_100_episode_lengths)
                KR_avg_episode_length_every_100_episodes[episode_count] = last_100_episode_avg
                last_100_episode_lengths = []  # and reset the rolling history
            last_100_epsilons.append(epsilon)
            if episode_count % 100 == 0:  # Only record avg epsilons every 100 eps
                last_100_epsilon_avg = sum(last_100_epsilons) / len(last_100_epsilons)
                KR_avg_epsilon_every_100_episodes[episode_count] = last_100_epsilon_avg
                last_100_epsilons = []  # and reset the rolling history
            # Reset paddle positions to the middle
            left_paddle_pos = [50, height // 2 - paddle_height // 2]
            right_paddle_pos = [width - 50 - paddle_width, height // 2 - paddle_height // 2]
            # Reset the ball to the center in a random direction
            ball_pos = [width // 2, height // 2]
            ball_velocity = [random.choice([-4, 4]), random.choice([-4, 4])]
            if epsilon > epsilon_min:  # Decay epsilon at the end of a game/episode
                epsilon *= epsilon_decay
            episode_count += 1  # Increment episode count
            this_episode_length = 0  # Reset length of episode

        # After taking an action, observe the new state
        new_state_left = (discretize_paddle_step(left_paddle_pos[1], paddle_step_size), discretize_grid(ball_pos[0], game_board_grid_size), discretize_grid(ball_pos[1], game_board_grid_size), discretize_velocity(ball_velocity[0], ball_velocity[1]))

        # Record the new state in its rolling buffer
        backpropagation_new_state_buffer_left.append(new_state_left)

        # One-step Q-learning update, skipped when the discretized state did not change
        if new_state_left != state_left:
            if episode_just_ended:
                # Terminal transition: new_state_left already describes the
                # freshly reset board, so there is no future value to bootstrap.
                bootstrap_value = 0.0
            else:
                # The target is a constant for this step: evaluate it without
                # tracking gradients so only the prediction is differentiated.
                with torch.no_grad():
                    bootstrap_value = torch.max(left_agent_model(torch.FloatTensor([new_state_left]))).item()
            target_left = torch.tensor(float(reward_left) + gamma * bootstrap_value, dtype=torch.float32)
            predicted_left = left_agent_model(state_left_tensor)[0][action_left]
            loss_left = loss_fn(predicted_left, target_left)
            left_optimizer.zero_grad()
            loss_left.backward()
            left_optimizer.step()

        # NOTE(review): this proximity bonus is added after the update above
        # and reward_left is reset at the top of the next loop, so it currently
        # has no effect on learning. It also compares the ball's x grid cell
        # with the paddle's y position - confirm the intent before wiring it in.
        proximity_min_reward = 0.0001
        proximity_max_reward = 1.0
        if new_state_left != state_left or reward_left != 0:
            paddle_center_y_left = discretize_paddle_step(left_paddle_pos[1], paddle_step_size) + paddle_height / 2
            abs_distance_left = abs(discretize_grid(ball_pos[0], game_board_grid_size) - discretize_paddle_step(paddle_center_y_left, paddle_step_size))
            scaled_reward_left = proximity_min_reward + (proximity_max_reward - proximity_min_reward) * (abs_distance_left / height)  # Scale linearly with the distance
            scaled_reward_left = min(max(scaled_reward_left, proximity_min_reward), proximity_max_reward)
            reward_left += scaled_reward_left

        # Update current state for next iteration
        state_left = new_state_left

        if update_screen and episode_count % watch_every_x_episodes == 0:
            if hud_font is None:
                hud_font = pygame.font.SysFont(None, 20)

            # Draw paddles and ball
            window.fill((0, 0, 0))  # Clear screen
            pygame.draw.rect(window, (255, 255, 255), left_paddle_pos + [paddle_width, paddle_height])
            pygame.draw.rect(window, (255, 255, 255), right_paddle_pos + [paddle_width, paddle_height])
            pygame.draw.circle(window, (255, 255, 255), ball_pos, ball_radius)

            # Display scores
            score_display = hud_font.render(f"score: {left_score} - {right_score}", True, (255, 255, 255))
            window.blit(score_display, (width // 2 - 40, 10))

            # Display episode count
            episode_display = hud_font.render(f"episodes played: {episode_count}", True, (255, 255, 255))
            window.blit(episode_display, (width // 2 - 50, 30))

            # Display epsilon
            epsilon_display = hud_font.render(f"epsilon:{epsilon:.2g}", True, (255, 255, 255))
            window.blit(epsilon_display, (width // 2 - 50, 50))

            # Visualize neural net recommendations for various paddle locations
            draw_q_values_bar(window, left_agent_model, game_board_grid_size, ball_pos, ball_velocity)

            pygame.display.update()

        # Stop once more than episodes_to_run episodes have finished
        if episode_count > episodes_to_run and episodes_to_run > 0:
            run = False

    # 'epsilon' below is the decayed value at the end of the run
    results = {
        'config': {
            'which_neural_net': which_neural_net,
            'alpha': alpha,
            'gamma': gamma,
            'epsilon': epsilon,
            'epsilon_min': epsilon_min,
            'epsilon_decay': epsilon_decay,
            'game_board_grid_size': game_board_grid_size,
            'paddle_step_size': paddle_step_size,
            'reward_for_winning_episode': reward_for_winning_episode,
            'punishment_for_losing_episode': punishment_for_losing_episode,
            'reward_for_hitting_ball': reward_for_hitting_ball,
            'backpropagation_state_buffer_size': backpropagation_state_buffer_size
        },
        'metrics': {
            'KR_avg_episode_length_every_100_episodes': KR_avg_episode_length_every_100_episodes,
            'KR_avg_epsilon_every_100_episodes': KR_avg_epsilon_every_100_episodes,
            'KR_max_episode_length': KR_max_episode_length
        }
    }

    print(results)

    return results, user_quit, left_agent_model

# Define the action space; play_de_game samples exploration moves from this list.
action_space = [0, 1, 2]  # 0: Move Up, 1: Move Down, 2: Stay Still

# Initialize Pygame (must happen before the window is created)
pygame.init()

# Create the window; play_de_game and draw_q_values_bar draw onto it.
width, height = 800, 600  # Window dimensions
window = pygame.display.set_mode((width, height))
pygame.display.set_caption('AI Learns Pong')

# Initialize a dictionary to store all results
all_results = {}

# Flag to check if the user wants to quit
user_quit = False

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Initialize paddle and ball attributes (read as globals inside play_de_game)
paddle_width, paddle_height = 20, 100
ball_radius = 15

# Define ranges and step sizes for key parameters.
# Each list holds the values to sweep; the trailing comment shows other candidate values.
which_neural_net_range = [5] # [0, 1, 2, 3, 4, 5, 6] different size neural nets, hard-coded above
alpha_range = [0.8] # [0.1, 0.15, 0.2] If α is 1, I consider only the most recent information. If α is 0, I learn nothing and stick to my prior knowledge. NOTE: play_de_game only echoes alpha into its results.
gamma_range = [0.9] # [0.9, 0.95, 0.99] If γ is close to 1, I will consider future rewards with greater weight, making me more strategic but potentially slower to train.
epsilon_range = [.9] # [0.8, 0.9, 1.0] starting exploration rate
epsilon_min_range = [0.01] # [0.05, 0.1, 0.15] epsilon stops decaying once it is no longer above this
epsilon_decay_range = [0.995] # [0.995, 0.999, 0.9999] multiplier applied to epsilon after each episode
reward_for_winning_episode_range = [1] # [1, 2]
punishment_for_losing_episode_range = [-1] # [-1, -2]
reward_for_hitting_ball_range = [1] # [0.5, 1, 1.5]
game_board_grid_size_range = [20] # [10, 50, 100]
paddle_step_size_range = [15] # [5, 10, 15]
backpropagation_state_buffer_size_range = [1] # [5, 10, 15]

# For progress bar: the number of parameter combinations is the product of all range lengths
total_runs = len(which_neural_net_range) * len(alpha_range) * len(gamma_range) * len(epsilon_range) * len(epsilon_min_range) * len(epsilon_decay_range) * len(reward_for_winning_episode_range) * len(punishment_for_losing_episode_range) * len(reward_for_hitting_ball_range) * len(game_board_grid_size_range) * len(paddle_step_size_range) * len(backpropagation_state_buffer_size_range)
completed_runs = 0

# Constants
DATA_FILE_PREFIX = 'v66-PyTorch-'
update_screen = True
watch_every_x_episodes = 1 # 1 to watch all
save_models = True

# TODO record equal numbers of rewards (paddle hits)
# Nested loops to iterate through each parameter combination
for alpha in alpha_range:
    for gamma in gamma_range:
        for epsilon in epsilon_range:
            for epsilon_min in epsilon_min_range:
                for epsilon_decay in epsilon_decay_range:
                    for reward_for_winning_episode in reward_for_winning_episode_range:
                        for punishment_for_losing_episode in punishment_for_losing_episode_range:
                            for reward_for_hitting_ball in reward_for_hitting_ball_range:
                                for paddle_step_size in paddle_step_size_range:
                                    for backpropagation_state_buffer_size in backpropagation_state_buffer_size_range:
                                        for game_board_grid_size in game_board_grid_size_range:
                                            for which_neural_net in which_neural_net_range:

                                                if user_quit:
                                                    break

                                                # Generate a unique identifier for this parameter combination
                                                param_id = f"which_neural_net-{which_neural_net}-alpha-{alpha}_gamma-{gamma}_epsilon-{epsilon}_epsilon_min-{epsilon_min}_epsilon_decay-{epsilon_decay}_reward_win-{reward_for_winning_episode}_punish_lose-{punishment_for_losing_episode}_reward_hit-{reward_for_hitting_ball}_grid_size-{game_board_grid_size}-paddle_step_size-{paddle_step_size}-backpropagation_state_buffer_size-{backpropagation_state_buffer_size}"
                                                logging.info(f"Running test for parameter set: {param_id}")
                                                start_time = datetime.datetime.now()

                                                # Run the game with the current parameter combination
                                                episodes_to_run = 5000
                                                results, user_quit, left_agent_model = play_de_game(which_neural_net, episodes_to_run, update_screen, watch_every_x_episodes, alpha, gamma, epsilon, epsilon_min, epsilon_decay, game_board_grid_size, paddle_step_size, reward_for_winning_episode, punishment_for_losing_episode, reward_for_hitting_ball, width, height, backpropagation_state_buffer_size)

                                                # Store the results in the all_results dictionary
                                                all_results[param_id] = results
                                                #logging.info(f"Completed test for parameter set: {param_id}")

                                                if save_models:
                                                    torch.save(left_agent_model.state_dict(), 'data/'+DATA_FILE_PREFIX+param_id+'-left_agent_model_state_dict.pth')

                                                completed_runs += 1
                                                percent_complete = (completed_runs / total_runs) * 100
                                                end_time = datetime.datetime.now()
                                                time_taken = end_time - start_time
                                                logging.info(f"Progress: {percent_complete:.2f}% Time taken: {time_taken} for {param_id[:20]}")

                                            if user_quit:
                                                break
                                        if user_quit:
                                            break
                                    if user_quit:
                                        break
                                if user_quit:
                                    break
                            if user_quit:
                                break
                        if user_quit:
                            break
                    if user_quit:
                        break
                if user_quit:
                    break
            if user_quit:
                break
        if user_quit:
            break
    if user_quit:
        break


# Save all_results to a pickle file
#if not user_quit:
with open("data/"+DATA_FILE_PREFIX+"ParamTest-All-Results.pkl", "wb") as f:
    pickle.dump(all_results, f)
    
print(all_results)

logging.info("All parameter tuning runs COMPLETED.")
        
pygame.quit()


INFO:root:Running test for parameter set: which_neural_net-5-alpha-0.8_gamma-0.9_epsilon-0.9_epsilon_min-0.01_epsilon_decay-0.995_reward_win-1_punish_lose--1_reward_hit-1_grid_size-20-paddle_step_size-15-backpropagation_state_buffer_size-1
INFO:root:Progress: 100.00% Time taken: 0:01:39.843479 for which_neural_net-5-a
INFO:root:All parameter tuning runs COMPLETED.


{'config': {'which_neural_net': 5, 'alpha': 0.8, 'gamma': 0.9, 'epsilon': 0.6179787838684802, 'epsilon_min': 0.01, 'epsilon_decay': 0.995, 'game_board_grid_size': 20, 'paddle_step_size': 15, 'reward_for_winning_episode': 1, 'punishment_for_losing_episode': -1, 'reward_for_hitting_ball': 1, 'backpropagation_state_buffer_size': 1}, 'metrics': {'KR_reward_events_left': 37, 'KR_ball_hits_left': 37, 'KR_avg_episode_length_every_100_episodes': {0: 265.0}, 'KR_avg_epsilon_every_100_episodes': {0: 0.9}, 'KR_max_episode_length': 1259}}
{'which_neural_net-5-alpha-0.8_gamma-0.9_epsilon-0.9_epsilon_min-0.01_epsilon_decay-0.995_reward_win-1_punish_lose--1_reward_hit-1_grid_size-20-paddle_step_size-15-backpropagation_state_buffer_size-1': {'config': {'which_neural_net': 5, 'alpha': 0.8, 'gamma': 0.9, 'epsilon': 0.6179787838684802, 'epsilon_min': 0.01, 'epsilon_decay': 0.995, 'game_board_grid_size': 20, 'paddle_step_size': 15, 'reward_for_winning_episode': 1, 'punishment_for_losing_episode': -1, 'rew

# Analysis

In [1]:
import pandas as pd
import pickle
from tabulate import tabulate
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Analysis of the parameter sweep.
#
# Loads the pickled sweep results, flattens them into one DataFrame, ranks the
# runs by their best 100-episode average episode length, plots the top 5
# (episode length on the left axis, epsilon on the right axis) and prints a
# text summary of their configuration and final average episode length.
# ---------------------------------------------------------------------------

# Load the data from the pickle file
# NOTE: unpickling can execute arbitrary code; only load result files that
# were produced by the training cell of this notebook.
with open("data/v66-PyTorch-ParamTest-All-Results.pkl", "rb") as f:
    all_results = pickle.load(f)

# Separate config and metrics, tagging each with its parameter ID so the two
# tables can be joined. Copies are built so the loaded results stay untouched.
config_data = []
metrics_data = []
for param_id, result in all_results.items():
    config_data.append({**result['config'], 'param_id': param_id})
    metrics_data.append({**result['metrics'], 'param_id': param_id})

# Convert lists of dictionaries to DataFrames
config_df = pd.DataFrame(config_data)
metrics_df = pd.DataFrame(metrics_data)

# Merge the config and metrics DataFrames on 'param_id'
df = pd.merge(config_df, metrics_df, on='param_id')

# Identify and sort the top runs by max average episode length over 100 episodes.
# A run with no recorded averages gets NaN, which sort_values places last.
df['max_avg_episode_length'] = df['KR_avg_episode_length_every_100_episodes'].apply(
    lambda x: max(x.values(), default=float('nan'))
)
sorted_df = df.sort_values(by='max_avg_episode_length', ascending=False)

# Take the top 5 runs
top_5_runs = sorted_df.head(5)

# Create a new figure for the plot
fig, ax1 = plt.subplots(figsize=(10, 6))

# Create a secondary y-axis for the average epsilon values
ax2 = ax1.twinx()

# Initialize lists to store config and metrics for text summary
config_list = []
metrics_list = []

# Columns shown in the configuration summary table
summary_columns = ['which_neural_net', 'alpha', 'gamma', 'epsilon', 'game_board_grid_size', 'backpropagation_state_buffer_size']

# Loop through the top 5 runs to plot and collect summaries
for index, row in top_5_runs.iterrows():
    # Extract the config (as a plain dict) and metrics for this run
    config = row[summary_columns].to_dict()
    metrics_length = row['KR_avg_episode_length_every_100_episodes']
    metrics_epsilon = row['KR_avg_epsilon_every_100_episodes']

    # Short identifier for the run: its 1-based position in the merged
    # (pre-sort) DataFrame, not its rank among the top runs
    short_id = f"Run_{index+1}"

    # Plotting episode length on the primary y-axis
    ax1.plot(list(metrics_length.keys()), list(metrics_length.values()), label=f"{short_id} - Episode Length")

    # Plotting average epsilon on the secondary y-axis
    ax2.plot(list(metrics_epsilon.keys()), list(metrics_epsilon.values()), linestyle='--', label=f"{short_id} - Avg Epsilon")

    # Collect data for Text Summary
    config['Run_ID'] = short_id
    config_list.append(config)
    episode_lengths = list(metrics_length.values())
    last_avg_length = episode_lengths[-1] if episode_lengths else None
    metrics_list.append({"Run_ID": short_id, "Last_Avg_Ep_Length": last_avg_length})

# Add legend and labels to the primary y-axis
ax1.legend(loc='upper left')
ax1.set_xlabel('Episode Count')
ax1.set_ylabel('Average Episode Length over 100 Episodes')
ax1.set_title("Performance of Top 5 Runs")

# Add legend and labels to the secondary y-axis
ax2.legend(loc='upper right')
ax2.set_ylabel('Average Epsilon over 100 Episodes')

# Set the limits for the secondary y-axis (epsilon is plotted on a 0-1 scale)
ax2.set_ylim([0, 1])

# Show the plot
plt.show()

# Print out the text summaries
print("Top Performing Runs:")
print(tabulate(config_list, headers="keys"))
print()
print("Last Average Episode Length per Run:")
print(tabulate(metrics_list, headers="keys"))



FileNotFoundError: [Errno 2] No such file or directory: 'data/v66-PyTorch-ParamTest-All-Results.pkl'

# Parameter Tuning and Analysis for Reinforcement Learning in Pong

## Refactoring the Code

Initially, the Pong game was implemented in a monolithic script. To make it more modular and facilitate parameter tuning, the following steps were taken:

1. **Encapsulation**: The core game logic was encapsulated into a function called `play_de_game()`.
2. **Parameterization**: The function was designed to accept various hyperparameters as arguments, allowing for easy tuning. These parameters include:
    - `alpha`: The learning rate
    - `gamma`: The discount factor
    - `epsilon`: The exploration rate
    - `epsilon_min`: The minimum exploration rate
    - `epsilon_decay`: The decay rate for `epsilon`
    - `game_board_grid_size`: The size of the game board grid
    - `reward_for_winning_episode`: The reward for winning an episode
    - `punishment_for_losing_episode`: The punishment for losing an episode
    - `reward_for_hitting_ball`: The reward for hitting the ball

## Running the Tests

After refactoring, the game was set up to run over a thousand tests, each with 2500 episodes. The tests were designed to explore a wide range of hyperparameters:

- **Alpha**: Learning rate, affecting how quickly the agent adapts to new information.
- **Gamma**: Discount factor, influencing how much future rewards are considered.
- **Epsilon**: Exploration rate, determining the likelihood of taking a random action.
- **Epsilon Min**: The minimum value that `epsilon` can decay to.
- **Epsilon Decay**: The rate at which `epsilon` decays over time.
- **Game Board Grid Size**: Affects the complexity of the state space.
- **Reward for Winning Episode**: Encourages the agent to win.
- **Punishment for Losing Episode**: Discourages the agent from losing.
- **Reward for Hitting Ball**: Encourages the agent to hit the ball.

The results of each test run were stored in a Python dictionary and then serialized to a pickle file (`all_results.pkl`) for later analysis.

## Data Analysis Plan

### Steps Involved:

1. **Load Data**: Import the `all_results.pkl` file into a Pandas DataFrame.
2. **Data Cleaning**: Remove any missing values and outliers, and convert columns to appropriate data types.
3. **Exploratory Data Analysis (EDA)**: Use statistical and visual methods to understand the data's underlying structure.
4. **Performance Metrics**: Evaluate the performance of different parameter sets based on metrics like average reward, episodes to convergence, etc.

### Expected Insights:

- **Optimal Parameters**: Identify the set of parameters that yield the best performance.
- **Parameter Sensitivity**: Understand how sensitive the model's performance is to changes in individual parameters.
- **Convergence Behavior**: Analyze how quickly the agent learns optimal policies under different parameter settings.
- **Reward Dynamics**: Examine how different reward structures affect the agent's learning process.

By the end of this analysis, I expect to have a comprehensive understanding of how different hyperparameters affect the learning process and performance of the reinforcement learning agent in the Pong game.


# Notes

## Implementing Game Mechanics for Pong

### 1. Initialize Pygame and Create Window
- Initialized Pygame and created an 800x600 window for the game.

### 2. Initialize Paddle and Ball Attributes
- Defined the dimensions of the paddles and the ball. Initialized their starting positions.

### 3. Paddle Movement
- Implemented keyboard controls for moving the paddles up and down.

### 4. Ball Movement and Collision Detection
- Added logic for ball movement and collision detection with the walls and paddles.

### 5. Ball Reset and Scoring
- Implemented ball reset and scoring mechanics. The ball resets to the center after a point is scored.

### 6. Paddle Boundaries
- Added boundaries to prevent the paddles from moving out of the window.

### 7. Game Over Conditions
- Implemented immediate feedback game-over conditions. The game resets after each point, serving as an episode in RL terms.


## Defining RL Elements for Pong

### 1. State Representation
- Decide how to represent the state of the game. Consider the trade-offs between granularity and computational complexity.

### 2. Action Space
- Define the set of actions I can take (e.g., move paddle up, move paddle down, stay still).

### 3. Reward Structure
- Design the rewards I receive for various outcomes (e.g., +1 for scoring, -1 for opponent scoring).

### 4. Policy Initialization
- Initialize my policy, which could be a Q-table, a neural network, or some other function mapping states to actions.

### 5. Learning Algorithm
- Choose and implement a learning algorithm (e.g., Q-learning, SARSA, Deep Q-Networks) to update my policy based on experiences.

### 6. Exploration-Exploitation Strategy
- Decide on a strategy for balancing exploration (trying new actions) and exploitation (sticking with known good actions), such as ε-greedy.

### 7. Training Loop
- Implement the training loop where I interact with the environment, update my policy, and optionally log metrics like average reward over time.

### 8. Evaluation Metrics
- Define metrics to evaluate my performance (e.g., average reward, win rate).

### 9. Hyperparameter Tuning
- Experiment with different learning rates, discount factors, and other hyperparameters to optimize performance.

### 10. Testing and Validation
- Test the trained agent to see how well it performs and validate that it is learning effectively.


## Q-Learning Algorithm

Q-Learning is a model-free reinforcement learning algorithm that aims to learn a policy, which tells an agent what action to take under what circumstances. It defines a function $Q(s, a)$, representing the quality or the utility of taking action $a$ in state $s$.

### Outline

1. **Initialize Q-Table**: Create a table to store the Q-values for each state-action pair.
2. **Policy**: Define how the agent chooses an action (e.g., $\epsilon$-greedy).
3. **Learning**: Update the Q-values using the Q-Learning update rule.
4. **Training Loop**: Incorporate these elements into the game loop.

The Q-table will be represented as a Python dictionary. The keys will be the states, and the values will be another dictionary mapping actions to Q-values.


## max() reference

| Iterable Type | What It Returns to `max()` | Example of Using `max()` |
|---------------|----------------------------|--------------------------|
| List          | Individual list elements   | `max([1, 2, 3])` returns `3` |
| Tuple         | Individual tuple elements  | `max((1, 2, 3))` returns `3` |
| String        | Individual characters     | `max("abc")` returns `'c'` |
| Set           | Individual set elements    | `max({1, 2, 3})` returns `3` |
| Dictionary    | Dictionary keys           | `max({'a': 1, 'b': 2}, key=lambda k: k)` returns `'b'` |
|               |                            | `max({'a': 1, 'b': 2}.values())` returns `2` |
|               |                            | `max({'a': 1, 'b': 2}, key=lambda k: {'a': 1, 'b': 2}[k])` returns `'b'` |
| Numpy Array   | Individual array elements  | `import numpy as np; max(np.array([1, 2, 3]))` returns `3` |


## Building intuition around training variables

1. **Alpha (α) - Learning Rate**: 
    - **What it does**: Determines how much of the new Q-value estimate I adopt.
    - **Intuition**: Think of it as a "blending factor." If α is 1, I consider only the most recent information. If α is 0, I learn nothing and stick to my prior knowledge. A value between 0 and 1 blends the old and new information.
    - **Example**: If α is high (closer to 1), I will rapidly adapt to new strategies but may also forget useful past knowledge quickly.

2. **Gamma (γ) - Discount Factor**: 
    - **What it does**: Influences how much future rewards contribute to the Q-value.
    - **Intuition**: It's like a "patience meter." A high γ makes me prioritize long-term reward over short-term reward.
    - **Example**: If γ is close to 1, I will consider future rewards with greater weight, making me more strategic but potentially slower to train.

3. **Epsilon (ε) - Exploration Rate**: 
    - **What it does**: Controls the trade-off between exploration (trying new actions) and exploitation (sticking with known actions).
    - **Intuition**: It's like the "curiosity level." A high ε encourages me to try new things, while a low ε makes me stick to what I know.
    - **Example**: If ε starts high and decays over time (ε-decay), I will initially explore a lot and gradually shift to exploiting my learned knowledge.
