In [1]:
import numpy as np
import random
import pickle

class RLAgent:
    """Tabular Q-learning agent with an epsilon-greedy action policy.

    The agent keeps a ``(state_space_size, action_space_size)`` table of
    action values, initialised to zero, and updates it one transition at a
    time with the Q-learning rule.

    Args:
        state_space_size: Number of discrete states (rows of the Q-table).
        action_space_size: Number of discrete actions (columns of the Q-table).
        alpha: Learning rate applied to the temporal-difference error.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of picking a uniformly random action instead of
            the greedy one.
    """

    def __init__(self, state_space_size, action_space_size, alpha=0.1,
                 gamma=0.9, epsilon=0.1):
        self.q_table = np.zeros((state_space_size, action_space_size))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.state_space_size = state_space_size
        self.action_space_size = action_space_size

    def choose_action(self, state_idx):
        """Pick an action index for ``state_idx`` using epsilon-greedy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value for the state is
        returned (ties resolve to the lowest index, as ``np.argmax`` does).

        Returns:
            int: The chosen action index, always a built-in ``int``.
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, self.action_space_size - 1)
        # np.argmax yields a numpy integer; cast so both branches return the
        # same type (plain ints serialise and compare predictably).
        return int(np.argmax(self.q_table[state_idx]))

    def update_q(self, state_idx, action_idx, reward, next_state_idx):
        """Apply one Q-learning update for an observed transition.

        Implements ``Q[s, a] += alpha * (r + gamma * max(Q[s']) - Q[s, a])``.

        Args:
            state_idx: Index of the state the action was taken in.
            action_idx: Index of the action that was taken.
            reward: Reward observed for the transition.
            next_state_idx: Index of the resulting state.
        """
        predict = self.q_table[state_idx, action_idx]
        target = reward + self.gamma * np.max(self.q_table[next_state_idx])
        self.q_table[state_idx, action_idx] += self.alpha * (target - predict)

    def save(self, path='q_table.pkl'):
        """Pickle the Q-table to ``path`` (overwriting any existing file)."""
        with open(path, 'wb') as f:
            pickle.dump(self.q_table, f)

    def load(self, path='q_table.pkl'):
        """Replace the Q-table with the one pickled at ``path``.

        Raises:
            ValueError: If the stored table's shape is not
                ``(state_space_size, action_space_size)``; the current table
                is left untouched in that case.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load Q-tables from sources you trust.
        with open(path, 'rb') as f:
            table = np.asarray(pickle.load(f))

        expected_shape = (self.state_space_size, self.action_space_size)
        if table.shape != expected_shape:
            raise ValueError(
                f"Q-table in {path!r} has shape {table.shape}, "
                f"expected {expected_shape}"
            )
        self.q_table = table


In [2]:
import random

# State space: every (accuracy_level, speed_level) pair with both levels in
# 0..2, enumerated with accuracy as the slower-changing component.
states = [(accuracy, speed) for accuracy in range(3) for speed in range(3)]

# Position of each state tuple in `states`, used as its Q-table row index.
state_to_index = dict(zip(states, range(len(states))))

# Action space: the five feedback types the agent can choose between.
actions = ["text", "visual", "audio", "simplify", "increase"]

def simulate_user_response(state, action):
    """
    Simulate one user reaction to a feedback action.

    `state` is an (accuracy_level, speed_level) pair and `action` is one of
    the feedback-type strings. Returns `(new_state, reward)`.

    Rules, checked in this order:
      * "simplify" at accuracy 0 raises accuracy by one, reward +1.
      * "increase" at accuracy 2 lowers accuracy by one (floored at 0),
        reward -1.
      * "visual" / "audio" succeed when a random draw exceeds 0.4: accuracy
        rises by one (capped at 2), reward +1.
      * Anything else leaves accuracy unchanged, reward -1.

    Speed then moves by a random step of -1, 0 or +1, clamped to 0..2.

    NOTE(review): "increase" at high accuracy is penalised here; confirm
    that this is the intended simulation rule.
    """
    accuracy, pace = state

    if action == "simplify" and accuracy == 0:
        accuracy, outcome = accuracy + 1, 1
    elif action == "increase" and accuracy == 2:
        accuracy, outcome = max(0, accuracy - 1), -1
    elif action in ("visual", "audio") and random.random() > 0.4:
        # The random draw only happens when the action is visual/audio.
        accuracy, outcome = min(2, accuracy + 1), 1
    else:
        outcome = -1  # no improvement

    # Random walk on speed, kept inside the valid 0..2 range.
    drift = random.choice((-1, 0, 1))
    pace = max(0, pace + drift)
    pace = min(2, pace)

    return (accuracy, pace), outcome


In [4]:
from IPython.display import display

import random

# Training configuration (tune here rather than inside the loop).
SEED = 42                    # fixed seed so a fresh run reproduces the same log
NUM_EPISODES = 1000          # number of single-step training transitions
LOG_EVERY = 100              # print one progress line every N episodes
Q_TABLE_PATH = "q_table.pkl"

# Both the agent's exploration and the simulator draw from the `random`
# module, so seeding it makes the whole training run deterministic.
random.seed(SEED)

agent = RLAgent(state_space_size=len(states), action_space_size=len(actions))

# Each "episode" is a single transition: sample a start state uniformly,
# act epsilon-greedily, observe the simulated response, update the Q-table.
for episode in range(NUM_EPISODES):
    state = random.choice(states)
    state_idx = state_to_index[state]

    action_idx = agent.choose_action(state_idx)
    action = actions[action_idx]

    new_state, reward = simulate_user_response(state, action)
    new_state_idx = state_to_index[new_state]

    agent.update_q(state_idx, action_idx, reward, new_state_idx)

    if episode % LOG_EVERY == 0:
        print(f"Episode {episode} | State: {state} | Action: {action} | Reward: {reward}")

agent.save(Q_TABLE_PATH)
print("Q-table saved!")


Episode 0 | State: (2, 2) | Action: text | Reward: -1
Episode 100 | State: (1, 0) | Action: audio | Reward: 1
Episode 200 | State: (0, 0) | Action: simplify | Reward: 1
Episode 300 | State: (0, 1) | Action: visual | Reward: -1
Episode 400 | State: (2, 2) | Action: audio | Reward: -1
Episode 500 | State: (1, 0) | Action: audio | Reward: -1
Episode 600 | State: (0, 2) | Action: simplify | Reward: 1
Episode 700 | State: (2, 0) | Action: audio | Reward: 1
Episode 800 | State: (0, 2) | Action: simplify | Reward: 1
Episode 900 | State: (1, 1) | Action: visual | Reward: -1
Q-table saved!
