In [None]:
%pip install highway-env
%pip install gymnasium matplotlib numpy tqdm
%pip install highway-env

In [None]:
import gymnasium as gym
from highway_env.envs import HighwayEnv
from gymnasium.wrappers import TimeLimit
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import random

# ========== Environment Setup ==========
# Environment options handed to highway-env. The ego vehicle is row 0 of the
# observation; each row carries the four listed features, normalised.
config = dict(
    observation=dict(
        type="Kinematics",
        vehicles_count=5,
        features=["x", "y", "vx", "vy"],
        normalize=True,
    ),
    # NOTE(review): presumably agent decisions per simulated second and the
    # episode length limit used by highway-env itself — confirm units against
    # the installed highway-env version.
    policy_frequency=15,
    duration=40,
)

# Build the raw environment first, apply the options, then wrap it so that
# gymnasium reports `truncated=True` once 40 steps have elapsed.
base_env = HighwayEnv()
base_env.configure(config)
env = TimeLimit(base_env, max_episode_steps=40)

In [None]:
# ========== State Discretization ==========
# Bin edges over [0, 1] for each discretised feature. np.digitize maps values
# below the first edge to 0 and values at/above the last edge to len(edges).
POSITION_BINS = np.linspace(0.0, 1.0, num=6)
SPEED_BINS = np.linspace(0.0, 1.0, num=5)
LANE_BINS = np.linspace(0.0, 1.0, num=3)


def discretize(observation):
    """Map an observation to a hashable discrete state for the Q-table.

    Only the first row of ``observation`` (the ego vehicle) is used, and it
    must hold exactly four values in the order ``x, y, vx, vy``. ``vy`` is
    ignored.

    Returns:
        A 3-tuple ``(x_bin, vx_bin, lane_bin)`` of bin indices, where
        ``lane_bin`` is derived from ``y``.
    """
    ego_x, ego_y, ego_vx, _ego_vy = observation[0]
    return (
        np.digitize(ego_x, POSITION_BINS),
        np.digitize(ego_vx, SPEED_BINS),
        np.digitize(ego_y, LANE_BINS),
    )

In [None]:
# ========== QLearningAgent Class ==========
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent.

    The Q-table is a dict keyed by state; each entry is a NumPy vector with
    one value per action, created as zeros the first time the state is looked
    up through :meth:`get`. Actions are used directly as indices into that
    vector.
    """

    def __init__(self, actions, alpha=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.05):
        # Learning hyper-parameters.
        self.alpha = alpha
        self.gamma = gamma
        # Exploration schedule: epsilon is multiplied by epsilon_decay on each
        # decay() call and never drops below epsilon_min.
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.actions = actions
        self.Q = {}

    def get(self, state):
        """Return the action-value vector for ``state``, creating it if absent."""
        try:
            return self.Q[state]
        except KeyError:
            fresh_row = np.zeros(len(self.actions))
            self.Q[state] = fresh_row
            return fresh_row

    def act(self, state):
        """Pick an action: random with probability epsilon, else the argmax."""
        exploit = not (random.random() < self.epsilon)
        if exploit:
            return np.argmax(self.get(state))
        return random.choice(self.actions)

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning backup for the observed transition.

        When ``done`` is true the next state contributes nothing to the
        target (and is not added to the table).
        """
        if done:
            best_next = 0
        else:
            best_next = np.max(self.get(next_state))
        q_row = self.get(state)
        td_error = reward + self.gamma * best_next - q_row[action]
        q_row[action] += self.alpha * td_error

    def decay(self):
        """Shrink epsilon by one decay step, clamped at epsilon_min."""
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = max(decayed, self.epsilon_min)

In [None]:
# ========== Training with Reward Shaping ==========
def train_q_learning(episodes, alpha, gamma):
    """Train a fresh tabular Q-learning agent on the module-level ``env``.

    The environment reward is augmented with shaping terms computed from the
    ego vehicle's row of the next observation (speed bonus, lane-centring
    term, edge penalty, slow-driving penalty).

    Args:
        episodes: Number of episodes to run.
        alpha: Learning rate passed to the agent.
        gamma: Discount factor passed to the agent.

    Returns:
        A list with the total shaped reward collected in each episode.
    """
    agent = QLearningAgent(actions=list(range(env.action_space.n)), alpha=alpha, gamma=gamma)
    rewards = []

    for _ in range(episodes):
        obs, _info = env.reset()
        state = discretize(obs)
        total_reward = 0
        episode_over = False

        while not episode_over:
            action = agent.act(state)
            next_obs, reward, terminated, truncated, _info = env.step(action)
            next_state = discretize(next_obs)

            # Reward shaping. Row 0 is the ego vehicle; with the configured
            # feature order ["x", "y", "vx", "vy"], index 1 is y and index 2
            # is vx.
            ego = next_obs[0]
            vx = ego[2]
            lane_y = ego[1]
            speed_reward = vx * 2.0
            lane_centering_reward = -abs(lane_y - 0.5)
            edge_penalty = -2.0 if lane_y < 0.2 or lane_y > 0.8 else 0
            slow_penalty = -1.0 if vx < 0.2 else 0

            shaped_reward = reward + speed_reward + lane_centering_reward + edge_penalty + slow_penalty

            # Only genuine termination zeroes the bootstrap target; a
            # time-limit truncation is not a terminal state, so the update
            # still bootstraps from next_state in that case.
            agent.update(state, action, shaped_reward, next_state, terminated)
            state = next_state
            total_reward += shaped_reward

            # The episode must stop on truncation as well: looping on
            # `terminated` alone ignores the TimeLimit wrapper and keeps
            # stepping until the environment itself terminates.
            episode_over = terminated or truncated

        agent.decay()
        rewards.append(total_reward)

    return rewards


In [None]:
# ========== Run Experiments Over α and γ ==========
# Hyper-parameter grid: every (alpha, gamma) pair is trained from scratch.
alphas = [0.1, 0.5]
gammas = [0.9, 0.99]
episodes = 300
SEED = 42  # shared by all configurations so the runs are comparable
results = {}

for alpha in alphas:
    for gamma in gammas:
        key = f"α={alpha}, γ={gamma}"
        print(f"Training {key}")
        # Re-seed every source of randomness before each run so results are
        # reproducible and each configuration sees the same random stream.
        # `random` drives the agent's epsilon-greedy choices; the environment
        # is seeded through reset(). np.random is seeded as well in case any
        # component draws from the global NumPy generator.
        random.seed(SEED)
        np.random.seed(SEED)
        env.reset(seed=SEED)
        rewards = train_q_learning(episodes, alpha, gamma)
        results[key] = rewards

In [None]:
# ========== Plot Results ==========
# One curve per hyper-parameter setting, smoothed with a 10-episode moving
# average ('valid' mode, so each curve is 9 points shorter than the raw one).
fig, ax = plt.subplots(figsize=(12, 6))
kernel = np.ones(10) / 10
for label, rewards in results.items():
    ax.plot(np.convolve(rewards, kernel, mode='valid'), label=label)

ax.set_title("Q-Learning Performance with Reward Shaping (α and γ)")
ax.set_xlabel("Episode")
ax.set_ylabel("Smoothed Total Reward")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()