<a href="https://colab.research.google.com/github/kumasura/ANOR/blob/main/FlightRerouting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import gymnasium as gym
from gymnasium import spaces

In [2]:
class FlightReroutingEnv(gym.Env):

    """Environment using schedule data with simple simulated disruptions."""
    def __init__(self, schedule_path, disruption_prob=0.3):

        super().__init__()
        self.schedule = pd.read_excel(schedule_path)
        self.num_flights = len(self.schedule)
        self.disruption_prob = disruption_prob


        # Observation consists of: flight index, fuel level, weather, traffic,
        # alternate airports, other aircraft proximity
        self.obs_bins = np.array([
            self.num_flights + 1,  # flight index including terminal state
            5,  # fuel level bins
            5,  # weather bins
            5,  # traffic bins
            4,  # number of alternate airports
            5,  # other aircraft proximity bins
        ])
        self.observation_space = spaces.MultiDiscrete(self.obs_bins)

        # Actions follow the README: change path, swap aircraft, cancel, adjust
        # altitude, divert, wait for conditions to improve
        self.action_space = spaces.Discrete(6)

    def _random_state(self):
        return [
            np.random.randint(self.obs_bins[1]),
            np.random.randint(self.obs_bins[2]),
            np.random.randint(self.obs_bins[3]),
            np.random.randint(self.obs_bins[4]),
            np.random.randint(self.obs_bins[5]),
        ]


    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        self.current_idx = 0

        self.state = self._random_state()
        return tuple([self.current_idx] + self.state), {}

    def step(self, action):
        if self.current_idx >= self.num_flights:
            raise RuntimeError("Episode is done")

        disruption = np.random.rand() < self.disruption_prob
        fuel, weather, traffic, airports, other = self.state

        # Simple cost model
        fuel_cost = 10 + 5 * weather + 5 * traffic
        delay_penalty = 20 if action == 5 else 0
        swap_penalty = 50 if action == 1 else 0
        cancel_penalty = 200 if action == 2 else 0
        reroute_penalty = 30 if action in (0, 3, 4) else 0
        if not disruption and action == 5:
            delay_penalty += 20  # unnecessary waiting

        reward = -(fuel_cost + delay_penalty + swap_penalty + cancel_penalty + reroute_penalty)

        # Advance to next flight and generate new state
        self.current_idx += 1
        terminated = self.current_idx >= self.num_flights
        self.state = self._random_state() if not terminated else [0] * 5
        obs = tuple([self.num_flights] + self.state) if terminated else tuple([self.current_idx] + self.state)
        return obs, reward, terminated, False, {}

In [3]:
class QLearningAgent:
    def __init__(self, nvec, action_size, alpha=0.1, gamma=0.95, epsilon=0.1):
        state_space = int(np.prod(nvec))
        self.Q = np.zeros((state_space, action_size))
        self.nvec = nvec
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.action_size = action_size


    def _state_index(self, state):
        return np.ravel_multi_index(state, self.nvec)

    def choose_action(self, state):
        idx = self._state_index(state)
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)
        return int(np.argmax(self.Q[idx]))

    def update(self, state, action, reward, next_state):
        idx = self._state_index(state)
        next_idx = self._state_index(next_state)
        best_next = np.max(self.Q[next_idx])
        td_target = reward + self.gamma * best_next
        td_error = td_target - self.Q[idx, action]
        self.Q[idx, action] += self.alpha * td_error


In [4]:
def evaluate(env, agent):
    """Run one episode with the learned policy and print step details."""
    action_names = [
        "Change path",
        "Swap aircraft",
        "Cancel flight",
        "Adjust altitude",
        "Divert",
        "Wait",
    ]
    state, _ = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = int(np.argmax(agent.Q[agent._state_index(state)]))
        next_state, reward, terminated, _, _ = env.step(action)
        total_reward += reward
        print(
            f"Flight {state[0]} -> action: {action_names[action]} | "
            f"state: {state[1:]} | reward: {reward:.1f}"
        )
        state = next_state
        done = terminated
    print("Total reward:", total_reward)


In [5]:
def train(env, episodes=200):
    agent = QLearningAgent(env.obs_bins, env.action_space.n)
    for _ in range(episodes):

        state, _ = env.reset()
        done = False
        while not done:
            action = agent.choose_action(state)
            next_state, reward, terminated, _, _ = env.step(action)
            agent.update(state, action, reward, next_state)
            state = next_state
            done = terminated
    return agent

In [6]:
def main():
    env = FlightReroutingEnv('flight_schedule_new.xlsx')
    agent = train(env)
    print("Q-table shape:", agent.Q.shape)
    print(agent.Q[:5])
    print("\nPolicy rollout:")
    evaluate(env, agent)


In [8]:
if __name__ == '__main__':
    main()

Q-table shape: (395000, 6)
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]

Policy rollout:
Flight 0 -> action: Change path | state: (2, 3, 1, 1, 3) | reward: -60.0
Flight 1 -> action: Change path | state: (0, 4, 1, 0, 0) | reward: -65.0
Flight 2 -> action: Change path | state: (1, 2, 3, 0, 3) | reward: -65.0
Flight 3 -> action: Change path | state: (3, 0, 0, 0, 3) | reward: -40.0
Flight 4 -> action: Change path | state: (3, 4, 4, 0, 4) | reward: -80.0
Flight 5 -> action: Change path | state: (2, 2, 3, 2, 2) | reward: -65.0
Flight 6 -> action: Change path | state: (0, 2, 3, 1, 4) | reward: -65.0
Flight 7 -> action: Change path | state: (0, 0, 3, 0, 4) | reward: -55.0
Flight 8 -> action: Change path | state: (2, 4, 4, 0, 0) | reward: -80.0
Flight 9 -> action: Change path | state: (2, 3, 2, 1, 4) | reward: -65.0
Flight 10 -> action: Change path | state: (3, 1, 0, 0, 4) | reward: -45.0
Flight 11 -> action: Change path | state: (2, 

In [9]:
class DQN:
    def __init__(self, state_bins, action_size, hidden_size=64, lr=0.01, gamma=0.95):
        self.state_bins = state_bins
        self.action_size = action_size
        self.lr = lr
        self.gamma = gamma
        input_size = len(state_bins)
        self.w1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros(hidden_size)
        self.w2 = np.random.randn(hidden_size, action_size) * 0.01
        self.b2 = np.zeros(action_size)

    def _forward(self, x):
        z1 = x @ self.w1 + self.b1
        h1 = np.maximum(z1, 0)
        q = h1 @ self.w2 + self.b2
        return q, h1, z1

    def predict(self, x):
        q, _, _ = self._forward(x)
        return q

    def update(self, batch):
        for state, action, reward, next_state, done in batch:
            q, h1, z1 = self._forward(state)
            target = reward
            if not done:
                next_q = self.predict(next_state)
                target += self.gamma * np.max(next_q)
            dq = np.zeros_like(q)
            dq[action] = q[action] - target
            grad_w2 = np.outer(h1, dq)
            grad_b2 = dq
            grad_h1 = dq @ self.w2.T
            grad_z1 = grad_h1 * (z1 > 0)
            grad_w1 = np.outer(state, grad_z1)
            grad_b1 = grad_z1
            self.w2 -= self.lr * grad_w2
            self.b2 -= self.lr * grad_b2
            self.w1 -= self.lr * grad_w1
            self.b1 -= self.lr * grad_b1


def normalize(state, bins):
    return np.array(state) / (bins - 1)


def train_dqn(env, episodes=200, batch_size=32, buffer_limit=10000, epsilon=1.0, epsilon_decay=0.995):
    dqn = DQN(env.obs_bins, env.action_space.n)
    replay = []
    for ep in range(episodes):
        state, _ = env.reset()
        state = normalize(state, env.obs_bins)
        done = False
        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                q = dqn.predict(state)
                action = int(np.argmax(q))
            next_state, reward, terminated, _, _ = env.step(action)
            next_state_n = normalize(next_state, env.obs_bins)
            replay.append((state, action, reward, next_state_n, terminated))
            if len(replay) > buffer_limit:
                replay.pop(0)
            if len(replay) >= batch_size:
                batch_idx = np.random.choice(len(replay), batch_size, replace=False)
                batch = [replay[i] for i in batch_idx]
                dqn.update(batch)
            state = next_state_n
            done = terminated
        if epsilon > 0.1:
            epsilon *= epsilon_decay
    return dqn


def evaluate_dqn(env, dqn):
    action_names = [
        "Change path",
        "Swap aircraft",
        "Cancel flight",
        "Adjust altitude",
        "Divert",
        "Wait",
    ]
    state, _ = env.reset()
    state_n = normalize(state, env.obs_bins)
    done = False
    total_reward = 0
    while not done:
        action = int(np.argmax(dqn.predict(state_n)))
        next_state, reward, terminated, _, _ = env.step(action)
        print(
            f"Flight {state[0]} -> action: {action_names[action]} | state: {state[1:]} | reward: {reward:.1f}"
        )
        total_reward += reward
        state = next_state
        state_n = normalize(state, env.obs_bins)
        done = terminated
    print("Total reward:", total_reward)


if __name__ == "__main__":
    env = FlightReroutingEnv("flight_schedule_new.xlsx")
    agent = train_dqn(env, episodes=100)
    print("\nPolicy rollout:\n")
    evaluate_dqn(env, agent)


Policy rollout:

Flight 0 -> action: Adjust altitude | state: (4, 1, 2, 2, 3) | reward: -55.0
Flight 1 -> action: Adjust altitude | state: (3, 4, 2, 1, 2) | reward: -70.0
Flight 2 -> action: Adjust altitude | state: (3, 4, 1, 3, 4) | reward: -65.0
Flight 3 -> action: Adjust altitude | state: (2, 1, 2, 1, 2) | reward: -55.0
Flight 4 -> action: Adjust altitude | state: (1, 4, 1, 3, 0) | reward: -65.0
Flight 5 -> action: Adjust altitude | state: (1, 1, 2, 2, 2) | reward: -55.0
Flight 6 -> action: Adjust altitude | state: (2, 1, 0, 2, 0) | reward: -45.0
Flight 7 -> action: Adjust altitude | state: (2, 1, 4, 2, 1) | reward: -65.0
Flight 8 -> action: Adjust altitude | state: (2, 4, 1, 3, 2) | reward: -65.0
Flight 9 -> action: Adjust altitude | state: (4, 4, 4, 3, 1) | reward: -80.0
Flight 10 -> action: Adjust altitude | state: (4, 2, 0, 3, 3) | reward: -50.0
Flight 11 -> action: Adjust altitude | state: (0, 0, 4, 1, 3) | reward: -60.0
Flight 12 -> action: Adjust altitude | state: (4, 2, 1, 

In [10]:
def generate_schedule(env, agent, epsilon=0.05):
    """Generate a schedule (action for each flight) using an epsilon-greedy policy."""
    state, _ = env.reset()
    done = False
    actions = []
    total_reward = 0.0
    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = agent.choose_action(state)
        next_state, reward, terminated, _, _ = env.step(action)
        actions.append(action)
        total_reward += reward
        state = next_state
        done = terminated
    return actions, total_reward


def column_generation(schedule_path, iterations=20):
    """Run column generation enhanced by an RL agent."""
    env = FlightReroutingEnv(schedule_path)
    agent = train(env)

    columns = []
    best_actions = None
    best_cost = float("inf")

    for _ in range(iterations):
        actions, reward = generate_schedule(env, agent)
        cost = -reward
        columns.append((actions, cost))
        if cost < best_cost:
            best_cost = cost
            best_actions = actions

    return columns, best_actions, best_cost


if __name__ == "__main__":
    cols, best_sched, best_cost = column_generation("flight_schedule_new.xlsx")
    print("Generated columns:", len(cols))
    print("Best schedule cost:", best_cost)
    print("Best schedule actions:", best_sched)

Generated columns: 20
Best schedule cost: 9760.0
Best schedule actions: [0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, np.int64(0), 0, np.int64(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, np.int64(0), 0, 1, 0, 4, np.int64(1), 0, 0, 1, 3, 0, 1, 0, 0, 0, 1, 0, np.int64(0), np.int64(5), 0, 0, 0, 1, 1, 0, 0, np.int64(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, np.int64(3), 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 4, np.int64(3), 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 1, 0, np.int64(5), 0, 0, 0, 0, 0, np.int64(3)]


In [12]:
def generate_schedule(env, dqn, epsilon=0.05):
    """Generate a schedule using an epsilon-greedy policy with the DQN."""
    state, _ = env.reset()
    state_n = normalize(state, env.obs_bins)
    done = False
    actions = []
    total_reward = 0.0
    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(dqn.predict(state_n)))
        next_state, reward, terminated, _, _ = env.step(action)
        actions.append(action)
        total_reward += reward
        state = next_state
        state_n = normalize(state, env.obs_bins)
        done = terminated
    return actions, total_reward


def column_generation(schedule_path, iterations=20):
    """Run column generation enhanced with a DQN agent."""
    env = FlightReroutingEnv(schedule_path)
    dqn = train_dqn(env)

    columns = []
    best_actions = None
    best_cost = float("inf")

    for _ in range(iterations):
        actions, reward = generate_schedule(env, dqn)
        cost = -reward
        columns.append((actions, cost))
        if cost < best_cost:
            best_cost = cost
            best_actions = actions

    return columns, best_actions, best_cost


if __name__ == "__main__":
    cols, best_sched, best_cost = column_generation("flight_schedule_new.xlsx")
    print("Generated columns:", len(cols))
    print("Best schedule cost:", best_cost)
    print("Best schedule actions:", best_sched)

Generated columns: 20
Best schedule cost: 9320.0
Best schedule actions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, np.int64(3), 0, np.int64(1), 0, 0, 0, 0, 0, 0, np.int64(5), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, np.int64(1), 0, 0, 0, 0, 0, np.int64(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, np.int64(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, np.int64(4), 0, 0, 0, 0, 0]
