In [None]:
from collections import deque
import numpy as np
import pandas as pd
import random


class SequentialOrganAllocationMDP:
    def __init__(self, initial_state, available_organs):
        """
        Initialize the sequential MDP for organ allocation.

        Parameters:
        - initial_state: List of dictionaries representing waitlist candidates.
        - available_organs: Dictionary with available organs by blood type.
        """
        self.initial_state = (
            tuple((r['id'], r['age'], r['MELD'], r['blood_type'], r['allocated']) for r in initial_state),
            tuple(available_organs.items())
        )
        self.value_table = {self.initial_state: 0}  # Value function initialized for the initial state
        self.policy = {self.initial_state: None}  # Policy initialized for the initial state
        self.deltas = []  # To store deltas over iterations

    def calculate_reward(self, recipient):
        """
        Calculate reward for successfully allocating an organ.

        Parameters:
        - recipient: Tuple representing the recipient receiving the organ.

        Returns:
        - Reward value based on MELD score and age.
        """
        _, age, meld, _, _ = recipient
        return 10 + (1 / (1 + max(meld,0))) * 10 - (age * 0.1) #this is giving inf rewards sometimes

    def generate_next_state(self, state, action):
        """
        Generate the next state based on the current state and action.

        Parameters:
        - state: Tuple (recipients, available_organs).
        - action: Tuple (recipient_id, organ_type).

        Returns:
        - next_state: Updated state after taking the action.
        - reward: Associated reward for the action.
        """
        recipients, available_organs = state
        recipients = list(recipients)
        available_organs = dict(available_organs)

        if action is None:
            return (tuple(recipients), tuple(available_organs.items())), 0

        recipient_id, organ_type = action
        recipient_idx = next((i for i, r in enumerate(recipients) if r[0] == recipient_id), None)

        if recipient_idx is not None and available_organs[organ_type] > 0:
            if random.random() < 0.8:  # 90% success
                recipient = recipients[recipient_idx]
                recipients[recipient_idx] = (recipient[0], recipient[1], recipient[2], recipient[3], 1)
                available_organs[organ_type] -= 1
                reward = self.calculate_reward(recipient)
                return (tuple(recipients), tuple(available_organs.items())), reward
            else:  # 10% failure (recipient dies, organ is removed)
                recipient = recipients.pop(recipient_idx)
                available_organs[organ_type] -= 1
                reward = -100  # Large penalty for death
                return (tuple(recipients), tuple(available_organs.items())), reward

        return (tuple(recipients), tuple(available_organs.items())), 0
    '''
    def value_iteration(self, gamma=0.9, epsilon=0.01):
        """
        Perform value iteration for all reachable states starting from the initial state.

        Parameters:
        - gamma: Discount factor (default 0.9).
        - epsilon: Convergence threshold (default 0.01).
        """
        states_to_explore = deque([self.initial_state])
        self.deltas = []  # Reset deltas at the start of value iteration
        iter = 0
        while True:  # Outer loop for global convergence
            delta = 0  # Track the largest change in value across all states
            iter += 1
            print(f"Iteration: {iter}")
            new_states_to_explore = deque()  # To track states added during this iteration

            
            while states_to_explore:  # Inner loop for processing current state
                current_state = states_to_explore.popleft()
                old_value = self.value_table[current_state]
                max_value = float('-inf')
                best_action = None
                no_valid_actions = True

                recipients, available_organs = current_state
                available_organs = dict(available_organs)
                for recipient in recipients:
                    if recipient[4] == 0:  # Not yet allocated
                        for organ_type in available_organs.keys():
                            if available_organs[organ_type] > 0:
                                action = (recipient[0], organ_type)
                                next_state, reward = self.generate_next_state(current_state, action)
                                value = reward + gamma * self.value_table.get(next_state, 0)
                                no_valid_actions = False
                                if np.abs(value) == float('inf') or np.abs(value) > 1e6:
                                    print(f"Large Reward: {reward}")
                                    print(f"Curr state: {current_state}, Action: {action}, Next state: {next_state}")



                                if value > max_value:
                                    max_value = value
                                    best_action = action

                                if next_state not in self.value_table:
                                    self.value_table[next_state] = 0
                                    self.policy[next_state] = None
                                    new_states_to_explore.append(next_state)
                if no_valid_actions:
                    continue
                self.value_table[current_state] = max_value
                #print(max_value)
                self.policy[current_state] = best_action
                delta = max(delta, abs(old_value - max_value))
                #delta = abs(old_value - max_value)
            print(f"Delta: delta")
            self.deltas.append(delta)  # Append delta for this iteration
            states_to_explore = new_states_to_explore  # Add newly discovered states
            #print(f"New states to explore: {len(new_states_to_explore)}")
            print(f"Total of states to explore: {len(states_to_explore)}")

            if delta < epsilon and not states_to_explore:
                break  # Stop when values converge and no new states to explore
    '''
    def value_iteration(self, gamma=0.9, epsilon=0.01, max_states=5000):
        """
        Perform value iteration with pruning.
        
        Parameters:
        - gamma: Discount factor (default 0.9).
        - epsilon: Convergence threshold (default 0.01).
        - max_states: Maximum number of states to retain in the queue during pruning.
        """
        states_to_explore = deque([self.initial_state])
        visited_states = set()  # Track visited states
        self.deltas = []  # Reset deltas at the start of value iteration
        iteration_count = 0

        while True:
            iteration_count += 1
            delta = 0
            new_states_to_explore = []

            print(f"Iteration {iteration_count}, States to explore: {len(states_to_explore)}")

            while states_to_explore:
                current_state = states_to_explore.popleft()

                # Skip already visited states
                if current_state in visited_states:
                    continue

                visited_states.add(current_state)
                old_value = self.value_table[current_state]
                max_value = float('-inf')
                best_action = None
                no_valid_actions = True

                recipients, available_organs = current_state
                available_organs = dict(available_organs)

                for recipient in recipients:
                    if recipient[4] == 0:  # Not yet allocated
                        for organ_type in available_organs.keys():
                            if available_organs[organ_type] > 0:
                                no_valid_actions = False
                                action = (recipient[0], organ_type)
                                next_state, reward = self.generate_next_state(current_state, action)
                                value = reward + gamma * self.value_table.get(next_state, 0)

                                if value > max_value:
                                    max_value = value
                                    best_action = action

                                if next_state not in self.value_table:
                                    self.value_table[next_state] = 0
                                    self.policy[next_state] = None
                                    new_states_to_explore.append((next_state, abs(old_value - max_value)))
                if no_valid_actions:
                    continue
                self.value_table[current_state] = max_value
                self.policy[current_state] = best_action
                delta = max(delta, abs(old_value - max_value))

            # Prune new states
            new_states_to_explore.sort(key=lambda x: x[1], reverse=True)  # Sort by delta (value change)
            new_states_to_explore = [state for state, _ in new_states_to_explore[:max_states]]

            print(f"Pruned to {len(new_states_to_explore)} states.")

            states_to_explore = deque(new_states_to_explore)
            self.deltas.append(delta)
            print(f"Delta: {delta}")

            if delta < epsilon and not states_to_explore:
                break

    def get_deltas(self):
        """
        Retrieve the deltas recorded during value iteration.

        Returns:
        - List of delta values for each iteration.
        """
        return self.deltas #To use for plotting later

    def simulate_with_policy(self, steps=10):
        """
        Simulate the allocation process using the computed policy.

        Parameters:
        - steps: Number of allocation steps to simulate (default 10).

        Returns:
        - Total reward, total deaths, total allocations.
        """
        current_state = self.initial_state
        total_reward = 0
        total_deaths = 0
        total_allocations = 0

        for _ in range(steps):
            action = self.policy.get(current_state, None)
            if action is None:
                break

            next_state, reward = self.generate_next_state(current_state, action)

            if reward == -100:  # Death penalty
                total_deaths += 1
            elif reward > 0:  # Successful allocation
                total_allocations += 1

            total_reward += reward
            current_state = next_state

        return total_reward, total_deaths, total_allocations


# Reinitialize the example with a reduced state space for debugging
df = pd.read_csv('waitlist_patients.csv')

initial_state = df.apply(
    lambda row: {
        'id': row.name + 1,  # Generate an 'id' starting from 1
        'age': row['RECIPIENT_AGE'],  # Replace with the actual age column if available
        'MELD': row['INIT_MELD_PELD_LAB_SCORE'],
        'blood_type': row['RECIPIENT_BLOOD_TYPE'],
        'allocated': 0  # Default value
    }, axis=1
).tolist()
available_organs = {'A': 1, 'A1': 6, 'A1B': 0, 'A2': 0, 'A2B': 2, 'AB': 0, 'B': 3, 'O': 4, 'AB': 0}
'''
initial_state = [
    {'id': i, 'age': random.randint(20, 70), 'MELD': random.randint(10, 40), 'blood_type': random.choice(['A', 'B', 'O', 'AB']), 'allocated': 0}
    for i in range(1, 6)
]
available_organs = {'A': 0, 'B': 1, 'O': 1, 'AB': 1}
'''
# Initialize and execute the MDP
mdp_model = SequentialOrganAllocationMDP(initial_state, available_organs)
mdp_model.value_iteration()
deltas = mdp_model.get_deltas()
total_reward, total_deaths, total_allocations = mdp_model.simulate_with_policy(steps=30)


total_reward, total_deaths, total_allocations


Iteration 1, States to explore: 1
Pruned to 315 states.
Delta: 18.6
Iteration 2, States to explore: 315
Pruned to 5000 states.
Delta: 18.6
Iteration 3, States to explore: 5000
Pruned to 5000 states.
Delta: 18.6
Iteration 4, States to explore: 5000


In [15]:
df['RECIPIENT_BLOOD_TYPE'].value_counts()

RECIPIENT_BLOOD_TYPE
O     27
A     20
B     11
AB     5
Name: count, dtype: int64

In [110]:
from collections import deque
import numpy as np
import pandas as pd
import random


class SequentialOrganAllocationMDP:
    def __init__(self, initial_state, available_organs):
        """
        Initialize the sequential MDP for organ allocation.

        Parameters:
        - initial_state: List of dictionaries representing waitlist candidates.
        - available_organs: Dictionary with available organs by blood type.
        """
        self.initial_state = (
            tuple((r['id'], r['age'], r['MELD'], r['blood_type'], r['allocated']) for r in initial_state),
            tuple(available_organs.items())
        )
        self.value_table = {self.initial_state: 0}  # Value function initialized for the initial state
        self.policy = {self.initial_state: None}  # Policy initialized for the initial state
        self.deltas = []  # To store deltas over iterations

    def calculate_reward(self, recipient):
        """
        Calculate reward for successfully allocating an organ.

        Parameters:
        - recipient: Tuple representing the recipient receiving the organ.

        Returns:
        - Reward value based on MELD score and age.
        """
        _, age, meld, _, _ = recipient
        return 10 + (1 / (1 + max(meld,0))) * 10 - (age * 0.1) #this is giving inf rewards sometimes

    def generate_next_state(self, state, action):
        """
        Generate the next state based on the current state and action.

        Parameters:
        - state: Tuple (recipients, available_organs).
        - action: Tuple (recipient_id, organ_type).

        Returns:
        - next_state: Updated state after taking the action.
        - reward: Associated reward for the action.
        """
        recipients, available_organs = state
        recipients = list(recipients)
        available_organs = dict(available_organs)

        if action is None:
            return (tuple(recipients), tuple(available_organs.items())), 0

        recipient_id, organ_type = action
        recipient_idx = next((i for i, r in enumerate(recipients) if r[0] == recipient_id), None)

        if recipient_idx is not None and available_organs[organ_type] > 0:
            if random.random() < 0.9:  # 90% success
                recipient = recipients[recipient_idx]
                recipients[recipient_idx] = (recipient[0], recipient[1], recipient[2], recipient[3], 1)
                available_organs[organ_type] -= 1
                reward = self.calculate_reward(recipient)
                return (tuple(recipients), tuple(available_organs.items())), reward
            else:  # 10% failure (recipient dies, organ is removed)
                recipient = recipients.pop(recipient_idx)
                available_organs[organ_type] -= 1
                reward = -100  # Large penalty for death
                return (tuple(recipients), tuple(available_organs.items())), reward

        return (tuple(recipients), tuple(available_organs.items())), 0

    def value_iteration(self, gamma=0.9, epsilon=0.01):
        """
        Perform value iteration for all reachable states starting from the initial state.

        Parameters:
        - gamma: Discount factor (default 0.9).
        - epsilon: Convergence threshold (default 0.01).
        """
        states_to_explore = deque([self.initial_state])
        self.deltas = []  # Reset deltas at the start of value iteration
        iter = 0
        while True:  # Outer loop for global convergence
            delta = 0  # Track the largest change in value across all states
            iter += 1
            print(iter)
            new_states_to_explore = deque()  # To track states added during this iteration

            while states_to_explore:  # Inner loop for processing current states
                current_state = states_to_explore.popleft()
                old_value = self.value_table[current_state]
                max_value = float('-inf')
                best_action = None
                no_valid_actions = True

                recipients, available_organs = current_state
                available_organs = dict(available_organs)
                for recipient in recipients:
                    if recipient[4] == 0:  # Not yet allocated
                        for organ_type in available_organs.keys():
                            if available_organs[organ_type] > 0:
                                action = (recipient[0], organ_type)
                                next_state, reward = self.generate_next_state(current_state, action)
                                value = reward + gamma * self.value_table.get(next_state, 0)
                                no_valid_actions = False
                                if value > max_value:
                                    max_value = value
                                    best_action = action

                                if next_state not in self.value_table:
                                    self.value_table[next_state] = 0
                                    self.policy[next_state] = None
                                    new_states_to_explore.append(next_state)
                if no_valid_actions:
                    #print(f"No valid actions for state: {current_state}")
                    continue

                self.value_table[current_state] = max_value
                #print(max_value)
                self.policy[current_state] = best_action
                delta = max(delta, abs(old_value - max_value))
                #delta = abs(old_value - max_value)

                
            print(delta)
            self.deltas.append(delta)  # Append delta for this iteration
            states_to_explore = new_states_to_explore  # Add newly discovered states

            if delta < epsilon and not states_to_explore:
                print(states_to_explore)
                break  # Stop when values converge and no new states to explore

    def get_deltas(self):
        """
        Retrieve the deltas recorded during value iteration.

        Returns:
        - List of delta values for each iteration.
        """
        return self.deltas #To use for plotting later

    def simulate_with_policy(self, steps=10):
        """
        Simulate the allocation process using the computed policy.

        Parameters:
        - steps: Number of allocation steps to simulate (default 10).

        Returns:
        - Total reward, total deaths, total allocations.
        """
        current_state = self.initial_state
        total_reward = 0
        total_deaths = 0
        total_allocations = 0

        for _ in range(steps):
            action = self.policy.get(current_state, None)
            if action is None:
                break

            next_state, reward = self.generate_next_state(current_state, action)

            if reward == -100:  # Death penalty
                total_deaths += 1
            elif reward > 0:  # Successful allocation
                total_allocations += 1

            total_reward += reward
            current_state = next_state

        return total_reward, total_deaths, total_allocations


# Reinitialize the example with a reduced state space for debugging
np.random.seed(123)
df = pd.read_csv('waitlist_patients.csv')

initial_state = df.apply(
    lambda row: {
        'id': row.name + 1,  # Generate an 'id' starting from 1
        'age': row['RECIPIENT_AGE'],  # Replace with the actual age column if available
        'MELD': row['INIT_MELD_PELD_LAB_SCORE'],
        'blood_type': row['RECIPIENT_BLOOD_TYPE'],
        'allocated': 0  # Default value
    }, axis=1
).tolist()
available_organs = {'A': 1, 'A1': 6, 'A1B': 0, 'A2': 0, 'A2B': 2, 'AB': 0, 'B': 3, 'O': 4, 'AB': 0}

# Initialize and execute the MDP
mdp_model = SequentialOrganAllocationMDP(initial_state, available_organs)
mdp_model.value_iteration()
deltas = mdp_model.get_deltas()
total_reward, total_deaths, total_allocations = mdp_model.simulate_with_policy(steps=10)

# Output the total reward, total deaths, and total allocations
print(total_reward, total_deaths, total_allocations, deltas)


1
18.6
2
18.6
3


KeyboardInterrupt: 

In [None]:
from collections import deque
import numpy as np
import pandas as pd
import random


class SequentialOrganAllocationMDP:
    def __init__(self, initial_state, available_organs):
        """
        Initialize the sequential MDP for organ allocation.

        Parameters:
        - initial_state: List of dictionaries representing waitlist candidates.
        - available_organs: Dictionary with available organs by blood type.
        """
        self.initial_state = (
            tuple((r['id'], r['age'], r['MELD'], r['blood_type'], r['allocated']) for r in initial_state),
            tuple(available_organs.items())
        )
        self.value_table = {self.initial_state: 0}  # Value function initialized for the initial state
        self.policy = {self.initial_state: None}  # Policy initialized for the initial state
        self.deltas = []  # To store deltas over iterations

    def calculate_reward(self, recipient):
        _, age, meld, _, _ = recipient
        if meld < 0 or age < 0:
            print(f"Warning: Invalid MELD or age value. MELD: {meld}, Age: {age}")
        reward = 10 + (1 / (1 + max(meld, 0))) * 10 - (age * 0.1)
        if reward > 100 or reward < -100:  # Arbitrary reward bounds
            print(f"Warning: Extreme reward value: {reward} for recipient {recipient}")
        return reward

    def generate_next_state(self, state, action):
        recipients, available_organs = state
        recipients = list(recipients)
        available_organs = dict(available_organs)

        if action is None:
            return (tuple(recipients), tuple(available_organs.items())), 0

        recipient_id, organ_type = action
        recipient_idx = next((i for i, r in enumerate(recipients) if r[0] == recipient_id), None)

        if recipient_idx is not None and available_organs[organ_type] > 0:
            if random.random() < 0.9:  # 70% success
                recipient = recipients[recipient_idx]
                recipients[recipient_idx] = (recipient[0], recipient[1], recipient[2], recipient[3], 1)
                available_organs[organ_type] -= 1
                reward = self.calculate_reward(recipient)
                print(f"Action success: Recipient {recipient_id} allocated organ {organ_type}. Reward: {reward}")
                return (tuple(recipients), tuple(available_organs.items())), reward
            else:  # 30% failure (recipient dies, organ is removed)
                recipient = recipients.pop(recipient_idx)
                available_organs[organ_type] -= 1
                reward = -100  # Large penalty for death
                print(f"Action failure: Recipient {recipient_id} died. Organ {organ_type} removed. Reward: {reward}")
                return (tuple(recipients), tuple(available_organs.items())), reward

        print(f"Invalid action: No organ {organ_type} available or recipient {recipient_id} not found.")
        return (tuple(recipients), tuple(available_organs.items())), 0


    def value_iteration(self, gamma=0.9, epsilon=0.01):
        """
        Perform value iteration for all reachable states starting from the initial state.
        """
        states_to_explore = deque([self.initial_state])
        self.deltas = []  # Reset deltas at the start of value iteration
        iteration_count = 0

        while True:  # Outer loop for global convergence
            iteration_count += 1
            delta = 0  # Track the largest change in value across all states
            new_states_to_explore = deque()  # To track states added during this iteration

            while states_to_explore:  # Inner loop for processing current states
                current_state = states_to_explore.popleft()
                old_value = self.value_table[current_state]
                max_value = float('-inf')
                best_action = None

                recipients, available_organs = current_state
                available_organs = dict(available_organs)
                no_valid_actions = True
                for recipient in recipients:
                    if recipient[4] == 0:  # Not yet allocated
                        for organ_type in available_organs.keys():
                            if available_organs[organ_type] > 0:
                                action = (recipient[0], organ_type)
                                next_state, reward = self.generate_next_state(current_state, action)
                                no_valid_actions = False
                                # Log state details during the last iterations
                                if iteration_count > 5:  # Customize as needed
                                    print(f"Iteration {iteration_count}, State: {current_state}")
                                    print(f"Action: {action}, Next State: {next_state}, Reward: {reward}")
                                
                                value = reward + gamma * self.value_table.get(next_state, 0)
                                if value > max_value:
                                    max_value = value
                                    best_action = action

                                if next_state not in self.value_table:
                                    self.value_table[next_state] = 0
                                    self.policy[next_state] = None
                                    new_states_to_explore.append(next_state)
                if no_valid_actions:
                    #print(f"No valid actions for state: {current_state}")
                    continue

                self.value_table[current_state] = max_value
                self.policy[current_state] = best_action
                delta = max(delta, abs(old_value - max_value))
            print(delta)
            self.deltas.append(delta)  # Append delta for this iteration
            states_to_explore = new_states_to_explore  # Add newly discovered states

            # Log delta during last few iterations
            if iteration_count > 5:  # Adjust the threshold for logging
                print(f"Iteration {iteration_count}, Delta: {delta}")

            if delta < epsilon and not states_to_explore:
                break  # Stop when values converge and no new states to explore


    def get_deltas(self):
        """
        Retrieve the deltas recorded during value iteration.

        Returns:
        - List of delta values for each iteration.
        """
        return self.deltas #To use for plotting later

    def simulate_with_policy(self, steps=10):
        """
        Simulate the allocation process using the computed policy.

        Parameters:
        - steps: Number of allocation steps to simulate (default 10).

        Returns:
        - Total reward, total deaths, total allocations.
        """
        current_state = self.initial_state
        total_reward = 0
        total_deaths = 0
        total_allocations = 0

        for _ in range(steps):
            action = self.policy.get(current_state, None)
            if action is None:
                break

            next_state, reward = self.generate_next_state(current_state, action)

            if reward == -100:  # Death penalty
                total_deaths += 1
            elif reward > 0:  # Successful allocation
                total_allocations += 1

            total_reward += reward
            current_state = next_state

        return total_reward, total_deaths, total_allocations


# Reinitialize the example with a reduced state space for debugging
np.random.seed(123)
initial_state = [
    {'id': i, 'age': random.randint(20, 70), 'MELD': random.randint(10, 40), 'blood_type': random.choice(['A', 'A1', 'A1B' 'A2', 'A2B', 'AB', 'B', 'O', 'AB']), 'allocated': 0}
    for i in range(1, 6)
]
df = pd.read_csv('waitlist_patients.csv')

initial_state = df.apply(
    lambda row: {
        'id': row.name + 1,  # Generate an 'id' starting from 1
        'age': row['RECIPIENT_AGE'],  # Replace with the actual age column if available
        'MELD': row['INIT_MELD_PELD_LAB_SCORE'],
        'blood_type': row['RECIPIENT_BLOOD_TYPE'],
        'allocated': 0  # Default value
    }, axis=1
).tolist()
available_organs = {'A': 1, 'A1': 6, 'A1B': 0, 'A2': 0, 'A2B': 2, 'AB': 0, 'B': 3, 'O': 4, 'AB': 0}
available_organs = {'A': 1, 'A1': 6, 'A1B': 0, 'A2': 0, 'A2B': 2, 'AB': 0, 'B': 3, 'O': 4, 'AB': 0}
#available_organs = {'A': 1, 'A1': 1, 'A1B': 2, 'A2': 0, 'A2B': 0, 'AB': 0, 'B': 0, 'O': 0, 'AB': 0}
# Initialize and execute the MDP
mdp_model = SequentialOrganAllocationMDP(initial_state, available_organs)
mdp_model.value_iteration()
deltas = mdp_model.get_deltas()
total_reward, total_deaths, total_allocations = mdp_model.simulate_with_policy(steps=10)

# Output the total reward, total deaths, and total allocations
print(total_reward, total_deaths, total_allocations)

total_reward, total_deaths, total_allocations = mdp_model.simulate_with_policy(steps=100)
print(total_reward, total_deaths, total_allocations)

In [112]:
deltas

[7.555555555555555, 7.555555555555555, 7.555555555555555, 50.0, 0]

In [None]:
def value_iteration(self, gamma=0.9, epsilon=0.01):
    """
    Perform value iteration for all reachable states starting from the initial state.
    """
    states_to_explore = deque([self.initial_state])
    self.deltas = []  # Reset deltas at the start of value iteration
    iteration_count = 0

    while True:  # Outer loop for global convergence
        iteration_count += 1
        delta = 0  # Track the largest change in value across all states
        new_states_to_explore = deque()  # To track states added during this iteration

        while states_to_explore:  # Inner loop for processing current states
            current_state = states_to_explore.popleft()
            old_value = self.value_table[current_state]
            max_value = float('-inf')
            best_action = None

            recipients, available_organs = current_state
            available_organs = dict(available_organs)
            for recipient in recipients:
                if recipient[4] == 0:  # Not yet allocated
                    for organ_type in available_organs.keys():
                        if available_organs[organ_type] > 0:
                            action = (recipient[0], organ_type)
                            next_state, reward = self.generate_next_state(current_state, action)
                            
                            # Log state details during the last iterations
                            if iteration_count > 5:  # Customize as needed
                                print(f"Iteration {iteration_count}, State: {current_state}")
                                print(f"Action: {action}, Next State: {next_state}, Reward: {reward}")
                            
                            value = reward + gamma * self.value_table.get(next_state, 0)
                            if value > max_value:
                                max_value = value
                                best_action = action

                            if next_state not in self.value_table:
                                self.value_table[next_state] = 0
                                self.policy[next_state] = None
                                new_states_to_explore.append(next_state)

            self.value_table[current_state] = max_value
            self.policy[current_state] = best_action
            delta = max(delta, abs(old_value - max_value))

        self.deltas.append(delta)  # Append delta for this iteration
        states_to_explore = new_states_to_explore  # Add newly discovered states

        # Log delta during last few iterations
        if iteration_count > 5:  # Adjust the threshold for logging
            print(f"Iteration {iteration_count}, Delta: {delta}")

        if delta < epsilon and not states_to_explore:
            break  # Stop when values converge and no new states to explore


3.1380952380952367

In [None]:
def generate_next_state(self, state, action):
    recipients, available_organs = state
    recipients = list(recipients)
    available_organs = dict(available_organs)

    if action is None:
        return (tuple(recipients), tuple(available_organs.items())), 0

    recipient_id, organ_type = action
    recipient_idx = next((i for i, r in enumerate(recipients) if r[0] == recipient_id), None)

    if recipient_idx is not None and available_organs[organ_type] > 0:
        if random.random() < 0.7:  # 70% success
            recipient = recipients[recipient_idx]
            recipients[recipient_idx] = (recipient[0], recipient[1], recipient[2], recipient[3], 1)
            available_organs[organ_type] -= 1
            reward = self.calculate_reward(recipient)
            print(f"Action success: Recipient {recipient_id} allocated organ {organ_type}. Reward: {reward}")
            return (tuple(recipients), tuple(available_organs.items())), reward
        else:  # 30% failure (recipient dies, organ is removed)
            recipient = recipients.pop(recipient_idx)
            available_organs[organ_type] -= 1
            reward = -100  # Large penalty for death
            print(f"Action failure: Recipient {recipient_id} died. Organ {organ_type} removed. Reward: {reward}")
            return (tuple(recipients), tuple(available_organs.items())), reward

    print(f"Invalid action: No organ {organ_type} available or recipient {recipient_id} not found.")
    return (tuple(recipients), tuple(available_organs.items())), 0


In [27]:
df = pd.read_csv('waitlist_patients.csv')

initial_state = df.apply(
    lambda row: {
        'id': row.name + 1,  # Generate an 'id' starting from 1
        'age': row['RECIPIENT_AGE'],  # Replace with the actual age column if available
        'MELD': row['INIT_MELD_PELD_LAB_SCORE'],
        'blood_type': row['RECIPIENT_BLOOD_TYPE'],
        'allocated': 0  # Default value
    }, axis=1
).tolist()

In [None]:
import pandas as pd


# Convert DataFrame to the desired format
initial_state = df.apply(
    lambda row: {
        'id': row.name + 1,  # Generate an 'id' starting from 1
        'age': row['RECIPIENT_AGE'],  # Replace with the actual age column if available
        'MELD': row['INIT_MELD_PELD_LAB_SCORE'],
        'blood_type': row['RECIPIENT_BLOOD_TYPE'],
        'allocated': 0  # Default value
    }, axis=1
).tolist()

result_list


[{'id': 1, 'age': 62, 'MELD': 36, 'blood_type': 'A', 'allocated': 0},
 {'id': 2, 'age': 61, 'MELD': 16, 'blood_type': 'O', 'allocated': 0},
 {'id': 3, 'age': 46, 'MELD': 16, 'blood_type': 'B', 'allocated': 0},
 {'id': 4, 'age': 57, 'MELD': 18, 'blood_type': 'B', 'allocated': 0},
 {'id': 5, 'age': 46, 'MELD': 21, 'blood_type': 'A', 'allocated': 0},
 {'id': 6, 'age': 59, 'MELD': 14, 'blood_type': 'A', 'allocated': 0},
 {'id': 7, 'age': 67, 'MELD': 47, 'blood_type': 'B', 'allocated': 0},
 {'id': 8, 'age': 68, 'MELD': 19, 'blood_type': 'O', 'allocated': 0},
 {'id': 9, 'age': 52, 'MELD': 28, 'blood_type': 'O', 'allocated': 0},
 {'id': 10, 'age': 57, 'MELD': 7, 'blood_type': 'A', 'allocated': 0},
 {'id': 11, 'age': 53, 'MELD': 41, 'blood_type': 'O', 'allocated': 0},
 {'id': 12, 'age': 67, 'MELD': 13, 'blood_type': 'B', 'allocated': 0},
 {'id': 13, 'age': 14, 'MELD': -8, 'blood_type': 'O', 'allocated': 0},
 {'id': 14, 'age': 14, 'MELD': 6, 'blood_type': 'A', 'allocated': 0},
 {'id': 15, 'age'