In [6]:
#PPo based constant form question:Similar env with reinforce

In [1]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

df = pd.read_excel('knowledge_base1.xlsx')

FEATURES = ["Hair", "Feathers", "Eggs", "Milk", "Airborne", "Aquatic", "Predator", "Toothed",
            "Backbone", "Breathes", "Venomous", "Fins", "Nlegs_0", "Nlegs_2", "Nlegs_4",
            "Nlegs_5", "Nlegs_6", "Nlegs_8", "Tail", "Domestic", "Catsize",
            "Mammal", "Bird", "Reptile", "Fish", "Amphibian", "Insect", "Invertebrate"]

animal_features = df.iloc[:, 1:].values
NUM_FEATURES = len(FEATURES)
print("Dataset loaded. Number of features:", NUM_FEATURES)

Dataset loaded. Number of features: 28


In [2]:
class Animal20QEnv:
    def __init__(self, animal_features, max_steps=20):
        self.animal_features = animal_features
        self.num_animals = len(animal_features)
        self.num_features = animal_features.shape[1]
        self.max_steps = max_steps

    def reset(self):
        self.target_idx = random.randint(0, self.num_animals - 1)
        self.remaining_animals = list(range(self.num_animals))
        self.asked_features = set()
        self.steps = 0
        self.state = np.zeros(self.num_features, dtype=np.float32)
        return self.state

    def step(self, feature_idx):
        previous_remaining = len(self.remaining_animals)
        answer = self.animal_features[self.target_idx][feature_idx]
        self.remaining_animals = [idx for idx in self.remaining_animals if self.animal_features[idx][feature_idx] == answer]
        self.state[feature_idx] = answer
        self.steps += 1
        done = self.steps >= self.max_steps

        eliminated = previous_remaining - len(self.remaining_animals)
        reward = eliminated
        if eliminated >= 10:
            reward += 10
        if feature_idx in self.asked_features:
            reward -= 5
        else:
            self.asked_features.add(feature_idx)

        if done:
            guessed_idx = self.guess_animal()
            reward += 50 if guessed_idx == self.target_idx else -10

        return self.state.copy(), reward, done

    def guess_animal(self):
        if np.all(self.state == 0):
            return random.choice(self.remaining_animals)
        match_scores = self.animal_features @ self.state
        return np.argmax(match_scores)

In [3]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(state_dim, 128), nn.ReLU(),
            nn.Linear(128, 128), nn.ReLU())
        self.actor = nn.Linear(128, action_dim)
        self.critic = nn.Linear(128, 1)

    def forward(self, x):
        shared = self.shared(x)
        return self.actor(shared), self.critic(shared)

class PPO:
    def __init__(self, state_dim, action_dim):
        self.model = ActorCritic(state_dim, action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.gamma = 0.99
        self.eps_clip = 0.2

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        logits, _ = self.model(state)
        probs = torch.softmax(logits, dim=-1)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def train_step(self, states, actions, rewards, log_probs, next_states, dones):
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        log_probs = torch.stack(log_probs)
        dones = torch.tensor(dones, dtype=torch.float32)

        _, next_values = self.model(torch.tensor(next_states, dtype=torch.float32))
        _, values = self.model(states)
        values = values.squeeze()
        next_values = next_values.squeeze()

        returns = rewards + self.gamma * next_values * (1 - dones)
        advantages = returns - values

        new_logits, _ = self.model(states)
        new_probs = torch.softmax(new_logits, dim=-1)
        new_dist = torch.distributions.Categorical(new_probs)
        new_log_probs = new_dist.log_prob(actions)

        ratio = torch.exp(new_log_probs - log_probs.detach())
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
        loss = -torch.min(surr1, surr2).mean() + (returns - values).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [None]:
env = Animal20QEnv(animal_features=animal_features)
ppo = PPO(state_dim=NUM_FEATURES, action_dim=NUM_FEATURES)

episodes = 15000
success_log = []

for ep in range(episodes):
    state = env.reset()
    done = False
    total_reward = 0
    log_probs = []
    states = []
    actions = []
    rewards = []
    next_states = []
    dones = []
    
    episode_questions = []  # ✅ Store constant-form questions

    while not done:
        action, log_prob = ppo.select_action(state)
        next_state, reward, done = env.step(action)

        # ✅ Constant-form question
        feature_name = FEATURES[action]
        question = f"Does the animal have {feature_name.replace('_', ' ').lower()}?"
        episode_questions.append(question)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_state)
        dones.append(done)
        log_probs.append(log_prob)

        state = next_state
        total_reward += reward

    ppo.train_step(states, actions, rewards, log_probs, next_states, dones)
    success = int(env.guess_animal() == env.target_idx)
    success_log.append(success)

    if (ep + 1) % 50 == 0:
        avg_success = np.mean(success_log[-50:])
        print(f"\nEpisode {ep + 1}, Average Success Rate: {avg_success:.2f}")
        print("Sample Questions:")
        for q in episode_questions[:5]:  # Show first 5
            print(" -", q)



Episode 50, Average Success Rate: 0.26
Sample Questions:
 - Does the animal have hair?
 - Does the animal have aquatic?
 - Does the animal have tail?
 - Does the animal have nlegs 8?
 - Does the animal have nlegs 6?

Episode 100, Average Success Rate: 0.18
Sample Questions:
 - Does the animal have predator?
 - Does the animal have feathers?
 - Does the animal have nlegs 4?
 - Does the animal have predator?
 - Does the animal have nlegs 4?

Episode 150, Average Success Rate: 0.20
Sample Questions:
 - Does the animal have nlegs 6?
 - Does the animal have milk?
 - Does the animal have milk?
 - Does the animal have reptile?
 - Does the animal have bird?

Episode 200, Average Success Rate: 0.10
Sample Questions:
 - Does the animal have milk?
 - Does the animal have nlegs 4?
 - Does the animal have nlegs 2?
 - Does the animal have eggs?
 - Does the animal have eggs?

Episode 250, Average Success Rate: 0.20
Sample Questions:
 - Does the animal have milk?
 - Does the animal have toothed?
 - D

In [None]:
plt.plot(np.convolve(success_log, np.ones(50)/50, mode='valid'))
plt.xlabel("Episode")
plt.ylabel("Success Rate (50-ep avg)")
plt.title("PPO Success Rate on Animal 20Q")
plt.grid(True)
plt.show()