In [None]:
#fist_try_with_knowledge_using Reinforce algorithm with torch:


In [19]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# import pandas lib as pd
import pandas as pd

# read by default 1st sheet of an excel file
df = pd.read_excel('knowledge_base1.xlsx')

# Define Features
FEATURES = ["Hair", "Feathers", "Eggs", "Milk", "Airborne", "Aquatic", "Predator", "Toothed",
            "Backbone", "Breathes", "Venomous", "Fins", "Nlegs_0", "Nlegs_2", "Nlegs_4",
            "Nlegs_5", "Nlegs_6", "Nlegs_8", "Tail", "Domestic", "Catsize","Mammal","Bird","Reptile","Fish","Amphibian","Insect","Invertebrate"
]

NUM_ANIMALS = 100  
NUM_FEATURES = len(FEATURES)
MAX_STEPS = 20

# Generate random animal feature matrix
animal_features = df.iloc[:, 1:].values  # Excluding the animal names

# Environment Class
class AnimalQuestionEnv:
    def __init__(self):
        self.animals = list(range(NUM_ANIMALS))
        self.target_animal = random.choice(self.animals)
        self.remaining_animals = set(self.animals)
        self.state = np.ones(NUM_FEATURES)
        self.step_count = 0
        self.done = False

    def reset(self):
        self.target_animal = random.choice(self.animals)
        self.remaining_animals = set(self.animals)
        self.state = np.ones(NUM_FEATURES)
        self.step_count = 0
        self.done = False
        return self.state

    def step(self, action):
        """Handles selecting a question (feature index)."""
        if self.done:
            return self.state, 0, self.done

        feature_idx = int(action.item())  # Convert tensor to int
        feature_idx = max(0, min(NUM_FEATURES - 1, feature_idx))  # Ensure valid index

        correct_answer = animal_features[self.target_animal][feature_idx]

        # Remove animals that do not match the answer
        self.remaining_animals = {
            a for a in self.remaining_animals if animal_features[a][feature_idx] == correct_answer
        }

        # Update state
        self.state[feature_idx] = correct_answer
        self.step_count += 1

        # Reward based on elimination
        reward = len(self.animals) - len(self.remaining_animals)

        if self.step_count >= MAX_STEPS or len(self.remaining_animals) == 1:
            self.done = True
            guessed_animal = next(iter(self.remaining_animals), -1)
            reward = 30 if guessed_animal == self.target_animal else -30

        return self.state, reward, self.done

# Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(NUM_FEATURES, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, NUM_FEATURES)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return torch.softmax(self.output_layer(x), dim=-1)

# Training Setup
env = AnimalQuestionEnv()
policy_net = PolicyNetwork()
optimizer = optim.Adam(policy_net.parameters(), lr=0.001)

# Train using REINFORCE Algorithm
gamma = 0.99
num_episodes = 1500

for episode in range(num_episodes):
    state = env.reset()
    log_probs = []
    rewards = []

    for t in range(MAX_STEPS):
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # Add batch dim
        action_probs = policy_net(state_tensor)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()

        log_prob = action_dist.log_prob(action)
        log_probs.append(log_prob)

        next_state, reward, done = env.step(action)
        rewards.append(reward)

        if done:
            break
        state = next_state

    # Compute discounted rewards
    returns = []
    G = 0
    for r in reversed(rewards):#Expected rewards
        G = r + gamma * G
        returns.insert(0, G)

    returns = torch.tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # Normalize

    # Compute policy loss
    policy_loss = []
    for log_prob, R in zip(log_probs, returns):
        policy_loss.append(-log_prob * R)
    
    optimizer.zero_grad()
    sum(policy_loss).backward()
    optimizer.step()

    if episode % 50 == 0:
        print(f"Episode {episode}, Remaining Animals: {len(env.remaining_animals)}, Reward: {sum(rewards)}")

print("Training complete!")


Episode 0, Remaining Animals: 5, Reward: 1425
Episode 50, Remaining Animals: 2, Reward: 1629
Episode 100, Remaining Animals: 4, Reward: 1614
Episode 150, Remaining Animals: 5, Reward: 1617
Episode 200, Remaining Animals: 2, Reward: 1735
Episode 250, Remaining Animals: 8, Reward: 1561
Episode 300, Remaining Animals: 8, Reward: 1565
Episode 350, Remaining Animals: 6, Reward: 1631
Episode 400, Remaining Animals: 8, Reward: 1305
Episode 450, Remaining Animals: 25, Reward: 1184
Episode 500, Remaining Animals: 2, Reward: 1587
Episode 550, Remaining Animals: 3, Reward: 1542
Episode 600, Remaining Animals: 6, Reward: 1465
Episode 650, Remaining Animals: 11, Reward: 1359
Episode 700, Remaining Animals: 3, Reward: 1428
Episode 750, Remaining Animals: 1, Reward: 1235
Episode 800, Remaining Animals: 7, Reward: 1617
Episode 850, Remaining Animals: 25, Reward: 1195
Episode 900, Remaining Animals: 3, Reward: 1560
Episode 950, Remaining Animals: 5, Reward: 1331
Episode 1000, Remaining Animals: 2, Rewa