<a href="https://colab.research.google.com/github/jonathanjander/Best-README-Template/blob/master/textworld.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Textworld example

## notes
- [official pytorch example](https://colab.research.google.com/github/microsoft/TextWorld/blob/main/notebooks/Building%20a%20simple%20agent.ipynb#scrollTo=ChrM9GGGlrtf)

In [1]:
!pip install textworld
#!pip install gym
#!pip install gym==0.21

Collecting textworld
  Downloading textworld-1.6.1.tar.gz (708 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m708.6/708.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tatsu>=5.8.3 (from textworld)
  Downloading TatSu-5.8.3-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hashids>=1.2.0 (from textworld)
  Downloading hashids-1.3.1-py2.py3-none-any.whl (6.6 kB)
Collecting jericho>=3.0.3 (from textworld)
  Downloading jericho-3.1.2.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?2

In [2]:
import numpy as np
from glob import glob
import os
import re
from typing import List, Mapping, Any, Optional
from collections import defaultdict, Counter
import numpy as np
import textworld
import textworld.gym
from textworld import EnvInfos
from time import time


import matplotlib.pyplot as plt
import random

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer

## init

In [3]:
#!tw-make custom --world-size 2 --quest-length 4 --nb-objects 10 --output tw_games/game.ulx -f -v --seed 123
!tw-make tw-simple --rewards dense  --goal detailed --seed 42 --output games/tw-rewardsDense_goalDetailed.z8 -v -f

Global seed: 42
Game generated: /content/games/tw-rewardsDense_goalDetailed.z8

Objective:
I hope you're ready to go into rooms and interact with objects, because you've just entered TextWorld! Here is how to play! First stop, open the chest drawer. After that, recover the old key from the chest drawer within the bedroom. Then, check that the wooden door is unlocked with the old key. And then, open the wooden door inside the bedroom. Then, go to the east. And then, look and see that the screen door in the kitchen is ajar. After that, make an effort to venture east. And then, attempt to move south. With that accomplished, lift the bell pepper from the floor of the garden. After that, move north. Okay, and then, attempt to take a trip west. Following that, rest the bell pepper on the stove. And once you've done that, you win!

Walkthrough:
open chest drawer > take old key from chest drawer > unlock wooden door with old key > open wooden door > go east > open screen door > go east > go so

In [4]:
request_infos = textworld.EnvInfos(
    admissible_commands=True,  # All commands relevant to the current state.
    entities=True,             # List of all interactable entities found in the game.
    facts=True,  # All the facts that are currently true about the world.
    intermediate_reward=True,
    max_score = True,
    inventory=True,
    description=True,
    command_templates = True
)

In [6]:
# Requesting additional information should be done when registering the game.
#env_id = textworld.gym.register_game('tw_games/game.ulx', request_infos)
env_id = textworld.gym.register_game("./games/tw-rewardsDense_goalDetailed.z8", request_infos)
env = textworld.gym.make(env_id)

obs, infos = env.reset()
print("Entities: {}\n".format(infos["entities"]))
print("Admissible commands:\n  {}".format("\n  ".join(infos["admissible_commands"])))
#print("command_templates:\n  {}".format("\n  ".join(infos["command_templates"])))

Entities: ['wooden door', 'screen door', 'chest drawer', 'antique trunk', 'refrigerator', 'toilet', 'bath', 'lettuce', 'bell pepper', 'apple', 'shovel', 'king-size bed', 'counter', 'set of chairs', 'stove', 'kitchen island', 'sink', 'couch', 'low table', 'tv', 'bbq', 'patio table', 'tomato plant', 'half of a bag of chips', 'milk', 'old key', 'soap bar', 'toothbrush', 'remote', 'note', 'north', 'south', 'east', 'west']

Admissible commands:
  examine antique trunk
  examine chest drawer
  examine king-size bed
  examine wooden door
  inventory
  look
  open antique trunk
  open chest drawer


In [7]:
action_space = len(infos["admissible_commands"])
state_space = 20 # number of rooms times number of items 2*10 (dont think this is correct)

### playing the game

In [None]:
try:
    done = False
    obs, _ = env.reset()
    print(obs)

    print(infos["admissible_commands"])
    nb_moves = 0
    while not done:
        command = input("> ")
        obs, score, done, infos = env.step(command)
        print(obs)
        print('Score',score)
        print(infos["admissible_commands"])
        nb_moves += 1

except KeyboardInterrupt:
    pass  # Press the stop button in the toolbar to quit the game.

print("Played {} steps, scoring {} points.".format(nb_moves, score))

### gpt-4 attempt using function approximation

In [113]:
class EPSdecay:
    def __init__(self,min_epsilon=0.01,max_epsilon=1.0,decay_rate=0.01):
        self.min_epsilon = min_epsilon
        self.max_epsilon = max_epsilon
        self.decay_rate = decay_rate
    def __call__(self,episode)->float:
        return self.min_epsilon + (self.max_epsilon - self.min_epsilon)*np.exp(-self.decay_rate*episode)

In [114]:
def build_vocabulary(descriptions):
    word_counts = Counter(word for desc in descriptions for word in desc.split())
    vocabulary = {word: i for i, word in enumerate(word_counts.keys())}
    return vocabulary

# Example descriptions from your text-based game
descriptions = [
    "You see a key and a door",
    "You are in a dark room",
    # ... more descriptions from your game
]

vocabulary = build_vocabulary(descriptions)

def preprocess_state(description, vocabulary):
    state_vector = np.zeros(len(vocabulary))
    for word in description.split():
        if word in vocabulary:
            state_vector[vocabulary[word]] += 1
    return state_vector

# Example usage
state_description = "You are in a room with a key"
state_vector = preprocess_state(state_description, vocabulary)


In [115]:
def build_model(input_size, output_size):
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_shape=(input_size,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(output_size)
    ])
    return model


In [116]:
def train_step(model, optimizer, state, action, reward, next_state, done):
    # Predict Q-values for the current state
    with tf.GradientTape() as tape:
        q_values = model(state)
        q_action = tf.reduce_sum(tf.one_hot(action, action_space) * q_values, axis=1)

        # Predict the Q-values for next state
        q_values_next = model(next_state)
        q_next = tf.reduce_max(q_values_next, axis=1)
        #bellman equation
        q_target = reward + (1 - done) * discount_factor * q_next

        # Calculate loss
        loss = tf.reduce_mean(tf.square(q_target - q_action))

    # Backpropagate the error
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))


In [117]:
discount_factor = 0.95
num_episodes = 20
learning_rate = 0.001
#epsilon = 0.5 # use fixed epsilon
epsilon = EPSdecay()

In [118]:
optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
model = build_model(len(state_vector), action_space)

In [283]:
# Collect some statistics: nb_steps, final reward.
avg_moves, avg_scores, avg_norm_scores = [], [], []
for episode in range(num_episodes):
    state, infos = env.reset()
    done = False
    total_rewards = 0
    total_steps = 0
    while not done:
      #print('episode: ', episode)

      state_vector = preprocess_state(state, vocabulary)
      state_vector = np.expand_dims(state_vector, axis=0) # Convert from shape (features,) to (1, features)

      # Select action using epsilon-greedy policy
      if random.uniform(0,1) > epsilon(episode):
        q_values = model.predict(state_vector, verbose=0)[0]
        action_ind = np.argmax(q_values)
        print(q_values)
      else:
        action_ind = random.randrange(len(infos["admissible_commands"]))
        #action = infos["admissible_commands"][action_ind] # random action for now

      action = infos["admissible_commands"][action_ind]
      #print(action)
      next_state, reward, done, infos = env.step(action)
      total_steps += 1
      total_rewards += reward

      # Preprocess states if required
      next_state_vector = preprocess_state(next_state, vocabulary)
      next_state_vector = np.expand_dims(next_state_vector, axis=0) # Convert from shape (features,) to (1, features)

      #print(next_state_vector)
      # Perform training step
      train_step(model, optimizer, state_vector, action_ind, reward, next_state_vector, done)

      state = next_state

      avg_moves.append(total_steps)
      avg_scores.append(reward)
      avg_norm_scores.append(reward / infos["max_score"])
    #print('total steps', total_steps)
    #print('total rewards', total_rewards)
    print('avg moves', np.mean(avg_moves))
    print('avg score', "{:.2f}".format(np.mean(avg_scores)))
    print('avg normalized score', "{:.2f}".format(np.mean(avg_norm_scores)))


avg moves 20.5
avg score 0.03
avg normalized score 0.03
avg moves 23.27777777777778
avg score 0.01
avg normalized score 0.01


KeyboardInterrupt: ignored

# trying to implement the example code to tensorflow (my implementation)
- using a GRU
- using embedding

In [6]:
# building a random baseline
class RandomAgent(textworld.gym.Agent):
    """ Agent that randomly selects a command from the admissible ones. """
    def __init__(self, seed=1234):
        self.seed = seed
        self.rng = np.random.RandomState(self.seed)

    @property
    def infos_to_request(self) -> textworld.EnvInfos:
        return textworld.EnvInfos(admissible_commands=True)

    def act(self, obs: str, score: int, done: bool, infos: Mapping[str, Any]) -> str:
        return self.rng.choice(infos["admissible_commands"])

In [5]:
# play function
def play(agent, path, max_step=50, nb_episodes=10, verbose=True):
    # torch.manual_seed(20211021)  # For reproducibility when using action sampling.

    infos_to_request = agent.infos_to_request
    infos_to_request.max_score = True  # Needed to normalize the scores.

    gamefiles = [path]
    if os.path.isdir(path):
        gamefiles = glob(os.path.join(path, "*.z8"))

    env_id = textworld.gym.register_games(gamefiles,
                                          request_infos=infos_to_request,
                                          max_episode_steps=max_step)
    env = textworld.gym.make(env_id)  # Create a Gym environment to play the text game.
    if verbose:
        if os.path.isdir(path):
            print(os.path.dirname(path), end="")
        else:
            print(os.path.basename(path), end="")

    # Collect some statistics: nb_steps, final reward.
    avg_moves, avg_scores, avg_norm_scores = [], [], []
    for no_episode in range(nb_episodes):
        obs, infos = env.reset()  # Start new episode.

        score = 0
        done = False
        nb_moves = 0
        while not done:
            command = agent.act(obs, score, done, infos) # CHANGE FOR TF
            #command = agent.act(obs, done, infos)
            obs, score, done, infos = env.step(command)
            nb_moves += 1

        agent.act(obs, score, done, infos)  # Let the agent know the game is done. CHANGE FOR TF
        # agent.act(obs, done, infos)  # Let the agent know the game is done. CHANGE FOR TF

        if verbose:
            print(".", end="")
        avg_moves.append(nb_moves)
        avg_scores.append(score)
        avg_norm_scores.append(score / infos["max_score"])

    env.close()
    if verbose:
        if os.path.isdir(path):
            msg = "  \tavg. steps: {:5.1f}; avg. normalized score: {:4.1f} / {}."
            print(msg.format(np.mean(avg_moves), np.mean(avg_norm_scores), 1))
        else:
            msg = "  \tavg. steps: {:5.1f}; avg. score: {:4.1f} / {}."
            print(msg.format(np.mean(avg_moves), np.mean(avg_scores), infos["max_score"]))

In [163]:
#agent = RandomAgent()
#play(agent, 'tw_games/game.ulx')

NameError: ignored

In [23]:
# TENSORFLOW
# TODO IMPLEMENT THE CRITIC
# TODO INCREASE PERFORMANCE
class CommandScorer(models.Model):
    def __init__(self, input_size, hidden_size):
        super(CommandScorer, self).__init__()
        self.embedding = layers.Embedding(input_size, hidden_size)
        self.encoder_gru = layers.GRU(hidden_size, return_sequences=True, return_state=True)
        self.cmd_encoder_gru = layers.GRU(hidden_size, return_state=True)
        self.state_gru = layers.GRU(hidden_size, return_state=True)
        self.critic = layers.Dense(1) # Critic for state value estimation
        self.att_cmd = layers.Dense(1) # Attention mechanism for commands

    def call(self, obs, commands):
        # Process observation
        embedded_obs = self.embedding(obs)
        _, encoder_hidden = self.encoder_gru(embedded_obs)

        # Expand dimensions of encoder_hidden to fit GRU input requirements
        # for critic state value prediction
        encoder_hidden_expanded = tf.expand_dims(encoder_hidden, axis=1)
        state_output, _ = self.state_gru(encoder_hidden_expanded)
        value = self.critic(state_output)

        # Process commands
        cmds_embedding = self.embedding(commands)  # Shape: (num_commands, cmd_length, hidden_size)
        cmd_length = cmds_embedding.shape[1]

        # Reshape for batch processing
        cmds_embedding_reshaped = tf.reshape(cmds_embedding, (-1, cmd_length, self.embedding.output_dim))
        batch_size = tf.shape(cmds_embedding_reshaped)[0]

        # Process each command as a separate sequence
        _, cmds_encoding_last_states = self.cmd_encoder_gru(cmds_embedding_reshaped, initial_state=tf.zeros((batch_size, self.encoder_gru.units)))

        # Reshape to get separate encodings for each command
        cmds_encoding_last_states = tf.reshape(cmds_encoding_last_states, (1, -1, self.encoder_gru.units))

        # Prepare state representation
        state_hidden_repeated = tf.repeat(tf.expand_dims(encoder_hidden, axis=1), repeats=tf.shape(commands)[0], axis=1)

        # Concatenate state and command encodings
        cmd_selector_input = tf.concat([state_hidden_repeated, cmds_encoding_last_states], axis=-1)

        # Compute scores and select action
        scores = self.att_cmd(cmd_selector_input)
        scores = tf.squeeze(scores, axis=-1)

        # Calculate probabilities and sample an action
        probs = tf.nn.softmax(scores, axis=1)
        index = tf.random.categorical(tf.math.log(probs), num_samples=1)

        return scores, index, value



    def call_old(self, obs, commands):
        embedded = self.embedding(obs)
        _, encoder_hidden = self.encoder_gru(embedded)

        cmds_embedding = self.embedding(commands) # (8, 3, 128)
        cmds_encoding_last_states_list = []

        # Process each command independently
        for i in range(cmds_embedding.shape[0]):  # Iterate over each command
            # Treat each command as a single time-step sequence
            cmd = tf.expand_dims(cmds_embedding[i], axis=0)  # Shape: (1, 3, 128)
            _, cmd_encoding = self.cmd_encoder_gru(cmd)  # Shape: (1, 128)
            cmd_encoding = tf.reshape(cmd_encoding, (1, 1, -1))
            cmds_encoding_last_states_list.append(cmd_encoding)


        cmds_encoding_last_states = tf.concat(cmds_encoding_last_states_list, axis=1)  # Shape: (1, 8, 128)

        state_hidden_repeated = tf.repeat(tf.expand_dims(encoder_hidden, axis=1), repeats=tf.shape(commands)[0] , axis=1)  # Shape: (1, 8, 128)

        cmd_selector_input = tf.concat([state_hidden_repeated, cmds_encoding_last_states], axis=-1)  # Shape: (1, 1, 8, 256)


        scores = self.att_cmd(cmd_selector_input) # DYNAMIC ATTEMPT
        scores = tf.squeeze(scores, axis=-1)

        # Calculate probabilities and sample an action
        probs = tf.nn.softmax(scores, axis=1)
        index = tf.random.categorical(tf.math.log(probs), num_samples=1)  # Use log probabilities for stability

        return scores, index, value


In [57]:
# TENSORFLOW

class NeuralAgent:
    # ... (Initialization and utility functions remain largely the same)
    """ Simple Neural Agent for playing TextWorld games. """
    MAX_VOCAB_SIZE = 1000
    UPDATE_FREQUENCY = 10
    LOG_FREQUENCY = 1000
    GAMMA = 0.9
    LR = 0.001

    def __init__(self) -> None:
        self.tokenizer = Tokenizer(num_words=self.MAX_VOCAB_SIZE, oov_token="<UNK>")
        self._initialized = False
        self._epsiode_has_started = False
        self.id2word = ["<PAD>", "<UNK>"]
        self.word2id = {w: i for i, w in enumerate(self.id2word)}

        self.model = CommandScorer(input_size=self.MAX_VOCAB_SIZE, hidden_size=128)
        #self.optimizer = optim.Adam(self.model.parameters(), 0.00003) # CHANGE FOR TF
        self.optimizer = tf.optimizers.Adam(learning_rate=self.LR)
        self.mode = "test"


    def train(self): # CHANGE FOR TF
        self.mode = "train"
        self.stats = {"max": defaultdict(list), "mean": defaultdict(list)}
        self.transitions = []
        #self.model.reset_hidden(1)
        self.last_score = 0
        self.no_train_step = 0

    def test(self): # CHANGE FOR TF
        self.mode = "test"
        #self.model.reset_hidden(1)


    @property
    def infos_to_request(self) -> EnvInfos: # WORKING
        return EnvInfos(description=True, inventory=True, admissible_commands=True,
                        won=True, lost=True)

    def _get_word_id(self, word): # WORKING

        #print('GET WORD ID METHOD')
        if word not in self.word2id:
            if len(self.word2id) >= self.MAX_VOCAB_SIZE:
                return self.word2id["<UNK>"]

            self.id2word.append(word)
            self.word2id[word] = len(self.word2id)

        return self.word2id[word]

    def _tokenize(self, text): # WORKING

        #print('TOKENIZE METHOD')
        # Simple tokenizer: strip out all non-alphabetic characters.
        text = re.sub("[^a-zA-Z0-9\- ]", " ", text)
        word_ids = list(map(self._get_word_id, text.split()))
        return word_ids

    def _process(self, texts, tokenizer): # WORKING

        #print('PROCESS METHOD')
        #tokenized_texts = tokenizer.texts_to_sequences(texts)
        texts = list(map(self._tokenize, texts))
        max_len = max(len(l) for l in texts)
        padded = np.ones((len(texts), max_len)) * self.word2id["<PAD>"]
        #padded_texts = tf.keras.preprocessing.sequence.pad_sequences(tokenized_texts, maxlen=max_len)
        return padded

    def _discount_rewards(self, last_values): # CHANGE FOR TF
        returns, advantages = [], []
        R = last_values.data
        for t in reversed(range(len(self.transitions))):
            rewards, _, _, values = self.transitions[t]
            R = rewards + self.GAMMA * R
            adv = R - values
            returns.append(R)
            advantages.append(adv)

        return returns[::-1], advantages[::-1]

    def _train_loop(self):
      print('TRAIN_MODEL METHOD')

      if not self.transitions:
          return

      # Unpack the transitions
      rewards, actions, q_values, values = zip(*self.transitions)


      # Convert lists to TensorFlow tensors
      rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
      actions = tf.stack(actions)  # Assuming actions are already tensors
      q_values = tf.stack(q_values)
      values = tf.stack(values)

      # Compute target Q-values
      target_q_values = rewards + self.GAMMA * tf.reduce_max(q_values, axis=2)

      with tf.GradientTape() as tape:
          # Compute the loss
          # This part depends on your network's output and the specific algorithm (Q-learning, SARSA, etc.)
          # Here's a basic implementation assuming Q-learning:

          # Gather Q-values of taken actions
          action_q_values = tf.reduce_sum(tf.one_hot(actions, depth=self.num_actions) * q_values, axis=2)
          loss = tf.reduce_mean(tf.square(target_q_values - action_q_values))

      # Compute gradients
      gradients = tape.gradient(loss, self.model.trainable_variables)

      # Apply gradients
      self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

      # Clear transitions
      self.transitions.clear()

    def act(self, obs, score, done, infos):
        #print('ACT METHOD')
        # Convert observation and commands to model input format

        # Build agent's observation: feedback + look + inventory.
        input_ = "{}\n{}\n{}".format(obs, infos["description"], infos["inventory"])

        input_tensor = self._process([input_], self.tokenizer)

        commands_tensor = self._process(infos["admissible_commands"], self.tokenizer)

        # Get model's output - scores for each command and value estimation
        outputs, indexes, value = self.model(input_tensor, commands_tensor)
        chosen_action_index = tf.squeeze(indexes).numpy()


        action = infos["admissible_commands"][chosen_action_index]
        #print(action)
        # test
        if self.mode == "test":
            if done:
                print('DONE')
                #self.model.reset_hidden(1)
            return action

        # Training logic
        self.no_train_step += 1

        if self.transitions:
            # Calculate reward
            reward = score - self.last_score
            self.last_score = score
            if infos["won"]:
                reward += 100
            if infos["lost"]:
                reward -= 100
            print(reward)
            # Update the last transition with the calculated reward
            self.transitions[-1][0] = reward

        # Store transition for training
        self.transitions.append([None, indexes, outputs, value])
        # Perform training at specified frequency
        if self.no_train_step % self.UPDATE_FREQUENCY == 0:
            self._train_loop()

        if done:
            self.last_score = 0
            self.model.reset_hidden(1)

        return action





In [58]:
nagent = NeuralAgent()
nagent.train()
play(nagent, 'games/tw-rewardsDense_goalDetailed.z8', max_step=50, nb_episodes=2)

tw-rewardsDense_goalDetailed.z80
0
0
0
0
0
0
0
0
TRAIN_MODEL METHOD
[[0, <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[3]])>, <tf.Tensor: shape=(1, 8), dtype=float32, numpy=
array([[0.00087634, 0.00087634, 0.00087634, 0.00087634, 0.00087634,
        0.00087634, 0.00087634, 0.00087634]], dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00642286]], dtype=float32)>], [0, <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[6]])>, <tf.Tensor: shape=(1, 8), dtype=float32, numpy=
array([[0.00087634, 0.00087634, 0.00087634, 0.00087634, 0.00087634,
        0.00087634, 0.00087634, 0.00087634]], dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00642286]], dtype=float32)>], [0, <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[6]])>, <tf.Tensor: shape=(1, 8), dtype=float32, numpy=
array([[0.00087634, 0.00087634, 0.00087634, 0.00087634, 0.00087634,
        0.00087634, 0.00087634, 0.00087634]], dtype=float32)>, <tf.Tensor: shape=(1, 1), 

ValueError: ignored

In [13]:
rangent = RandomAgent()
play(rangent, 'games/tw-rewardsDense_goalDetailed.z8', max_step=50, nb_episodes=10)

tw-rewardsDense_goalDetailed.z8..........  	avg. steps:  50.0; avg. score:  2.6 / 10.


In [None]:
# torch code
import re
from typing import List, Mapping, Any, Optional
from collections import defaultdict

import numpy as np

import textworld
import textworld.gym
from textworld import EnvInfos

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class CommandScorer(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CommandScorer, self).__init__()
        torch.manual_seed(42)  # For reproducibility
        self.embedding    = nn.Embedding(input_size, hidden_size)
        self.encoder_gru  = nn.GRU(hidden_size, hidden_size)
        self.cmd_encoder_gru  = nn.GRU(hidden_size, hidden_size)
        self.state_gru    = nn.GRU(hidden_size, hidden_size)
        self.hidden_size  = hidden_size
        self.state_hidden = torch.zeros(1, 1, hidden_size, device=device)
        self.critic       = nn.Linear(hidden_size, 1)
        self.att_cmd      = nn.Linear(hidden_size * 2, 1)

    def forward(self, obs, commands, **kwargs):
        input_length = obs.size(0)
        batch_size = obs.size(1)
        nb_cmds = commands.size(1)

        embedded = self.embedding(obs)
        encoder_output, encoder_hidden = self.encoder_gru(embedded)
        state_output, state_hidden = self.state_gru(encoder_hidden, self.state_hidden)
        self.state_hidden = state_hidden
        value = self.critic(state_output)

        # Attention network over the commands.
        cmds_embedding = self.embedding.forward(commands)
        _, cmds_encoding_last_states = self.cmd_encoder_gru.forward(cmds_embedding)  # 1 x cmds x hidden

        # Same observed state for all commands.
        cmd_selector_input = torch.stack([state_hidden] * nb_cmds, 2)  # 1 x batch x cmds x hidden

        # Same command choices for the whole batch.
        cmds_encoding_last_states = torch.stack([cmds_encoding_last_states] * batch_size, 1)  # 1 x batch x cmds x hidden

        # Concatenate the observed state and command encodings.
        cmd_selector_input = torch.cat([cmd_selector_input, cmds_encoding_last_states], dim=-1)

        # Compute one score per command.
        scores = F.relu(self.att_cmd(cmd_selector_input)).squeeze(-1)  # 1 x Batch x cmds

        probs = F.softmax(scores, dim=2)  # 1 x Batch x cmds
        index = probs[0].multinomial(num_samples=1).unsqueeze(0) # 1 x batch x indx
        return scores, index, value

    def reset_hidden(self, batch_size):
        self.state_hidden = torch.zeros(1, batch_size, self.hidden_size, device=device)


class NeuralAgent:
    """ Simple Neural Agent for playing TextWorld games. """
    MAX_VOCAB_SIZE = 1000
    UPDATE_FREQUENCY = 10
    LOG_FREQUENCY = 1000
    GAMMA = 0.9

    def __init__(self) -> None:
        self._initialized = False
        self._epsiode_has_started = False
        self.id2word = ["<PAD>", "<UNK>"]
        self.word2id = {w: i for i, w in enumerate(self.id2word)}

        self.model = CommandScorer(input_size=self.MAX_VOCAB_SIZE, hidden_size=128)
        self.optimizer = optim.Adam(self.model.parameters(), 0.00003)

        self.mode = "test"

    def train(self):
        self.mode = "train"
        self.stats = {"max": defaultdict(list), "mean": defaultdict(list)}
        self.transitions = []
        self.model.reset_hidden(1)
        self.last_score = 0
        self.no_train_step = 0

    def test(self):
        self.mode = "test"
        self.model.reset_hidden(1)

    @property
    def infos_to_request(self) -> EnvInfos:
        return EnvInfos(description=True, inventory=True, admissible_commands=True,
                        won=True, lost=True)

    def _get_word_id(self, word):
        if word not in self.word2id:
            if len(self.word2id) >= self.MAX_VOCAB_SIZE:
                return self.word2id["<UNK>"]

            self.id2word.append(word)
            self.word2id[word] = len(self.word2id)

        return self.word2id[word]

    def _tokenize(self, text):
        # Simple tokenizer: strip out all non-alphabetic characters.
        text = re.sub("[^a-zA-Z0-9\- ]", " ", text)
        word_ids = list(map(self._get_word_id, text.split()))
        return word_ids

    def _process(self, texts):
        texts = list(map(self._tokenize, texts))
        max_len = max(len(l) for l in texts)
        padded = np.ones((len(texts), max_len)) * self.word2id["<PAD>"]

        for i, text in enumerate(texts):
            padded[i, :len(text)] = text

        padded_tensor = torch.from_numpy(padded).type(torch.long).to(device)
        padded_tensor = padded_tensor.permute(1, 0) # Batch x Seq => Seq x Batch
        return padded_tensor

    def _discount_rewards(self, last_values):
        returns, advantages = [], []
        R = last_values.data
        for t in reversed(range(len(self.transitions))):
            rewards, _, _, values = self.transitions[t]
            R = rewards + self.GAMMA * R
            adv = R - values
            returns.append(R)
            advantages.append(adv)

        return returns[::-1], advantages[::-1]

    def act(self, obs: str, score: int, done: bool, infos: Mapping[str, Any]) -> Optional[str]:

        # Build agent's observation: feedback + look + inventory.
        input_ = "{}\n{}\n{}".format(obs, infos["description"], infos["inventory"])


        # Tokenize and pad the input and the commands to chose from.
        input_tensor = self._process([input_])
        commands_tensor = self._process(infos["admissible_commands"])

        #print(commands_tensor)
        # Get our next action and value prediction.
        outputs, indexes, values = self.model(input_tensor, commands_tensor)
        action = infos["admissible_commands"][indexes[0]]

        if self.mode == "test":
            if done:
                self.model.reset_hidden(1)
            return action

        self.no_train_step += 1

        if self.transitions:
            reward = score - self.last_score  # Reward is the gain/loss in score.
            self.last_score = score
            if infos["won"]:
                reward += 100
            if infos["lost"]:
                reward -= 100

            self.transitions[-1][0] = reward  # Update reward information.

        self.stats["max"]["score"].append(score)
        if self.no_train_step % self.UPDATE_FREQUENCY == 0:
            # Update model
            returns, advantages = self._discount_rewards(values)

            loss = 0
            for transition, ret, advantage in zip(self.transitions, returns, advantages):
                reward, indexes_, outputs_, values_ = transition

                advantage        = advantage.detach() # Block gradients flow here.
                probs            = F.softmax(outputs_, dim=2)
                log_probs        = torch.log(probs)
                log_action_probs = log_probs.gather(2, indexes_)
                policy_loss      = (-log_action_probs * advantage).sum()
                value_loss       = (.5 * (values_ - ret) ** 2.).sum()
                entropy     = (-probs * log_probs).sum()
                loss += policy_loss + 0.5 * value_loss - 0.1 * entropy

                self.stats["mean"]["reward"].append(reward)
                self.stats["mean"]["policy"].append(policy_loss.item())
                self.stats["mean"]["value"].append(value_loss.item())
                self.stats["mean"]["entropy"].append(entropy.item())
                self.stats["mean"]["confidence"].append(torch.exp(log_action_probs).item())

            if self.no_train_step % self.LOG_FREQUENCY == 0:
                msg = "{:6d}. ".format(self.no_train_step)
                msg += "  ".join("{}: {: 3.3f}".format(k, np.mean(v)) for k, v in self.stats["mean"].items())
                msg += "  " + "  ".join("{}: {:2d}".format(k, np.max(v)) for k, v in self.stats["max"].items())
                msg += "  vocab: {:3d}".format(len(self.id2word))
                print(msg)
                self.stats = {"max": defaultdict(list), "mean": defaultdict(list)}

            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), 40)
            self.optimizer.step()
            self.optimizer.zero_grad()

            self.transitions = []
            self.model.reset_hidden(1)
        else:
            # Keep information about transitions for Truncated Backpropagation Through Time.
            self.transitions.append([None, indexes, outputs, values])  # Reward will be set on the next call

        if done:
            self.last_score = 0  # Will be starting a new episode. Reset the last score.

        return action

# some code that works but i dont understand yet

In [None]:
# create 7 games
# Same as !make_games.sh
!tw-make tw-simple --rewards dense    --goal detailed --seed 18 --test --silent -f --output games/tw-rewardsDense_goalDetailed.z8
!tw-make tw-simple --rewards balanced --goal detailed --seed 18 --test --silent -f --output games/tw-rewardsBalanced_goalDetailed.z8
!tw-make tw-simple --rewards sparse   --goal detailed --seed 18 --test --silent -f --output games/tw-rewardsSparse_goalDetailed.z8
!tw-make tw-simple --rewards dense    --goal brief    --seed 18 --test --silent -f --output games/tw-rewardsDense_goalBrief.z8
!tw-make tw-simple --rewards balanced --goal brief    --seed 18 --test --silent -f --output games/tw-rewardsBalanced_goalBrief.z8
!tw-make tw-simple --rewards sparse   --goal brief    --seed 18 --test --silent -f --output games/tw-rewardsSparse_goalBrief.z8
!tw-make tw-simple --rewards sparse   --goal none     --seed 18 --test --silent -f --output games/tw-rewardsSparse_goalNone.z8

In [None]:
!tw-make tw-simple --rewards dense    --goal detailed --seed 18 --test --silent -f --output games/tw-rewardsDense_goalDetailed.z8

In [None]:
# building a random baseline
class RandomAgent(textworld.gym.Agent):
    """ Agent that randomly selects a command from the admissible ones. """
    def __init__(self, seed=1234):
        self.seed = seed
        self.rng = np.random.RandomState(self.seed)

    @property
    def infos_to_request(self) -> textworld.EnvInfos:
        return textworld.EnvInfos(admissible_commands=True)

    def act(self, obs: str, score: int, done: bool, infos: Mapping[str, Any]) -> str:
        return self.rng.choice(infos["admissible_commands"])

In [None]:
# play function
def play(agent, path, max_step=30, nb_episodes=10, verbose=True):
    # torch.manual_seed(20211021)  # For reproducibility when using action sampling.

    infos_to_request = agent.infos_to_request
    infos_to_request.max_score = True  # Needed to normalize the scores.

    gamefiles = [path]
    if os.path.isdir(path):
        gamefiles = glob(os.path.join(path, "*.z8"))

    env_id = textworld.gym.register_games(gamefiles,
                                          request_infos=infos_to_request,
                                          max_episode_steps=max_step)
    env = textworld.gym.make(env_id)  # Create a Gym environment to play the text game.
    if verbose:
        if os.path.isdir(path):
            print(os.path.dirname(path), end="")
        else:
            print(os.path.basename(path), end="")

    # Collect some statistics: nb_steps, final reward.
    avg_moves, avg_scores, avg_norm_scores = [], [], []
    for no_episode in range(nb_episodes):
        obs, infos = env.reset()  # Start new episode.

        score = 0
        done = False
        nb_moves = 0
        while not done:
            command = agent.act(obs, score, done, infos)
            obs, score, done, infos = env.step(command)
            nb_moves += 1

        agent.act(obs, score, done, infos)  # Let the agent know the game is done.

        if verbose:
            print(".", end="")
        avg_moves.append(nb_moves)
        avg_scores.append(score)
        avg_norm_scores.append(score / infos["max_score"])

    env.close()
    if verbose:
        if os.path.isdir(path):
            msg = "  \tavg. steps: {:5.1f}; avg. normalized score: {:4.1f} / {}."
            print(msg.format(np.mean(avg_moves), np.mean(avg_norm_scores), 1))
        else:
            msg = "  \tavg. steps: {:5.1f}; avg. score: {:4.1f} / {}."
            print(msg.format(np.mean(avg_moves), np.mean(avg_scores), infos["max_score"]))

In [None]:
# We report the score and steps averaged over 10 playthroughs.
play(RandomAgent(), "./games/tw-rewardsDense_goalDetailed.z8")    # Dense rewards

#play(RandomAgent(), "./games/tw-rewardsBalanced_goalDetailed.z8") # Balanced rewards
#play(RandomAgent(), "./games/tw-rewardsSparse_goalDetailed.z8")   # Sparse rewards

tw-rewardsDense_goalDetailed.z8..............................  	avg. steps:  30.0; avg. score:  2.3 / 10.


# misc (some old code that i dont want to throw away


# TENSORFLOW
# TODO NEED TO MAKE THE OUTPUT SIZE OF THE MODEL DYNAMIC
class CommandScorer(models.Model):
    def __init__(self, input_size, hidden_size):
        super(CommandScorer, self).__init__()
        self.embedding = layers.Embedding(input_size, hidden_size)
        self.encoder_gru = layers.GRU(hidden_size, return_sequences=True, return_state=True)
        self.cmd_encoder_gru = layers.GRU(hidden_size, return_state=True)
        self.state_gru = layers.GRU(hidden_size, return_state=True)
        #self.critic = layers.Dense(1) # TODO IMPLEMENT IT
        self.att_cmd = layers.Dense(1)

    def call(self, obs, commands):
        #print('COMMANDSCORER CALL METHOD')

        print('commands', commands.shape)
        embedded = self.embedding(obs)
        #embedded = tf.repeat(embedded, repeats=commands.shape[0], axis=0)
        print('embedded', embedded.shape)

        #encoder_output, encoder_hidden = self.encoder_gru(embedded)
        _, encoder_hidden = self.encoder_gru(embedded)
        #encoder_hidden = tf.expand_dims(encoder_hidden, axis=1)

        #print('encoder_output', encoder_output.shape)
        print('encoder_hidden', encoder_hidden.shape)

        cmds_embedding = self.embedding(commands) # (8, 3, 128)
        print('cmds_embedding', cmds_embedding.shape)
        # Initialize a list to store the encoding of each command
        cmds_encoding_last_states_list = []

        # Process each command independently
        for i in range(cmds_embedding.shape[0]):  # Iterate over each command
            # Treat each command as a single time-step sequence
            cmd = tf.expand_dims(cmds_embedding[i], axis=0)  # Shape: (1, 3, 128)
            _, cmd_encoding = self.cmd_encoder_gru(cmd)  # Shape: (1, 128)
            cmd_encoding = tf.reshape(cmd_encoding, (1, 1, -1))
            cmds_encoding_last_states_list.append(cmd_encoding)
            

        cmds_encoding_last_states = tf.concat(cmds_encoding_last_states_list, axis=1)  # Shape: (1, 8, 128)
        print('cmds_encoding_last_states', cmds_encoding_last_states.shape)


        # Reshape for batch processing: treat each command as a separate sequence
        #cmds_embedding_reshaped = tf.reshape(cmds_embedding, (-1, cmds_embedding.shape[2]))  # Shape: (24, 128)
        #cmds_embedding_reshaped = tf.expand_dims(cmds_embedding_reshaped, axis=0)  # Shape: (1, 24, 128)

        #print('cmds_embedding_reshaped', cmds_embedding_reshaped.shape)
        #_, cmds_encoding_last_states = self.cmd_encoder_gru(cmds_embedding_reshaped)  # Shape: (1, 128)
        
        # Reshape to get separate encodings for each command
        #cmds_encoding_last_states = tf.reshape(cmds_encoding_last_states, (1, 8, 128))  # Shape: (1, 8, 128)

        #print('cmds_encoding_last_states', cmds_encoding_last_states.shape)


        #cmds_encoding_last_states, _ = self.cmd_encoder_gru(cmds_embedding)
        #cmds_embedding_reshaped = tf.reshape(cmds_embedding, (1, -1, 128))  # Reshape to (1, 24, 128)

        #print('cmds_embedding', cmds_embedding.shape)
        #print('cmds_embedding_reshaped', cmds_embedding_reshaped.shape)

        #_, cmds_encoding_last_states = self.cmd_encoder_gru(cmds_embedding_reshaped)  # Shape: (1, 128)
        #print('cmds_encoding_last_states', cmds_encoding_last_states.shape)
        #cmds_encoding_last_states = tf.reshape(cmds_encoding_last_states, (1, 1, 8, 128))

        #print('cmds_encoding_last_states reshaped', cmds_encoding_last_states.shape)


        #state_output, state_hidden = self.state_gru(encoder_hidden)

        #print('state_output', state_output.shape)
        #print('state_hidden', state_hidden.shape)

        # Dynamically create scoring layer based on number of commands
        #num_commands = tf.shape(commands)[1] # ATTEMPT FOR DYNAMIC ACTION SPACE
        #att_cmd = layers.Dense(num_commands) # ATTEMPT FOR DYNAMIC ACTION SPACE

        # Repeat and concatenate for attention
        #state_hidden_repeated = tf.repeat(tf.expand_dims(state_hidden, axis=1), repeats=cmds_encoding_last_states.shape[1], axis=1)

        state_hidden_repeated = tf.repeat(tf.expand_dims(encoder_hidden, axis=1), repeats=8, axis=1)  # Shape: (1, 8, 128)
        #state_hidden_repeated = tf.expand_dims(state_hidden_repeated, axis=1) # Shape: (1, 1, 8, 256)


        print('state_hidden_repeated', state_hidden_repeated.shape)
        
        #cmds_encoding_last_states = tf.expand_dims(cmds_encoding_last_states, axis=2)  # Shape becomes [9, 128, 1]
        #cmd_selector_input = tf.concat([state_hidden_repeated, cmds_encoding_last_states], axis=-1)  # Now concatenation should work

        cmd_selector_input = tf.concat([state_hidden_repeated, cmds_encoding_last_states], axis=-1)  # Shape: (1, 1, 8, 256)

        #print('cmds_encoding_last_states', cmds_encoding_last_states.shape)
        print('cmd_selector_input', cmd_selector_input.shape)
        


        scores = self.att_cmd(cmd_selector_input) # DYNAMIC ATTEMPT
        scores = tf.squeeze(scores, axis=-1)

        # Calculate probabilities and sample an action
        probs = tf.nn.softmax(scores, axis=1)
        index = tf.random.categorical(tf.math.log(probs), num_samples=1)  # Use log probabilities for stability

        #probs = tf.nn.softmax(scores, axis=1)
        #probs = tf.squeeze(probs, axis=-1)
        #index = tf.random.categorical(probs, num_samples=1)

        return scores, index
