In [1]:
import os
import re
import random
from typing import List, Dict, Any, Optional
from collections import defaultdict
from time import time
from glob import glob
import numpy as np
import gym
from textworld import EnvInfos
import textworld.gym

# for text similarity
import spacy
import wmd
import en_core_web_md

#import torch

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Collect every compiled TextWorld game (*.ulx) found in the sample directory.
GAMES_PATH = "./sample_games/"
gamefiles = [
    GAMES_PATH + entry
    for entry in os.listdir(GAMES_PATH)
    if entry.endswith('.ulx')
]

## Play Function
- Creates a Gym environment so that the agent can play TextWorld games, and reports the average number of steps and the average score.

In [3]:
def play(agent, path, max_step=50, nb_episodes=10, verbose=True):
    """
    Let `agent` play a TextWorld game for several episodes and print statistics.

    Arguments:
        agent: Object exposing `select_additional_infos` (an `EnvInfos`) and
            `act(obs, score, done, infos)` returning the next command.
        path: Path to a single game file, or to a directory whose `*.ulx`
            files are all registered with the environment.
        max_step: Maximum number of steps per episode (passed to
            `register_games` as `max_episode_steps`).
        nb_episodes: Number of episodes to play.
        verbose: If True, print the game name, one dot per finished episode
            and the averaged statistics.

    Returns:
        None. Results are only printed.
    """
    request_infos = agent.select_additional_infos
    request_infos.max_score = True  # Needed to normalize the scores.

    is_directory = os.path.isdir(path)
    gamefiles = glob(os.path.join(path, "*.ulx")) if is_directory else [path]

    env_id = textworld.gym.register_games(gamefiles,
                                          request_infos=request_infos,
                                          max_episode_steps=max_step)
    env = gym.make(env_id)  # Create a Gym environment to play the text game.
    if verbose:
        label = os.path.dirname(path) if is_directory else os.path.basename(path)
        print(label, end="")

    # Collect some statistics: nb_steps, final reward.
    avg_moves, avg_scores, avg_norm_scores = [], [], []
    infos = None
    try:
        for no_episode in range(nb_episodes):
            obs, infos = env.reset()  # Start new episode.

            score = 0
            done = False
            nb_moves = 0
            while not done:
                # the agent chooses an action
                command = agent.act(obs, score, done, infos)
                # carry out action and update parameters
                obs, score, done, infos = env.step(command)
                nb_moves += 1

            # Let the agent know the game is done.
            agent.act(obs, score, done, infos)

            if verbose:
                print(".", end="")
            avg_moves.append(nb_moves)
            avg_scores.append(score)
            avg_norm_scores.append(score / infos["max_score"])
    finally:
        # Always release the environment, even when the agent raises or the
        # user interrupts a long training run.
        env.close()

    if not verbose:
        return
    if not avg_moves:
        # No episode was played: there is nothing to average, just end the line.
        print()
        return

    msg = "  \tavg. steps: {:5.1f}; avg. score: {:4.1f} / {}."
    if is_directory:
        # Different games may have different max scores, so report the
        # normalized score (out of 1) for directories.
        print(msg.format(np.mean(avg_moves), np.mean(avg_norm_scores), 1))
    else:
        print(msg.format(np.mean(avg_moves), np.mean(
            avg_scores), infos["max_score"]))

In [14]:
# Load the medium English spaCy model; CustomAgent stores this object as `self.nlp`
# and calls it to turn command/recipe text into spaCy documents.
nlp = en_core_web_md.load()
# Append the WMD similarity hook as the last pipeline component.
# NOTE(review): presumably this makes `Doc.similarity()` return a Word Mover's
# Distance (lower = more similar) instead of cosine similarity -- confirm against the wmd docs.
nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)

## Custom Agent
- uses Handicap 5 (admissible commands from EnvInfos)
- saves a "qvalue" for each word in vocab.txt
- admissible commands get ranked by the average qvalue of their words plus 10 × their text similarity to the recipe
- "qvalues" get updated while the agent is playing, depending on the scores
    - the reward is this round's score minus last round's score
    - if the command led to a win, each of its words gets an extra +10
    - if the command led to a defeat, each of its words gets an extra -10
    - if the resulting reward is 0, a penalty of -1 is applied (necessary to avoid using the same command over and over again)

In [103]:
class CustomAgent():
    """
    Word-level "q-value" agent for TextWorld games.

    Every vocabulary word carries a numeric q-value. An admissible command is
    scored by the average q-value of its words plus ten times its text
    similarity to the recipe; the best scored command is played. In train
    mode the q-values of the words of the previous command are updated with
    the reward observed afterwards.
    """

    def __init__(self, vocab_path: str = "./vocab.txt") -> None:
        """
        Arguments:
            vocab_path: Text file with one vocabulary word per line.
        """
        self._initialized = False
        self._epsiode_has_started = False
        self.mode = "test"
        self.command_history = []
        self.history_counter = 0
        self.nlp = nlp

        # Training bookkeeping. It is initialised here as well because `act`
        # reads `self.moves` even in test mode, i.e. before `train()` may
        # ever have been called.
        self.transitions = []
        self.last_score = 0
        self.no_train_step = 0
        self.moves = 0

        # read in vocab.txt and map to id
        with open(vocab_path) as f:
            self.word_vocab = f.read().split("\n")
        self.word2id = {}  # dictionary for converting words to ids
        self.id2word = []  # list for converting ids to words
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
            self.id2word.append(w)

        self.qvalues = [0] * len(self.id2word)  # rewards for words

    def train(self) -> None:
        """Switch to train mode and reset all per-run bookkeeping (q-values are kept)."""
        self.mode = "train"
        self.transitions = []
        self.last_score = 0
        self.no_train_step = 0
        self.moves = 0
        self.command_history = []
        self.history_counter = 0

    def test(self) -> None:
        """Switch to test mode: commands are chosen but q-values are not updated."""
        self.mode = "test"

    @property
    def select_additional_infos(self) -> "EnvInfos":
        """
        Returns what additional information should be made available at each game step.

        Requested information will be included within the `infos` dictionary
        passed to `CustomAgent.act()`. To request specific information, create a
        :py:class:`textworld.EnvInfos <textworld.envs.wrappers.filter.EnvInfos>`
        and set the appropriate attributes to `True`. The possible choices are:

        * `description`: text description of the current room, i.e. output of the `look` command;
        * `inventory`: text listing of the player's inventory, i.e. output of the `inventory` command;
        * `max_score`: maximum reachable score of the game;
        * `objective`: objective of the game described in text;
        * `entities`: names of all entities in the game;
        * `verbs`: verbs understood by the game;
        * `command_templates`: templates for commands understood by the game;
        * `admissible_commands`: all commands relevant to the current state;

        In addition to the standard information, game specific information
        can be requested by appending corresponding strings to the `extras`
        attribute. For this competition, the possible extras are:

        * `'recipe'`: description of the cookbook;
        * `'walkthrough'`: one possible solution to the game (not guaranteed to be optimal);

        Example:
            Here is an example of how to request information and retrieve it.

            >>> from textworld import EnvInfos
            >>> request_infos = EnvInfos(description=True, inventory=True, extras=["recipe"])
            ...
            >>> env = gym.make(env_id)
            >>> ob, infos = env.reset()
            >>> print(infos["description"])
            >>> print(infos["inventory"])
            >>> print(infos["extra.recipe"])

            Handicap is defined as follows
                max_score, has_won, has_lost,               # Handicap 0
                description, inventory, verbs, objective,   # Handicap 1
                command_templates,                          # Handicap 2
                entities,                                   # Handicap 3
                extras=["recipe"],                          # Handicap 4
                admissible_commands,                        # Handicap 5
        """
        return EnvInfos(description=True, inventory=True, max_score=True,
                        admissible_commands=True, has_won=True, has_lost=True,
                        extras=["recipe"])

    def _init(self) -> None:
        """ Initialize the agent. """
        self._initialized = True

        # [You can insert code here.]

    def _get_word_id(self, word):
        """
        Look up the id of a word; unknown words are added to the vocabulary.

        The new id is derived from `id2word` (not from `word2id`): when the
        vocabulary file contains duplicate lines, `word2id` is shorter than
        `id2word`/`qvalues`, and using its length would hand out an id that
        already belongs to another word.
        """
        if word not in self.word2id:
            new_id = len(self.id2word)
            self.id2word.append(word)
            self.qvalues.append(0)
            self.word2id[word] = new_id
        return self.word2id[word]

    def _get_word_by_id(self, id):
        """
        Look up a word by id.
        Returns None if the id does not exist (including negative ids).
        """
        if 0 <= id < len(self.id2word):
            return self.id2word[id]
        return None

    def _get_qvalue(self, id):
        """
        Getter for qvalues (words with scores).
        Returns None if the id does not exist (including negative ids).
        """
        if 0 <= id < len(self.qvalues):
            return self.qvalues[id]
        return None

    def _tokenize(self, text):
        """
        Simple tokenizer: every character that is not a letter, digit, hyphen
        or space is replaced by a space, then the text is split on whitespace
        and each word is mapped to its id (unknown words are registered).
        """
        text = re.sub(r"[^a-zA-Z0-9\- ]", " ", text)
        return [self._get_word_id(word) for word in text.split()]

    def _start_episode(self, obs: List[str], infos: Dict[str, List[Any]]) -> None:
        """
        Prepare the agent for the upcoming episode.
        Arguments:
            obs: Initial feedback for each game.
            infos: Additional information for each game.
        """
        if not self._initialized:
            self._init()

        self._epsiode_has_started = True

        # [You can insert code here.]

    def _end_episode(self, obs: List[str], scores: List[int], infos: Dict[str, List[Any]]) -> None:
        """
        Tell the agent the episode has terminated.
        Arguments:
            obs: Previous command's feedback for each game.
            score: The score obtained so far for each game.
            infos: Additional information for each game.
        """
        self._epsiode_has_started = False

        # [You can insert code here.]

    def tok_to_nlp(self, tok1: List[int]):
        """Convert a list of word ids into a document produced by `self.nlp`."""
        words = (self._get_word_by_id(word_id) for word_id in tok1)
        # Unknown ids yield None and are skipped instead of crashing the join.
        text = " ".join(word for word in words if word is not None)
        return self.nlp(text)

    def calc_similarity(self, tok1: List[int], tok2: List[int]) -> float:
        """
        Calculate the similarity between two tokenized texts.

        Returns 0.0 when the underlying similarity computation fails.
        """
        text1 = self.tok_to_nlp(tok1)
        text2 = self.tok_to_nlp(tok2)
        # sometimes error when cost is negative...
        try:
            # usually 0.0 is best, and the worst i got was about 9
            # so normalized by 10 and subtracted from 1 to get best at 1.0
            return 1 - text1.similarity(text2) / 10
        except Exception:
            # Best effort: a failed comparison counts as "no similarity".
            # `Exception` (instead of a bare except) lets KeyboardInterrupt
            # through so a running training loop can still be stopped.
            return 0.0

    def update_qvalues(self, command: List[int], reward):
        """
        Adds `reward` to the q-value of every word id in `command`.
        """
        for word_id in command:
            self.qvalues[word_id] += reward

    def calc_avg_qvalue(self, command) -> float:
        """
        Gets the average of the q-values of all words in a command.
        An empty command has an average of 0.
        """
        if not command:
            return 0
        total = 0
        for word_id in command:
            total += self._get_qvalue(word_id)
        return total / len(command)

    def _score_commands(self, commands, recipe_tok) -> List[float]:
        """Score each command: average q-value plus 10 x similarity to the recipe."""
        return [
            self.calc_avg_qvalue(cmd) + self.calc_similarity(cmd, recipe_tok) * 10
            for cmd in commands
        ]

    def choose_best_cmd(self, commands, recipe_tok) -> int:
        """
        Returns the index of one of the best scored commands (random among ties).

        Raises:
            ValueError: if `commands` is empty.
        """
        scores = self._score_commands(commands, recipe_tok)
        max_value = max(scores)
        max_commands = [i for i, value in enumerate(scores) if value == max_value]
        return random.choice(max_commands)

    def get_alternative_cmd(self, commands, recipe_tok) -> int:
        """
        Alternative command, used when the best one was played in the last 3 moves.

        Returns the index of the second best scored command. With a single
        command there is no alternative and index 0 is returned.

        Raises:
            ValueError: if `commands` is empty.
        """
        if not commands:
            raise ValueError("no commands to choose from")
        if len(commands) == 1:
            return 0
        scores = self._score_commands(commands, recipe_tok)
        # Rank indices (not values): with tied scores a value lookup would
        # return the index of the best command again instead of a different one.
        ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
        return ranked[1]

    def act(self, obs: str, score: int, done: bool,
            infos: Dict[str, List[Any]]) -> str:
        """
        Chooses the next command for the current game state.

        Arguments:
            obs: Previous command's feedback.
            score: The score obtained so far.
            done: Whether the game is finished.
            infos: Additional information; must contain "extra.recipe" and
                "admissible_commands", and in train mode also "has_won" and
                "has_lost".

        Returns:
            The text command to be performed. If the episode has ended
            (`done` is True) the returned value is ignored by the caller.

        Notes:
            In train mode this call also rewards the words of the previously
            chosen command and prints debug output.
        """
        if not self._epsiode_has_started:
            self._start_episode(obs, infos)

        recipe_tok = self._tokenize(infos["extra.recipe"])

        # choose best command based on "qvalues"
        admissible = infos["admissible_commands"]
        cmds = [self._tokenize(cmd) for cmd in admissible]
        best_index = self.choose_best_cmd(cmds, recipe_tok)
        action = admissible[best_index]

        # Loop protection: if the chosen command was already used in one of
        # the last 3 moves, fall back to the second best command. A membership
        # test is used because `list.index()` returns 0 (falsy) for a match at
        # the first position. The alternative is deterministic, so asking once
        # is enough.
        last_commands = self.command_history[-3:]
        if self.history_counter > self.moves and action in last_commands:
            best_index = self.get_alternative_cmd(cmds, recipe_tok)
            action = admissible[best_index]
        # adds valid action to the command-history
        self.command_history.append(action)

        if self.mode == "test":
            if done:
                self._end_episode(obs, score, infos)
            return action

        # train mode, counter update
        self.no_train_step += 1
        reward = 0
        self.moves += 1
        self.history_counter += 1

        # calculate rewards
        if self.transitions:
            reward = score - self.last_score
            self.last_score = score
            if infos["has_won"]:
                reward += 10
            if infos["has_lost"]:
                reward -= 10
            if reward == 0:
                # penalty, so that useless commands are not repeated forever
                reward -= 1
            # update rewards for command from last step
            self.update_qvalues(self.transitions[-1], reward)

        # Debug output
        print("")
        print("train step:" + str(self.no_train_step))
        print("last reward:" + str(reward))
        if self.transitions:
            command = "last command: "
            for word_id in self.transitions[-1]:
                command += " " + self._get_word_by_id(word_id)
            print(command)

        # save last command in order to calculate rewards in next step
        self.transitions.append(cmds[best_index])

        if done:
            if infos["has_won"]:
                print("-------- WON GAME ----------")
            else:
                print("-------- LOST GAME ----------")
            print(" ")
            # reset the per-episode state for the next game
            self.last_score = 0
            self.moves = 0
            self.transitions = []
            self.command_history = []
            self._end_episode(obs, score, infos)

        return action

In [104]:
# Smoke test for the text similarity between two short sentences.
agent = CustomAgent()
agent.train()
tok1, tok2 = (
    agent._tokenize(sentence)
    for sentence in ("Cook a meal.", "Prepare the food.")
)
sim = agent.calc_similarity(tok1, tok2)
print(sim)

0.4282114982604981


## Training the Agent
- just training on the easiest game

In [106]:
# Fresh agent, trained for two rounds of `play` on a single game.
agent = CustomAgent()
agent.train()

starttime = time()
for game_no in (1, 2):
    print("-------------------------------------")
    print("GAME: " + str(game_no))
    print("-------------------------------------")
    play(agent, gamefiles[5])
print("Trained in {:.2f} secs".format(time() - starttime))


-------------------------------------
GAME: 1
-------------------------------------
tw-cooking-recipe1+take1-11Oeig8bSVdGSp78.ulx
train step:1
last reward:0

train step:2
last reward:-1
last command:  examine red hot pepper

train step:3
last reward:-1
last command:  examine red hot pepper

train step:4
last reward:1
last command:  take red hot pepper from counter

train step:5
last reward:1
last command:  prepare meal

train step:6
last reward:-1
last command:  take yellow bell pepper from counter

train step:7
last reward:-1
last command:  drop meal

train step:8
last reward:-1
last command:  take meal

train step:9
last reward:-1
last command:  put meal on counter

train step:10
last reward:-1
last command:  cook yellow bell pepper with oven

train step:11
last reward:-1
last command:  cook yellow bell pepper with oven

train step:12
last reward:-1
last command:  look

train step:13
last reward:-1
last command:  look

train step:14
last reward:-1
last command:  examine red apple

tr

In [107]:
# Switch the trained agent to test mode (act() returns before any q-value update)
# and evaluate it on the same game it was trained on.
agent.test()
play(agent, gamefiles[5])

tw-cooking-recipe1+take1-11Oeig8bSVdGSp78.ulx..........  	avg. steps:   3.0; avg. score:  3.0 / 3.


In [108]:
# List every word whose q-value was changed by training.
for word_id, qvalue in enumerate(agent.qvalues):
    if qvalue != 0:
        print(agent._get_word_by_id(word_id) + " : " + str(qvalue))

apple : -2
bell : -3
close : -1
cook : -2
cookbook : -2
counter : 17
drop : -2
eat : 220
examine : -4
fridge : -2
from : 17
hot : 18
look : -2
meal : 236
on : -1
open : -1
oven : -2
pepper : 15
prepare : 20
put : -1
red : 16
table : -1
take : 16
with : -2
yellow : -3


In [109]:
# Continue training the existing agent (learned q-values are kept) on another game.
agent.train()

starttime = time()
for game_no in (1, 2):
    print("-------------------------------------")
    print("GAME: " + str(game_no))
    print("-------------------------------------")
    play(agent, gamefiles[8])
print("Trained in {:.2f} secs".format(time() - starttime))

-------------------------------------
GAME: 1
-------------------------------------
tw-cooking-recipe2+take2+cut+open-BnYEixa9iJKmFZxO.ulx
train step:1
last reward:0

train step:2
last reward:-1
last command:  take red hot pepper from counter

train step:3
last reward:-1
last command:  eat red hot pepper

train step:4
last reward:-1
last command:  take red potato from counter

train step:5
last reward:-1
last command:  take cookbook from counter

train step:6
last reward:-1
last command:  take purple potato from counter

train step:7
last reward:1
last command:  take yellow potato from counter

train step:8
last reward:-10
last command:  eat yellow potato
-------- LOST GAME ----------
 
.
train step:9
last reward:0

train step:10
last reward:-1
last command:  take red hot pepper from counter

train step:11
last reward:-1
last command:  eat red hot pepper

train step:12
last reward:-1
last command:  take red potato from counter

train step:13
last reward:-1
last command:  take cookbook 


train step:127
last reward:-1
last command:  open fridge

train step:128
last reward:-1
last command:  examine cilantro

train step:129
last reward:-1
last command:  close fridge

train step:130
last reward:-1
last command:  take cookbook

train step:131
last reward:-1
last command:  drop cookbook

train step:132
last reward:-1
last command:  look

train step:133
last reward:-1
last command:  take knife from stove

train step:134
last reward:-1
last command:  put knife on table

train step:135
last reward:-1
last command:  take knife from table

train step:136
last reward:-1
last command:  put knife on counter

train step:137
last reward:-1
last command:  examine oven

train step:138
last reward:-1
last command:  look

train step:139
last reward:-1
last command:  open fridge

train step:140
last reward:-1
last command:  examine cilantro

train step:141
last reward:-1
last command:  close fridge
-------- LOST GAME ----------
 
.
train step:142
last reward:0

train step:143
last reward:


train step:257
last reward:-1
last command:  examine cilantro

train step:258
last reward:-1
last command:  close fridge

train step:259
last reward:-1
last command:  look

train step:260
last reward:-1
last command:  examine oven

train step:261
last reward:-1
last command:  examine stove

train step:262
last reward:-1
last command:  take red potato from counter

train step:263
last reward:-1
last command:  cook red potato with oven

train step:264
last reward:-1
last command:  eat red potato

train step:265
last reward:-1
last command:  look

train step:266
last reward:-1
last command:  open fridge

train step:267
last reward:-1
last command:  close fridge

train step:268
last reward:-1
last command:  take knife from table

train step:269
last reward:-1
last command:  put knife on table

train step:270
last reward:-1
last command:  look

train step:271
last reward:-1
last command:  examine stove

train step:272
last reward:-1
last command:  open fridge

train step:273
last reward:-1


train step:388
last reward:-1
last command:  put knife on stove

train step:389
last reward:-1
last command:  open fridge

train step:390
last reward:-1
last command:  examine yellow bell pepper

train step:391
last reward:-1
last command:  examine cilantro

train step:392
last reward:-1
last command:  close fridge
-------- LOST GAME ----------
 
.  	avg. steps:  38.2; avg. score:  0.4 / 6.
-------------------------------------
GAME: 2
-------------------------------------
tw-cooking-recipe2+take2+cut+open-BnYEixa9iJKmFZxO.ulx
train step:393
last reward:0

train step:394
last reward:-1
last command:  take red hot pepper from counter

train step:395
last reward:-1
last command:  eat red hot pepper

train step:396
last reward:-1
last command:  take cookbook from counter

train step:397
last reward:-1
last command:  drop cookbook

train step:398
last reward:-1
last command:  take purple potato from counter

train step:399
last reward:-1
last command:  cook purple potato with oven

train 

KeyboardInterrupt: 

In [13]:
# Experiments: round trip between words and their vocabulary ids.
wordids = agent._tokenize("cook a meal")
print(wordids)

# The id printed for "cook" is looked up again by hand below (4664 is taken
# from the recorded output and depends on the vocab.txt that was used).
print(agent._get_word_id("cook"))
print(agent._get_word_by_id(4664))

[4664, 785, 11449]
4664
cook


In [78]:
def play_render(agent,gamefile):
    """
    Let `agent` play one episode of `gamefile` and render every step.

    Arguments:
        agent: Object exposing `select_additional_infos` and
            `act(obs, score, done, infos)` returning the next command.
        gamefile: Path to the game file to play.

    Returns:
        None. The game is rendered through `env.render()`.
    """
    requested_infos = agent.select_additional_infos
    env_id = textworld.gym.register_games([gamefile], requested_infos)

    env = gym.make(env_id)
    try:
        obs, infos = env.reset()

        env.render()  # Print the initial observation.

        score = 0
        done = False
        while not done:
            command = agent.act(obs, score, done, infos)
            # Store the new observation in `obs` so that the agent sees the
            # feedback of its last command instead of the initial text forever.
            obs, score, done, infos = env.step(command)
            env.render()

        # Let the agent know the game is done (same protocol as `play`).
        agent.act(obs, score, done, infos)
    finally:
        # Release the environment even if the agent raises or the user interrupts.
        env.close()

In [79]:
# Watch the agent play one rendered episode in test mode (no q-value updates).
agent.test()
play_render(agent, gamefiles[5])




                    ________  ________  __    __  ________
                   |        \|        \|  \  |  \|        \
                    \$$$$$$$$| $$$$$$$$| $$  | $$ \$$$$$$$$
                      | $$   | $$__     \$$\/  $$   | $$
                      | $$   | $$  \     >$$  $$    | $$
                      | $$   | $$$$$    /  $$$$\    | $$
                      | $$   | $$_____ |  $$ \$$\   | $$
                      | $$   | $$     \| $$  | $$   | $$
                       \$$    \$$$$$$$$ \$$   \$$    \$$
              __       __   ______   _______   __        _______
             |  \  _  |  \ /      \ |       \ |  \      |       \
             | $$ / \ | $$|  $$$$$$\| $$$$$$$\| $$      | $$$$$$$\
             | $$/  $\| $$| $$  | $$| $$__| $$| $$      | $$  | $$
             | $$  $$$\ $$| $$  | $$| $$    $$| $$      | $$  | $$
             | $$ $$\$$\$$| $$  | $$| $$$$$$$\| $$      | $$  | $$
             | $$$$  \$$$$| $$__/ $$| $$  | $$| $$_____ | $$__/ $$
          