In [1]:
import sys
import openai
sys.path.insert(0, "../")
sys.path.insert(0, "../..")
from GenAgentsBoardroom.text_adventure_games.parsing import GptParser3
from GenAgentsBoardroom.test.game_setup import build_boardroom

game = build_boardroom(
    experiment_name="boardroom-test",
    experiment_id=1,
    num_characters=1,
    max_ticks=2,
    personas_path="game_personas",
    # random_placement=True,
)

parser = GptParser3(game, verbose=False)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kylesullivan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kylesullivan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


CHARACTER JSONS: []

GET OR CREATE BASE FACTS PLAYER DESCRIPTION: An quirky contestant that is must see TV on a reality show. True
RESPONSE: Name='Luna Willows' Age=27 Likes=['DIY crafts', 'plant collecting', 'hiking', 'improv theater', 'creating quirky outfits'] Dislikes=['routines', 'conflict', 'silence', 'negativity'] Occupation='Reality Show Contestant' Home_City='Portland, Oregon'
Generated facts: {'Name': 'Luna Willows', 'Age': 27, 'Likes': ['DIY crafts', 'plant collecting', 'hiking', 'improv theater', 'creating quirky outfits'], 'Dislikes': ['routines', 'conflict', 'silence', 'negativity'], 'Occupation': 'Reality Show Contestant', 'Home_City': 'Portland, Oregon'}
PERSONA: Luna Willows, a 27-year-old Reality Show Contestant. You are passionate about DIY crafts, plant collecting, hiking. You have aversions to routines, conflict, silence. Some key facts about yourself are: home_city, Portland, Oregon. Your overall game strategy is Backstabbing, reflecting your judgement which *tend

In [2]:
parser.extract_keywords("What have I been doing to achieve the goal of the game?")

{'objects': ['game', 'goal'],
 'other_named_entities': [],
 'misc_deps': [],
 'characters': []}

In [3]:
# Import necessary packages for lemmatization and singularization.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from inflect import engine
import spacy
import string

In [4]:
# Initialize the lemmatizer and inflect engine.
lemmatizer = WordNetLemmatizer()
inflect_engine = engine()
nlp = spacy.load("en_core_web_sm")

In [5]:
# Function to standardize words by converting to lowercase, lemmatizing, and singularizing.
def standardize_word(word):
    word = word.lower()
    # Preserve hyphenated words
    if '-' in word:
        return word
    # Check if the word is a punctuation
    if word in string.punctuation or word in ["the", "a", "an"]:
        return None
    # Try lemmatizing with different parts of speech
    for pos in [wordnet.VERB, wordnet.NOUN, wordnet.ADJ, wordnet.ADV]:
        lemmatized_word = lemmatizer.lemmatize(word, pos)
        if lemmatized_word != word:  # If lemmatization changed the word
            word = lemmatized_word
            if pos == wordnet.NOUN:
                word = inflect_engine.singular_noun(word) or word
                break
    return word

In [6]:
from collections import defaultdict


def extract_keywords(text):
    if not text:
        return None
    custom_stopwords = {
        "he",
        "it",
        "i",
        "you",
        "she",
        "they",
        "we",
        "us",
        "'s",
        "this",
        "that",
        "these",
        "those",
        "them",
    }

    doc = nlp(text)
    keys = defaultdict(set)
    for w in doc:
        if w.text.lower() in custom_stopwords:
            continue

        if w.pos_ in ["PROPN"]:
            # compounds = [j for j in w.children if j.dep_ == "compound"]
            # if compounds:
            #     continue
            compound_parts = []
            for child in w.subtree:
                if child.text.lower() not in custom_stopwords:
                    if '-' in child.text:
                        compound_parts.append(child.text.lower())
                    else:
                        compound_parts.append(
                            " ".join(
                                [
                                    standardize_word(word)
                                    for word in child.text.split()
                                    if standardize_word(word)
                                ]
                            )
                        )

            compound_noun = " ".join(compound_parts)

            exists, name = False, None # self.check_if_character_exists(compound_noun)
            if exists:
                keys["characters"].add(name.lower())
            else:
                keys["misc_deps"].add(
                    " ".join(
                        [
                            standardize_word(word)
                            for word in compound_noun.split()
                            if standardize_word(word)
                        ]
                    )
                )

            # Process each word in the compound noun separately
            for part in compound_parts:
                exists, name = False, None # self.check_if_character_exists(part)
                if exists:
                    keys["characters"].add(name.lower())
                else:
                    keys["misc_deps"].add(part)
            continue

        if "subj" in w.dep_:
            exists, name = False, None  # self.check_if_character_exists(w.text)
            if exists:
                keys["characters"].add(name.lower())
            else:
                keys["misc_deps"].add(
                    " ".join(
                        [
                            standardize_word(word)
                            for word in w.text.split()
                            if standardize_word(word)
                        ]
                    )
                )
        if "obj" in w.dep_:
            exists, name = False, None  # self.check_if_character_exists(w.text)
            if exists:
                keys["characters"].add(name.lower())
            else:
                keys["objects"].add(
                    " ".join(
                        [
                            standardize_word(word)
                            for word in w.text.split()
                            if standardize_word(word)
                        ]
                    )
                )

    for ent in doc.ents:
        if ent.label_ in ["PERSON", "ORG", "GPE"]:
            exists, name = False, None  # self.check_if_character_exists(ent.text)
            if exists:
                keys["characters"].add(name.lower())
            else:
                keys["misc_deps"].add(
                    " ".join(
                        [
                            standardize_word(word)
                            for word in ent.text.split()
                            if standardize_word(word)
                        ]
                    )
                )

    keys = {k: list(v) for k, v in keys.items()}

    return keys

In [7]:
def extract_phrases_from_text(text):
    # # Replace all punctuation with spaces
    # text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    text = text.replace("-", " ").replace("–", " ")
    doc = nlp(text)

    # Collect all subtrees
    subtrees = []
    for token in doc:
        subtrees.append(list(token.subtree))
        print("SUBTREE:", [token.text for token in subtrees[-1]])

    # Initialize sets to collect noun, verb, proper noun phrases, named entities, objects, and subjects across all subtrees
    all_noun_phrases = set()  # Use a set to avoid duplicates
    all_verb_phrases = set()  # Use a set to avoid duplicates
    all_proper_nouns = set()  # Use a set to avoid duplicates
    all_named_entities = set()  # Set to collect named entities
    all_objects = set()  # Set to collect objects
    all_subjects = set()  # Set to collect subjects

    # Process each subtree
    for subtree in subtrees:
        # print("SUBTREE:", [token.text for token in subtree])
        for token in subtree:
            if token.pos_ == "NOUN":
                # Include the noun itself without adjectives
                if token.text.lower() not in ["the", "a", "an"]:  # Remove articles
                    all_noun_phrases.add(token.text)

                # Include noun phrases with adjectives
                noun_phrase = []
                for child in token.lefts:
                    if child.pos_ == "ADJ":
                        noun_phrase.append(child.text)
                if token.text.lower() not in ["the", "a", "an"]:  # Remove articles
                    noun_phrase.append(token.text)
                all_noun_phrases.add(" ".join(noun_phrase))

                # Check for compound nouns
                compound_noun = " ".join([child.text for child in token.subtree if child.dep_ in ["compound", "amod"]] + [token.text])
                if compound_noun != token.text and compound_noun.lower() not in ["the", "a", "an"]:  # Remove articles
                    all_noun_phrases.add(compound_noun)

                # Add to objects set
                if token.text.lower() not in ["the", "a", "an"]:  # Remove articles
                    all_objects.add(token.text)

            if token.pos_ == "VERB" and token.text.lower() not in ["go", "do", "try", "make", "take", "have", "get", "put", "come", "see", "say"]:  # Exclude simple verbs
                # Include the verb itself without adverbs
                all_verb_phrases.add(token.text)

            if token.pos_ == "PROPN":
                # Include the proper noun itself
                all_proper_nouns.add(token.text)

                # Include contiguous proper nouns
                proper_noun_phrase = []
                for child in token.lefts:
                    if list(child.rights) or list(child.lefts):
                        print("CHILD:", child.text)
                        print("CHILDREN:", list(child.lefts), list(child.rights))
                    if child.pos_ == "PROPN":
                        proper_noun_phrase.append(child.text)
                proper_noun_phrase.append(token.text)
                all_proper_nouns.add(" ".join(proper_noun_phrase))

            if "subj" in token.dep_:  # Check if the token is a subject
                all_subjects.add(token.text)

    # Extract named entities from the document
    for ent in doc.ents:
        # Remove articles from the entity text
        entity_text = ent.text.split()
        entity_text = [word for word in entity_text if word.lower() not in ["the", "a", "an"]]
        cleaned_entity_text = " ".join(entity_text)
        if cleaned_entity_text:  # Ensure it's not empty after removing articles
            all_named_entities.add(cleaned_entity_text)

    return (
        all_noun_phrases,
        all_verb_phrases,
        all_named_entities,
        all_objects,
        all_subjects,
        # all_proper_nouns,
    )  # Return the objects and subjects set as well

In [8]:
# noun_phrases, verb_phrases, named_entities, objects, subjects, = extract_phrases_from_text(
#     "Engage Fiona and Jasper Jensen in light-hearted philosophical discussions to build rapport."
#     "Then go to the Apple Store to buy a high-tech new Macbook Pro. Mark Zuckerberg is the CEO of Facebook."
#     "My goodness, these new computers are so expensive!"
#     "What shall I do?"
# )

# phrase_names = ["noun_phrases", "verb_phrases", "named_entities", "objects", "subjects", "proper_nouns"]
# for phrase_name, phrases in zip(phrase_names, [noun_phrases, verb_phrases, named_entities, objects, subjects]):
#     print(phrase_name)
#     print(phrases)
#     print('-'*200)


In [43]:
# original_text = "Engage with Fiona and Jasper Jensen in light-hearted philosophical discussions to build rapport. Then go to the Apple Store to buy a high-tech new Macbook Pro. Mark Zuckerberg is the CEO of Facebook."
# # original_text = (
# #     "Casey Johnson looked around at the surroundings at the camp and saw Luna Blake."
# # )
# noun_phrases, verb_phrases, named_entities, objects, subjects = extract_phrases_from_text(original_text)

# for text in [*noun_phrases, *verb_phrases, *named_entities, *objects, *subjects]:
#     print(text)
#     print(parser.extract_keywords(text))
#     print()

# print('-'*200)
# print(parser.extract_keywords(original_text))

In [9]:
extract_keywords(
    "Engage Fiona and Jasper Jensen in light-hearted philosophical discussions to build rapport. Then go to the Apple Store to buy a high-tech new Macbook Pro. Mark Zuckerberg is the CEO of Facebook. Don't forget the hot dogs!"
)

{'misc_deps': ['',
  'engage fiona',
  'hearted',
  'engage',
  'light',
  'tech',
  'facebook',
  'build',
  'in',
  'to',
  'mark',
  'fiona',
  'store',
  'macbook pro',
  'jasper jensen',
  'macbook',
  'zuckerberg',
  'high',
  'pro',
  'new',
  '-',
  'and',
  'apple',
  'jasper',
  'rapport',
  'high - tech new macbook pro mark zuckerberg',
  'jensen',
  'high - tech new macbook pro',
  'apple store',
  'mark zuckerberg',
  'discussion',
  'engage fiona and jasper jensen in light - hearted philosophical discussion to build rapport',
  'philosophical'],
 'objects': ['dog', 'rapport', 'discussion']}

In [10]:
parser.extract_keywords(
    "Engage Fiona and Jasper Jensen in light-hearted philosophical discussions to build rapport. Then go to the Apple Store to buy a high-tech new Macbook Pro. Mark Zuckerberg is the CEO of Facebook. Don't forget the hot dogs!"
)

{'misc_deps': ['hearted',
  'engage',
  'light',
  'tech',
  'facebook',
  'build',
  'mark',
  'fiona',
  'engage fiona jasper jensen light hearted philosophical discussion build rapport',
  'store',
  'jasper jensen',
  'macbook',
  'zuckerberg',
  'high',
  'pro',
  'apple',
  'jasper',
  'rapport',
  'jensen',
  'apple store',
  'high tech macbook pro mark zuckerberg',
  'discussion',
  'high tech macbook pro',
  'philosophical'],
 'objects': ['dog', 'rapport', 'discussion'],
 'other_named_entities': ['engage fiona', 'macbook pro', 'mark zuckerberg'],
 'characters': []}

In [3]:
parser.extract_keywords(
    "Casey Johnson looked around at the surroundings at the camp and saw Luna Blake."
)

{'misc_deps': ['casey johnson',
  'luna blake',
  'blake',
  'johnson',
  'luna',
  'casey'],
 'objects': ['camp', 'surroundings'],
 'other_named_entities': [],
 'characters': []}

In [5]:
parser.extract_keywords(
    "Casey Johnson survived the vote. Casey Johnson received 1 out of 3 votes. Lola Petunia was exiled from the game but now sits on the final jury, where they will be allowed to cast a vote to help determine the game winner."
)

{'misc_deps': ['casey johnson',
  'petunia',
  'lola petunia',
  'johnson',
  'casey',
  'lola'],
 'objects': ['winner', 'jury', 'vote', 'game'],
 'other_named_entities': [],
 'characters': []}

In [4]:
parser.extract_keywords(
    "Casey Johnson survived the vote. Casey Johnson received 1 out of 3 votes. Lola Petunia was exiled from the game but now sits on the final jury, where they will be allowed to cast a vote to help determine the game winner."
)

{'misc_deps': ['casey johnson',
  'petunia',
  'lola petunia',
  'johnson',
  'casey',
  'lola'],
 'objects': ['winner', 'game', 'vote', 'jury'],
 'other_named_entities': [],
 'characters': []}

In [None]:
# Here is an example:

# Input:
# "Engage Fiona and Jasper in light-hearted philosophical discussions to build rapport."

# Output:
# {'misc_deps': ['philosophical',
#   'light',
#   'hearted',
#   'fiona',
#   'engage fiona jasper in light hearted philosophical discussions to build rapport',
#   'engage',
#   'jasper',
#   'discussion',
#   'build',
#   'to',
#   'rapport',
#   'in'],
#  'objects': ['discussion', 'rapport'],
#  'other_named_entities': ['engage fiona'],
#  'characters': []}

# I need to avoid including adjectives (e.g., 'light-hearted') and I need to avoid these long segments (e.g., 'engage fiona jasper in light hearted philosophical discussions to build rapport'). Only compound words should be kept together (they should also be processed separately).

In [12]:
parser.extract_keywords(
    "Engage Fiona and Jasper in light-hearted philosophical discussions to build rapport."
)

{'misc_deps': ['hearted',
  'engage',
  '-',
  'to',
  'discussion',
  'fiona',
  'rapport',
  'build',
  'in',
  'light',
  'philosophical',
  'engage fiona jasper in light - hearted philosophical discussion to build rapport'],
 'characters': ['jasper quinlan'],
 'objects': ['discussion', 'rapport'],
 'other_named_entities': ['engage fiona']}

In [6]:
parser.extract_keywords(
    "Engage Fiona and Jasper in light-hearted philosophical discussions to build rapport."
)

{'misc_deps': ['philosophical',
  'light',
  'hearted',
  'fiona',
  'engage fiona jasper in light hearted philosophical discussions to build rapport',
  'engage',
  'jasper',
  'discussion',
  'build',
  'to',
  'rapport',
  'in'],
 'objects': ['discussion', 'rapport'],
 'other_named_entities': ['engage fiona'],
 'characters': []}

In [7]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
def extract_keywords(self, text, actions=False):
    """
    Extracts keywords from the provided text, identifying characters, objects, and optionally actions.

    This method processes the input text using natural language processing to identify and categorize keywords, such
    as characters, objects, and actions, while filtering out common stopwords. It returns a dictionary containing sets of
    identified characters, objects, actions, and miscellaneous dependencies based on the text analysis.

    Args:
        text (str): The input text from which to extract keywords.
        actions (bool, optional): Whether to extract actions from the text. Defaults to False.

    Returns:
        dict: A dictionary with keys for "characters", "objects", "actions", "misc_deps", and "other_named_entities",
                each containing a list of identified keywords.
    """

    # Check if the input text is empty; if so, return None.
    if not text:
        return None

    # Define a set of custom stopwords to filter out common, non-informative words, including basic verbs.
    custom_stopwords = {
        "a",
        "an",
        "and",
        "he",
        "it",
        "i",
        "you",
        "she",
        "they",
        "we",
        "us",
        "'s",
        "this",
        "that",
        "these",
        "those",
        "them",
        "their",
        "my",
        "your",
        "our",
        "the",
        "is",
        "are",
        "was",
        "were",
        "be",
        "being",
        "been",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "say",
        "says",
        "said",
        "go",
        "goes",
        "went",
        "make",
        "makes",
        "made",
        "know",
        "knows",
        "knew",
        "think",
        "thinks",
        "thought",
        "take",
        "takes",
        "took",
        "see",
        "sees",
        "saw",
        "come",
        "comes",
        "came",
        "want",
        "wants",
        "wanted",
        "like",
        "likes",
        "liked",
    }

    # Import necessary packages for lemmatization and singularization.
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import wordnet
    from inflect import engine

    # Initialize the lemmatizer and inflect engine.
    lemmatizer = WordNetLemmatizer()
    inflect_engine = engine()

    # Function to standardize words by converting to lowercase, lemmatizing, and singularizing.
    def standardize_word(word):
        word = word.lower()
        # Try lemmatizing with different parts of speech
        for pos in [wordnet.VERB, wordnet.NOUN, wordnet.ADJ, wordnet.ADV]:
            lemmatized_word = lemmatizer.lemmatize(word, pos)
            if lemmatized_word != word:  # If lemmatization changed the word
                word = lemmatized_word
                if pos == wordnet.NOUN:
                    word = inflect_engine.singular_noun(word) or word
                    break
            return word

    # Process the input text using the natural language processing model.
    doc = self.nlp(text)

    # Initialize a defaultdict to store identified keywords categorized by type.
    keys = defaultdict(set)

    # Iterate over each word in the processed document.
    for w in doc:
        # Skip the word if it is in the custom stopwords set.
        if w.text.lower() in custom_stopwords:
            continue

        # Check if the word is a proper noun (PROPN).
        if w.pos_ in ["PROPN"]:
            # If the proper noun has compound words, handle the entire compound noun.
            compound_noun = " ".join(
                [
                    child.text
                    for child in w.subtree
                    if child.text.lower() not in custom_stopwords
                ]
            ).lower()
            exists, name = self.check_if_character_exists(compound_noun)
            if exists:
                # If the character exists, add it to the characters set.
                keys["characters"].add(name.lower())
            else:
                # If not, add the compound noun to miscellaneous dependencies.
                keys["misc_deps"].add(standardize_word(compound_noun))

            # Process each word in the compound noun separately.
            for part in compound_noun.split():
                exists, name = self.check_if_character_exists(part)
                if exists:
                    # If the character exists, add it to the characters set.
                    keys["characters"].add(name.lower())
                else:
                    # If not, add the word to miscellaneous dependencies.
                    keys["misc_deps"].add(standardize_word(part))
            continue

        # Check if the word is a subject in the dependency parse.
        if "subj" in w.dep_:
            exists, name = self.check_if_character_exists(w.text.lower())
            if exists:
                # If the character exists, add it to the characters set.
                keys["characters"].add(name.lower())
            else:
                # If not, add the word to miscellaneous dependencies.
                keys["misc_deps"].add(standardize_word(w.text))

        # Check if the word is an object in the dependency parse.
        if "obj" in w.dep_:
            exists, name = self.check_if_character_exists(w.text.lower())
            if exists:
                # If the character exists, add it to the characters set.
                keys["characters"].add(name.lower())
            else:
                # If not, add the word to the objects set.
                keys["objects"].add(standardize_word(w.text))

        if actions:
            # Check if the word is a verb (action).
            if w.pos_ == "VERB":
                keys["actions"].add(standardize_word(w.text))

    # Iterate over named entities in the document.
    for ent in doc.ents:
        # Check if the entity is a person, organization, or geopolitical entity.
        if ent.label_ in ["PERSON", "ORG", "GPE"]:
            exists, name = self.check_if_character_exists(ent.text.lower())
            if exists:
                # If the character exists, add it to the characters set.
                keys["characters"].add(name.lower())
            else:
                cleaned_entity = " ".join(
                    [
                        standardize_word(w.text)
                        for word in ent.text.split()
                        if word.lower() not in custom_stopwords
                    ]
                ).lower()
                keys["other_named_entities"].add(cleaned_entity)

    # Remove duplicates between 'misc_deps' and 'other_named_entities'
    keys["other_named_entities"] = keys["other_named_entities"] - keys["misc_deps"]

    # Remove duplicates between 'characters' and 'other_named_entities'
    keys["other_named_entities"] = keys["other_named_entities"] - keys["characters"]

    # Remove duplicates between 'objects' and 'other_named_entities'
    keys["other_named_entities"] = keys["other_named_entities"] - keys["objects"]

    # Convert the sets in the keys dictionary to lists for easier handling.
    keys = {k: list(v) for k, v in keys.items()}

    # Return the dictionary containing categorized keywords.
    return keys