In [1]:
import json
import random
import re
from itertools import chain

import nltk.data
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from build_raw_data import get_bios
from transfer_learning_conversational import SPECIAL_TOKENS, ATTR_TO_SPECIAL_TOKEN, add_special_tokens, build_input_from_segments, get_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load the tokenizer and the LM-head GPT-2 weights from the same local checkpoint directory.
tokenizer = GPT2Tokenizer.from_pretrained('./models/dialoGPT')
model = GPT2LMHeadModel.from_pretrained('./models/dialoGPT')
model.to(device)  # place the weights on the device selected earlier in the notebook
# add_special_tokens comes from transfer_learning_conversational and is not visible here.
# NOTE(review): presumably it registers the ATTR_TO_SPECIAL_TOKEN entries with the tokenizer
# and resizes the model's embeddings to match -- confirm against that module.
add_special_tokens(model,tokenizer, ATTR_TO_SPECIAL_TOKEN)

Some weights of the model checkpoint at ./models/dialoGPT were not used when initializing GPT2LMHeadModel: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# get_bios is imported from build_raw_data; what it returns is not visible in this notebook.
# NOTE(review): `bios` is not referenced by any later cell shown here -- confirm it is still needed.
bios = get_bios()

In [5]:
# Decoding settings. sample_sequence reads these as module-level globals.
temp = 0.7        # next-token logits are divided by this temperature before filtering
min_length = 5    # for steps below this count, a predicted special token is resampled
max_length = 50   # upper bound on the number of generation steps per reply
no_sample = True  # True: pick the most probable token; False: draw from the distribution

In [6]:
def top_filtering(logits, top_k=0., top_p=0.6, threshold=-float('Inf'), filter_value=-float('Inf')):
    """Mask out unlikely tokens in a 1-D logits vector using top-k, nucleus and threshold filters.

    The three filters are applied in that order, each one overwriting the
    rejected entries with ``filter_value``. The input tensor is modified
    in place and the same tensor object is returned.

    Args:
        logits: 1-D tensor of logits over the vocabulary (batch size 1 only).
        top_k: if > 0, keep only the ``top_k`` highest logits; <= 0 disables the filter.
        top_p: if > 0.0, walk the tokens from most to least likely and keep them up to
            and including the first one whose cumulative softmax probability exceeds
            ``top_p``; everything after it is rejected. <= 0.0 disables the filter.
        threshold: logits strictly below this value are rejected.
        filter_value: value written into rejected positions.

    Returns:
        The filtered ``logits`` tensor (same object as the argument).
    """
    # Only a single, un-batched distribution is supported.
    assert logits.dim() == 1

    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Anything strictly below the k-th best logit is rejected.
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        ranked_logits, ranked_ids = torch.sort(logits, descending=True)
        cumulative_mass = torch.cumsum(F.softmax(ranked_logits, dim=-1), dim=-1)
        over_budget = cumulative_mass > top_p
        # Shift the mask one slot to the right so the first token that crosses the
        # budget is kept, and the best-ranked token is never rejected.
        rejected_ranks = torch.cat((over_budget.new_zeros(1), over_budget[:-1]))
        # Translate ranks back to vocabulary positions before masking.
        logits[ranked_ids[rejected_ranks]] = filter_value

    logits[logits < threshold] = filter_value
    return logits

In [7]:
def sample_sequence(personality, history, tokenizer, model):
    """Generate one reply, token by token, conditioned on a persona and a dialogue history.

    At every step the full model input is rebuilt with build_input_from_segments
    from the persona, the history and the tokens generated so far. The logits of
    the last position are divided by ``temp``, passed through top_filtering and
    turned into probabilities. Generation stops at the first special token or
    after ``max_length`` steps.

    Reads the module-level settings ``device``, ``temp``, ``min_length``,
    ``max_length`` and ``no_sample``.

    Args:
        personality: persona segments, forwarded unchanged to build_input_from_segments.
        history: dialogue turns, forwarded unchanged to build_input_from_segments.
        tokenizer: tokenizer used to look up the ids of SPECIAL_TOKENS.
        model: callable returning an object whose ``logits`` attribute holds the
            language-model logits.

    Returns:
        The list of generated token ids; the terminating special token is not included.
    """
    stop_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
    reply_ids = []

    for step in range(max_length):
        segments = build_input_from_segments(personality, history, reply_ids, tokenizer, with_eos=False)
        input_ids = torch.tensor(segments["input_ids"], device=device).unsqueeze(0)
        token_type_ids = torch.tensor(segments["token_type_ids"], device=device).unsqueeze(0)

        logits = model(input_ids, token_type_ids=token_type_ids).logits
        if isinstance(logits, tuple):  # for gpt2 and maybe others
            logits = logits[0]

        # Only the distribution for the next token (last position) matters.
        next_token_logits = top_filtering(logits[0, -1, :] / temp)
        probs = F.softmax(next_token_logits, dim=-1)

        if no_sample:
            next_token = torch.topk(probs, 1)[1]
        else:
            next_token = torch.multinomial(probs, 1)

        # While the reply is still shorter than min_length, try to replace a special
        # token by resampling. Give up when the distribution puts all of its mass on
        # one token, because resampling could then never produce anything else.
        if step < min_length:
            while next_token.item() in stop_ids and probs.max().item() != 1:
                next_token = torch.multinomial(probs, num_samples=1)

        token_id = next_token.item()
        if token_id in stop_ids:
            break
        reply_ids.append(token_id)

    return reply_ids


In [8]:
# Table of persona bios; its 'bio' and 'generated_bio' columns are read further down.
personalities = pd.read_csv("../../data/generative_model_output.csv")

# NLTK's pre-trained English Punkt model, used below to split each bio into sentences.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def format_input(text: str) -> str:
    """
    Normalize raw text into the layout the model expects.

    The text is lowercased, every one of the punctuation marks ``. , ! ? ( )`` is
    surrounded by spaces, runs of two or more whitespace characters are collapsed
    into a single space, and leading/trailing whitespace is removed.

    Args:
        text: raw text input, can be dialogue or bio
    Returns:
        text: text formatted for the model
    """
    text = text.lower()
    # Pad each punctuation mark so it ends up as its own space-separated token.
    text = re.sub(r'([.,!?()])', r' \1 ', text)
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()
def tokenize(obj):
    """Recursively replace every string in a nested structure with its token ids.

    Strings are encoded with the module-level ``tokenizer``. Dicts keep their
    keys and have their values converted. Any other object is iterated and
    returned as a list of converted items.
    """
    if isinstance(obj, str):
        tokens = tokenizer.tokenize(obj)
        return tokenizer.convert_tokens_to_ids(tokens)
    if isinstance(obj, dict):
        return {key: tokenize(value) for key, value in obj.items()}
    return [tokenize(item) for item in obj]


In [48]:
def _encode_bio(bio_text):
    """Turn one bio string into a list of per-sentence token-id lists."""
    # Only the text before the first newline is kept.
    first_line = bio_text.lower().split("\n", 1)[0]
    sentences = sent_tokenizer.tokenize(first_line)
    return tokenize([format_input(sentence) for sentence in sentences])


# 'bio' and 'generated_bio' are joined row by row with no separator between them.
full_bios = [
    _encode_bio(bio_text)
    for bio_text in personalities['bio'] + personalities['generated_bio']
]

In [101]:
# Draw one persona at random and show it as plain text.
personality = random.choice(full_bios)
# A persona is a list of per-sentence token-id lists; flatten it before decoding.
raw_personality = tokenizer.decode(chain.from_iterable(personality))
# The first space-delimited word of the decoded bio is taken as the character's name.
name = raw_personality.partition(" ")[0]
print(raw_personality)

mateo is a skilled carpenter who runs his own woodworking business.he has been building old furniture for the last 25 years and loves his craft.he also has a passion for painting and has been known to create amazing murals.


In [103]:
# Start a new conversation whose only turn so far is the user's opening request.
history = [tokenizer.encode("tell me about yourself ")]

# Generation is inference-only, so gradient tracking is switched off.
with torch.no_grad():
    output = sample_sequence(personality, history, tokenizer, model)
out_text = tokenizer.decode(output, skip_special_tokens=True)
print(out_text)

well, i'm a carpenter and i run my own woodworking business. i love painting and creating murals. i also love painting and creating new furniture for my customers. 


In [104]:
# Extend the running conversation: first the model's previous reply, then the next user turn.
history.extend(tokenizer.encode(turn) for turn in (out_text, " ?"))

# Generation is inference-only, so gradient tracking is switched off.
with torch.no_grad():
    output = sample_sequence(personality, history, tokenizer, model)
out_text = tokenizer.decode(output, skip_special_tokens=True)
print(out_text)

hey there, i'm in need of 10 fresh carp for my latest woodworking project, could you help me out? 


In [16]:
# Extend the running conversation: first the model's previous reply, then the next user turn.
history.extend(
    tokenizer.encode(turn)
    for turn in (out_text, "what would you like me to gather ?")
)

# Generation is inference-only, so gradient tracking is switched off.
with torch.no_grad():
    output = sample_sequence(personality, history, tokenizer, model)
out_text = tokenizer.decode(output, skip_special_tokens=True)
print(out_text)

i would like to make a sweater for you


In [105]:
# Ask the model for one quest per knowledge-base entry and collect the
# prompt/output pairs in `test_set`.

def _quest_prompt(category, entry):
    """Build the player's request for one knowledge-base entry."""
    if category == "mobs":
        return f"give me a quest to slay {entry}s ."
    return f"give me a quest to collect {entry} {category} ."


def _generate_quest(prompt):
    """Return the model's single-turn reply to `prompt`, spoken by a randomly drawn persona."""
    persona = random.choice(full_bios)
    turns = [tokenizer.encode(prompt)]
    with torch.no_grad():  # inference only
        reply_ids = sample_sequence(persona, turns, tokenizer, model)
    return tokenizer.decode(reply_ids, skip_special_tokens=True)


with open("../../data/knowledge_base/kb.json", "r", encoding="utf-8") as f:
    kb = json.load(f)

quest_records = []
for category, entries in kb.items():
    # Locations are not something a quest can ask the player to slay or collect.
    if category == "locations":
        continue
    for entry in entries:
        prompt = _quest_prompt(category, entry)
        reply = _generate_quest(prompt)
        print(reply)
        quest_records.append({"prompt": prompt, "output": reply})

# Build the frame once from plain records: rows get a unique 0..n-1 index
# (instead of every row carrying index 0), and an empty result is a valid
# empty frame rather than an error.
test_set = pd.DataFrame(quest_records, columns=["prompt", "output"])

hey there, i'm natalie, and i'm in need of some fresh cranberries for my latest fashion project, could you help me out? 
hey there, i heard you're quite the adventurer, would you be willing to collect 10 pieces of the freshest and most flavorful pumpkin spice for me?  - kian.
hey there, i heard you're quite the adventurer, would you be willing to collect 10 pieces of bixite for me? i need them for a new art project i'm working on. 
hey there, i've been looking for some fresh and delicious fall leaves to use in my latest project, could you bring me 10 of them?  - george.
hey there, farmer! i heard you're quite the adventurer, could you bring me 10 of them for a special recipe i'm working on?  - diana.
hey there, i heard you're quite the adventurer, would you be willing to collect 10 cranberry pips for me? i need them for a new recipe i'm working on. 
hey there, i heard you're quite the adventurer, could you bring me 10 of them for a new jazz club i'm working on?  - max.
hey there, i hea

In [106]:
# Write the prompt/output pairs to CSV. With pandas' default (index=True) the
# DataFrame index is written as the first, unnamed column.
test_set.to_csv("../../data/convo_tests/quests_outputs_4_2.csv")