***
### Imports and Globals
***

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric
from datasets.utils import logging
import torch

import numpy as np
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Reproducibility -------------------------------------------------------
RANDOM_SEED = 99

# --- Data ------------------------------------------------------------------
# Directory holding the CSV splits; the alternative is 'casino_w_task_data'.
DATA_FILES = 'casino'

# --- Sequence budgets ------------------------------------------------------
# Max length sequence in baseline_casino.csv is 824, so 1024 leaves headroom.
MAX_INPUT_LEN = 1024
MAX_OUTPUT_LEN = 192

# --- Model -----------------------------------------------------------------
# Base checkpoint name; "t5-base" is the other option that was tried.
MODEL_NAME = "t5-small"
MODEL_DIR = 'model_saves'

# Directory of the saved fine-tuned weights loaded below; the alternative is
# './final_model_opponent_pref/'.
MODEL_STATE_DIR = './final_model/'

# Device selection is currently disabled:
# DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

***
### Creating the Model/Tokenizer
***

In [3]:
# Restore the saved model weights from MODEL_STATE_DIR.
model = T5ForConditionalGeneration.from_pretrained(MODEL_STATE_DIR)

# Load the tokenizer from the same directory, using '<EOS>' as the
# end-of-sequence token and MAX_INPUT_LEN as the tokenizer's maximum length.
tokenizer = T5Tokenizer.from_pretrained(
    MODEL_STATE_DIR,
    eos_token='<EOS>',
    model_max_length=MAX_INPUT_LEN,
)

# Register the markup tokens used in the input sequences as special tokens.
special_tokens = {
    'additional_special_tokens': ['<CONTEXT>', '<HISTORY>', '<YOU>', '<THEM>'],
}
tokenizer.add_special_tokens(special_tokens)

# Keep the embedding matrix and the model's EOS id in sync with the tokenizer.
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(32103, 512)

***
### Data Processing
***

In [None]:
# Construct Dataset
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs with target labels.

    Args:
        examples: Batch mapping that provides the 'input_seq' (source text)
            and 'response' (target text) columns.

    Returns:
        The tokenizer output for 'input_seq' (truncated to MAX_INPUT_LEN)
        with an added "labels" entry holding the token ids of 'response'
        (truncated to MAX_OUTPUT_LEN).
    """
    # Source side.
    encoded = tokenizer(examples['input_seq'], truncation=True, max_length=MAX_INPUT_LEN)

    # Target side, tokenized inside the target-tokenizer context.
    # NOTE(review): as_target_tokenizer() is deprecated in newer transformers
    # releases in favour of the `text_target=` argument - confirm the installed
    # version before migrating.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(examples['response'], truncation=True, max_length=MAX_OUTPUT_LEN)

    encoded["labels"] = target_encoding["input_ids"]
    return encoded


# Loading Dataset
# All three CSV splits live under the DATA_FILES directory.
b_url = f'{DATA_FILES}/'
raw_datasets = load_dataset(
    "csv",
    data_files={split: f'{b_url}{split}.csv' for split in ('train', 'eval', 'test')},
).shuffle(seed=RANDOM_SEED)  # fixed seed so the shuffled order is repeatable

# Tokenize every split in batches.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [None]:
# Show the tokenized dataset object as the cell output for a quick inspection.
tokenized_datasets

***
### Testing
***

In [None]:
# For qualitative evaluation
# Pick one test example, generate greedily, and print the input, the
# ground-truth response and the model output one after another.
index = 7
sample = tokenized_datasets['test'][index]

input_ids = tokenizer(sample['input_seq'], return_tensors="pt").input_ids
output_encodings = model.generate(
    input_ids,
    num_beams=1,
    do_sample=False,
    max_length=MAX_OUTPUT_LEN,
)
output = tokenizer.decode(output_encodings[0])

report_sections = (
    ('Input Sentence:', sample['input_seq']),
    ('GT Output Sentence:', sample['response']),
    ('Output Sentence:', output),
)
for heading, text in report_sections:
    print('-'*50)
    print(heading)
    print(text)

In [None]:
# Example for inferencing below
# Encode a single utterance, run greedy generation on CPU and show the decoded
# reply as the cell output.
encodings = tokenizer.encode('<THEM> Hello may I please have 15 firewood and 3 water', max_length=MAX_INPUT_LEN, truncation=True)

# Wrap the ids in a batch of size one. The generation options belong to
# model.generate(); passing them to torch.tensor() raises a TypeError.
input_batch = torch.tensor([encodings]).to(torch.device('cpu'))
tokenizer.decode(model.generate(input_batch, num_beams=1, do_sample=False, max_length=MAX_OUTPUT_LEN)[0])

In [None]:
# EXAMPLES FOR DECODING METHODS
# Run the same test example through several decoding strategies and print
# each result together with its whitespace-token length.
index = 500
example = tokenized_datasets['test'][index]

# Truncate the prompt to the *input* budget (MAX_INPUT_LEN), matching the
# preprocessing step; MAX_OUTPUT_LEN only bounds the generated text.
input_ids = tokenizer(example['input_seq'], return_tensors="pt", max_length=MAX_INPUT_LEN, truncation=True).input_ids

print('Input Sentence:')
print(example['input_seq'])

print('-'*50)
print('GT Output Sentence:')
print(example['response'])

# (heading to print, generation options) for each decoding strategy.
decoding_setups = [
    ('Output Sentence with greedy decoding (default):', dict(num_beams=1, do_sample=False)),
    ('Output Sentence with 10-beam search:', dict(num_beams=10, do_sample=False)),
    ('Output Sentence with multinomial sampling:', dict(num_beams=1, do_sample=True)),
    ('Output Sentence with 10-beam search multinomial sampling', dict(num_beams=10, do_sample=True)),
]

# Seed torch so the sampling-based strategies give repeatable output.
torch.manual_seed(RANDOM_SEED)

for heading, generate_kwargs in decoding_setups:
    print('-'*50)
    output_encodings = model.generate(input_ids, max_new_tokens=MAX_OUTPUT_LEN, **generate_kwargs)
    # max_length/truncation are encoding-time options, so they are not passed
    # to decode(); the output length is already bounded by max_new_tokens.
    output = tokenizer.decode(output_encodings[0])
    print(heading)
    print(output)
    print(f'Length: {len(output.split())}')