In [16]:
!pip install --upgrade transformers datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
from datasets import load_dataset

raw_SODA = load_dataset("allenai/soda")

In [18]:
# A filtering function, for parsing out solo conversations and triple conversations.
def duo_conversations_only(dataset):
  has_PersonX = dataset["PersonX"] is not None and dataset["PersonX"] != ""
  has_PersonY = dataset["PersonY"] is not None and dataset["PersonY"] != ""
  has_PersonZ = dataset["PersonZ"] is not None and dataset["PersonZ"] != ""

  return has_PersonX and has_PersonY and not has_PersonZ

duo_SODA = raw_SODA.filter(duo_conversations_only)

duo_SODA["train"]

Dataset({
    features: ['head', 'relation', 'tail', 'literal', 'narrative', 'dialogue', 'speakers', 'PersonX', 'PersonY', 'PersonZ', 'original_index', 'split', 'head_answer', 'pmi_head_answer', 'relation_tail_answer', 'pmi_relation_tail_answer'],
    num_rows: 348572
})

In [19]:
raw_SODA_train = duo_SODA["train"]
raw_SODA_valid = duo_SODA["validation"]
raw_SODA_test = duo_SODA["test"]

# TRAIN SET
narrative_train = raw_SODA_train["narrative"]
PersonX_train = raw_SODA_train["PersonX"]
PersonY_train = raw_SODA_train["PersonY"]
dialogue_train = raw_SODA_train["dialogue"]

# TEST SET
narrative_test = raw_SODA_test["narrative"]
PersonX_test = raw_SODA_test["PersonX"]
PersonY_test = raw_SODA_test["PersonY"]
dialogue_test = raw_SODA_test["dialogue"]

# VALIDATION SET
narrative_valid = raw_SODA_valid["narrative"]
PersonX_valid = raw_SODA_valid["PersonX"]
PersonY_valid = raw_SODA_valid["PersonY"]
dialogue_valid = raw_SODA_valid["dialogue"]

In [20]:
def imagineTagMaker(PersonX, PersonY):
  imagineTag = "You are " + PersonY + " talking to " + PersonX +"."
  return imagineTag

# TRAINING SET
imagine_train = []
for i in range(len(PersonX_train)):
  imagine_train.append(imagineTagMaker(PersonX_train[i], PersonY_train[i]))

# TEST SET
imagine_test = []
for i in range(len(PersonX_test)):
  imagine_test.append(imagineTagMaker(PersonX_test[i], PersonY_test[i]))

# VALIDATION SET
imagine_valid = []
for i in range(len(PersonX_valid)):
  imagine_valid.append(imagineTagMaker(PersonX_valid[i], PersonY_valid[i]))

In [21]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://download.pytorch.org/whl/cu118


In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small", model_max_length=128)

In [33]:
import torch

def createInputSet(imagineSet, narrativeSet, dialogueSet, max_seq_length=128):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_inputs = []

    for i in range(len(imagineSet)):
        # Process the first round of dialogue
        rawString = "dialogue: " + imagineSet[i] + " <SEP> " + narrativeSet[i] + " <SEP> " + dialogueSet[i][0] + " <TURN> "
        tokenizedString = tokenizer(rawString, padding='max_length', truncation=True, max_length=max_seq_length)
        with tokenizer.as_target_tokenizer():
            tokenizedResponse = tokenizer(dialogueSet[i][1], padding='max_length', truncation=True, max_length=max_seq_length)
        tokenizedString["labels"] = tokenizedResponse["input_ids"]
        model_inputs.append(tokenizedString)

        # Process the second round of dialogue
        if len(dialogueSet[i]) >= 4:
            rawString += " <TURN> ".join(dialogueSet[i][1:4]) + " <TURN> "
            tokenizedString = tokenizer(rawString, padding='max_length', truncation=True, max_length=max_seq_length)
            with tokenizer.as_target_tokenizer():
                tokenizedResponse = tokenizer(dialogueSet[i][3], padding='max_length', truncation=True, max_length=max_seq_length)
            tokenizedString["labels"] = tokenizedResponse["input_ids"]
            model_inputs.append(tokenizedString)

    return model_inputs

model_train = createInputSet(imagine_train, narrative_train, dialogue_train, max_seq_length=128)
model_eval = createInputSet(imagine_valid, narrative_valid, dialogue_valid, max_seq_length=128)


In [34]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [35]:
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from pathlib import Path  # Import Path to handle file paths

args = Seq2SeqTrainingArguments(
    output_dir=Path("/kaggle/working") / "your_desired_directory",  # Set the output directory to your desired location
    evaluation_strategy="epoch",
    learning_rate=0.001,
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    weight_decay=0.01,
    save_total_limit=15,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=model_train,
    eval_dataset=model_eval,
)




In [36]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.2226,0.202152
2,0.2013,0.187056
3,0.1909,0.180402


TrainOutput(global_step=52287, training_loss=0.21764291513099987, metrics={'train_runtime': 21861.0916, 'train_samples_per_second': 95.669, 'train_steps_per_second': 2.392, 'total_flos': 7.076454363404698e+16, 'train_loss': 0.21764291513099987, 'epoch': 3.0})

In [91]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_checkpoint_path = "/kaggle/working/your_desired_directory/checkpoint-52000"

tokenizer = AutoTokenizer.from_pretrained("t5-small") 
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_path)

test_input = "I just feel so stuck in my job."
input_ids = tokenizer.encode(test_input, return_tensors="pt")

output = model.generate(input_ids)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print("Model Output:", decoded_output)


Model Output: That sounds really tough. Do you want to talk about what's been going on?
