In [2]:
#!pip install openprompt
#!pip install torch
#!pip install scikit-learn
#!pip install git+https://github.com/corolla-johnson/mkultra.git#egg=mkultra --log PIP_LOG

In [3]:
import os
import json
import random
import torch

import pandas as pd

from datasets import load_dataset
from transformers import GPT2TokenizerFast, Adafactor
from mkultra.tuning import GPTNeoPromptTuningLM, GPT2PromptTuningLM
from mkultra.soft_prompt import SoftPrompt
from mkultra.trainers import WorldInfoTrainer

In [None]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [4]:
dataset = load_dataset("catasaurus/paraphrase-dataset2")

inputs = []
targets = []
for inpt, target in zip(dataset['train']['input_text'][:1000], dataset['train']['target_text'][:1000]):
    inputs.append(inpt)
    targets.append(target)
    
data = {'input': inputs,
        'target': targets}

df = pd.DataFrame(data)

# create csv file
df.to_csv('paraphrase-dataset.csv')

# create json file
dictionary = {'dataset': []}
for inpt, target in zip(df['input'], df['target']):
    dictionary['dataset'].append({"call": f'Sentence : {inpt}\nParaphrase : ', "response": target})

with open('paraphrase-dataset.json','w') as outfile:
    json.dump(dictionary, outfile)

Found cached dataset csv (/root/.cache/huggingface/datasets/catasaurus___csv/catasaurus--paraphrase-dataset2-124c70288202401c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
#-----------------------#
#  Training Parameters  #
#-----------------------#

# This decides the length of your soft prompt in tokens.
# They will be initialized from the first n tokens of your dataset.
n_tokens = 100

# Set this to a string to start with a specific tokenized string.
# Be aware of the number of tokens.
initial_prompt = """A paraphrase is a different way to say the same thing. Paraphrase the following sentence.\n"""

if initial_prompt is not None:
    print(f"Initial prompt length: {len(tokenizer.encode(initial_prompt))} tokens")

# Decide the length of your training blocks in tokens.
# Safe sizes for gpt-neo-2.7B-halved:
#  - 700 on a Colab T4 (16GB)
#  - 400 on a Colab K80 (12GB)
#  - 32 on a GTX1080 (8GB)
# If it seems a bit small, don't worry!
# Soft prompts can be moved forward in context for the best effect.
block_size = 32

# Name your soft prompt project.
sp_name = 'prompt-tuning-paraphrase-1'

# What's the name of model you'll be using?
model_name = 'gpt2'

# Specify the model directory or huggingface name.
model_dir = "gpt2"

model_type = 'gpt2'

# Specify the path to the text file used for training.
text_path = 'paraphrase-dataset.json'

# Specify the project directory.
project_dir = f"./{sp_name}-{model_name}/"

# Checkpoint interval in steps.
checkpoint_interval = 1

# Evaluation interval in steps.
eval_interval = 1

# How many blocks to use for evaluation.
eval_blocks = 20

# Adafactor hyperparameters
optimizer_params = {
    # Fixed learning rate, recommend 1e-4 to 1e-3
    "lr": 1e-3,
    
    # 1st momentum, recommend 0
    "beta1": 0,

    # 2nd momentum decay schedule, recommend -0.3 (lower is slower)
    "decay_rate": -0.3,

    # Weight decay, recommend 1e-2 (WI is sensitive to overfitting)
    "weight_decay": 1e-2,
    
    # Update scaling, recommend False
    "scale_parameter": False,
    
    # Built-in LR scheduler, recommend False
    "relative_step": False
    }

# Gradient accumulation steps.
base_acc_steps = 30

# Gradient accumulation schedule.
# If '0', use a fixed gradient accumulation.
acc_doubling_rate = 0

# Stop training after this many evals without improvement.
# If '0', don't stop early.
plateau_steps = 10

scheduler_params = {
   "num_warmup_steps": 10,
   "num_cycles": 4,
   "num_training_steps": 240
}

Initial prompt length: 21 tokens


In [6]:
#@title Load model

if 'model' not in globals():
    if model_type == 'gpt2':
        model = GPT2PromptTuningLM.from_pretrained(model_dir).half().to("cuda")
    elif model_type == 'gpt-neo':
        model = GPTNeoPromptTuningLM.from_pretrained(model_dir).half().to("cuda")
    else:
        raise "Invalid model type"

In [7]:
#@title Initialize project
#@markdown This will load the latest checkpoint if the project directory already exists.

filename_for_checkpoint = lambda step: f"{sp_name}-{model_name}-step-{step}.json"
loaded_sp = None
project_files = None

# Look for existing project directory
try:
    os.makedirs(project_dir)
    print(f"Created project directory at {project_dir}")
except FileExistsError:
    print(f"Found project directory at {project_dir}")

# Look for existing checkpoints
project_files = os.listdir(project_dir)
if project_files is not None:
    checkpoint_files = [check_file for check_file in project_files if ('-step-' in check_file) ]

    if len(checkpoint_files) > 0:
        highest_step = max([ int(check_file[check_file.rfind('-step-')+6:-5]) for check_file in checkpoint_files ])
        loaded_sp = SoftPrompt.from_file( os.path.join(project_dir, filename_for_checkpoint(highest_step)) )
        print(f"Loading latest checkpoint: {highest_step}")
    else:
        print("No checkpoints found")

2023-03-10 13:21:42.678061: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-10 13:21:42.678095: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Created project directory at ./prompt-tuning-paraphrase-1-gpt2/
No checkpoints found


In [8]:
#@title Initialize soft prompt in model
#@markdown If a checkpoint is present, use that.
if loaded_sp is None:
    if initial_prompt is None:
        model.initialize_soft_prompt(n_tokens=n_tokens)
    else:
        initial_sp = SoftPrompt.from_string(initial_prompt, model, tokenizer)
        print(f"Initial prompt length: {len(initial_sp)}")
        model.set_soft_prompt(initial_sp)

    sp_step = 0
    eval_loss = 100
else:
    model.set_soft_prompt(loaded_sp)
    sp_step = loaded_sp._metadata['step']
    eval_loss = loaded_sp._metadata['loss']

Initial prompt length: 21


In [9]:
with open("paraphrase-dataset.json") as file:
    blocks = json.load(file)

for block in blocks['dataset']:
    block['call'] = tokenizer(str(block['call']), return_tensors="pt").input_ids.to(model.device)
    block['response'] = tokenizer(str(block['response']), return_tensors="pt").input_ids.to(model.device)

In [10]:
arranged_blocks = list()

for block in blocks['dataset']:
    call = block['call']
    response = block['response']
    ignore_len = call.shape[-1]

    # Cat spacing and call first
    input_ids = torch.cat([call, response], dim=1)
    labels = torch.cat([torch.full((1,ignore_len),-100).to(model.device), response], dim=1)

    arranged_blocks.append((input_ids, labels))

random.shuffle(arranged_blocks)

In [11]:
# Adafactor hyperparameters
optimizer_params = {
    # Fixed learning rate, recommend 1e-4 to 1e-3
    "lr": 1e-5,
    
    # 1st momentum, recommend 0
    "beta1": 0,

    # 2nd momentum decay schedule, recommend -0.3 (lower is slower)
    "decay_rate": -0.3,

    # Weight decay, recommend 1e-2 (WI is sensitive to overfitting)
    "weight_decay": 1e-1,
    
    # Update scaling, recommend False
    "scale_parameter": False,
    
    # Built-in LR scheduler, recommend False
    "relative_step": False
    }

# Feed soft params to optimizer
optimizer_params['params'] = [model.get_soft_params()]
optimizer = Adafactor(**optimizer_params)
optimizer.state['step'] = sp_step

In [12]:
model.train()

for i in range(50):
    random.shuffle(arranged_blocks)

    for input_ids, labels in arranged_blocks:
        model(input_ids=input_ids, labels=labels).loss.backward()

    # Always accumulate gradient for the entire dataset
    optimizer.step()
    optimizer.zero_grad()

    # Evaluate
    eval_loss = 0
    with torch.no_grad():
        for input_ids, labels in arranged_blocks:
            eval_loss += model(input_ids=input_ids, labels=input_ids).loss.item()
    eval_loss /= len(arranged_blocks)
    print(f"Epoch {i+1} loss: {eval_loss}")

Epoch 1 loss: 3.672533203125
Epoch 2 loss: 3.68101171875
Epoch 3 loss: 3.66973828125
Epoch 4 loss: 3.6799375
Epoch 5 loss: 3.680640625
Epoch 6 loss: 3.684552734375
Epoch 7 loss: 3.677462890625
Epoch 8 loss: 3.684361328125
Epoch 9 loss: 3.690923828125
Epoch 10 loss: 3.680396484375
Epoch 11 loss: 3.684556640625
Epoch 12 loss: 3.676228515625
Epoch 13 loss: 3.672009765625
Epoch 14 loss: 3.683947265625
Epoch 15 loss: 3.68124609375
Epoch 16 loss: 3.683029296875
Epoch 17 loss: 3.67472265625
Epoch 18 loss: 3.68812890625
Epoch 19 loss: 3.679875
Epoch 20 loss: 3.693619140625
Epoch 21 loss: 3.676841796875
Epoch 22 loss: 3.684478515625
Epoch 23 loss: 3.68096875
Epoch 24 loss: 3.687767578125
Epoch 25 loss: 3.6808515625
Epoch 26 loss: 3.67787109375
Epoch 27 loss: 3.68660546875
Epoch 28 loss: 3.68065234375
Epoch 29 loss: 3.687333984375
Epoch 30 loss: 3.68080859375
Epoch 31 loss: 3.688228515625
Epoch 32 loss: 3.686248046875
Epoch 33 loss: 3.680291015625
Epoch 34 loss: 3.676966796875
Epoch 35 loss: 3.6

In [13]:
# Try generating with your model
model.eval()

test = "Sentence : What's the weather like today ?\nParaphrase: "

call = tokenizer(test, return_tensors="pt").input_ids.cuda()

basic_output = model.generate(
    input_ids=call,
    do_sample=True,
    min_length=call.shape[-1] + 50,
    max_length=call.shape[-1] + 50,
    temperature=0.8,
    tfs = 0.9,
    repetition_penalty = 2.0
)

print(tokenizer.decode(basic_output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sentence : What's the weather like today?
Paraphrase:  I know there are some clouds and heavy winds in my area. There isn't much wind to worry about right now, especially with a sunny day on this earth when it is at its best. I'll leave you all that up for later next
