In [1]:
from accelerate.utils import BnbQuantizationConfig
from accelerate import Accelerator, notebook_launcher
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, \
                        get_cosine_schedule_with_warmup, set_seed
import transformers
import optimum

from datasets import load_dataset,Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, \
                        get_cosine_schedule_with_warmup, set_seed
from peft import LoraConfig, TaskType, get_peft_model
from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
import logging
from torch.utils.data import DataLoader
from torch.optim import AdamW, SGD
from tqdm.notebook import tqdm
import torch
from torch.nn.utils.rnn import pad_sequence
import glob
from collections import OrderedDict
import re

import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import datetime
start_time = datetime.datetime.now()

In [3]:

def truncate_txt(text, length):
    text_list = text.split()
    
    if len(text_list) <= length:
        return text
    
    return " ".join(text_list[:length])


def gen_prompt(og_text, rewritten_text, truncate_length=200):
    
    # Truncate the texts to first 200 words for now
    # As we are having memory issues on Mixtral8x7b
    og_text = truncate_txt(og_text, truncate_length)
    rewritten_text = truncate_txt(rewritten_text, truncate_length)
    
    return f"""
You are given 2 essays, the Rewritten essay was created from the Original essay using the google Gemma model.
Analyzing the changes in style, theme, etc., please come up with a prompt that must have been used to guide the transformation from the original to the rewritten essay.
Start directly with the prompt, output should be one line only.

Original Essay:
\"""{og_text}\"""

Rewritten Essay:
\"""{rewritten_text}\"""

""".strip()

def formatting_func(example):
  output_texts = []
  for i in range(len(example['original_text'])):
    prompt = tokenizer.apply_chat_template(
      [{
        'role': 'user',
        'content' : gen_prompt(example['original_text'][i], example['rewritten_text'][i]) + '\n ',
      }],
      tokenize=False,
    )
    text = f"{prompt}{example['rewrite_prompt'][i]}{tokenizer.eos_token}"
    output_texts.append(text)
  return output_texts

In [4]:
# MODEL_PATH = "/kaggle/input/gemma/transformers/7b-it/2"
# MODEL_PATH = "distilbert/distilgpt2"
# MODEL_PATH = "gpt2"
# MODEL_PATH = "microsoft/phi-2"
# MODEL_PATH = "distilbert/distilroberta-base"
MODEL_PATH = 'mistralai/Mistral-7B-Instruct-v0.2'
# MODEL_PATH = "google/gemma-2b-it"
# MODEL_PATH = "google/flan-t5-small"
# MODEL_PATH = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"
# MODEL_PATH = "/kaggle/input/mixtral/pytorch/8x7b-instruct-v0.1-hf/1"
# MODEL_PATH = "/kaggle/input/llama-2/pytorch/7b-chat-hf/1"
# MODEL_PATH = "/kaggle/input/llama-2/pytorch/13b-chat-hf/1"

max_length = 1024

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.model_max_length = max_length


In [5]:
def create_data(df, tokenizer, split='train'):
  data = Dataset.from_pandas(df[[
    # 'reverse_prompt',
    'rewrite_prompt', 'original_text', 'rewritten_text'
  ]],split=split)

  
  data = data.map(lambda samples: tokenizer(samples["original_text"], max_length=max_length, truncation=True), batched=True)
  data = data.map(lambda samples: tokenizer(samples["rewritten_text"], max_length=max_length, truncation=True), batched=True)
  data = data.map(lambda samples: tokenizer(samples["rewrite_prompt"], max_length=max_length, truncation=True), batched=True)
  return data

train, test = (
  create_data(pd.read_csv('./input/gemma-rewrite-nbroad/train.csv'), tokenizer, 'train'),
  create_data(pd.read_csv('./input/gemma-rewrite-nbroad/test.csv'), tokenizer, 'test'),
)
print(len(train), len(test))

Map: 100%|██████████| 4109/4109 [00:04<00:00, 963.91 examples/s] 
Map: 100%|██████████| 4109/4109 [00:02<00:00, 1444.59 examples/s]
Map: 100%|██████████| 4109/4109 [00:00<00:00, 16000.57 examples/s]
Map: 100%|██████████| 457/457 [00:00<00:00, 1117.79 examples/s]
Map: 100%|██████████| 457/457 [00:00<00:00, 2015.79 examples/s]
Map: 100%|██████████| 457/457 [00:00<00:00, 15416.64 examples/s]

4109 457





In [6]:
lora_target_modules_dict = {
  'gpt2': ['c_attn'],
  'distilbert/distilgpt2': ['c_attn'],
  'distilbert/distilroberta-base': ['query', 'key', 'value'],
}
import json
os.makedirs('./settings', exist_ok=True)
json.dump(lora_target_modules_dict, open('./settings/lora_target_modules.json', 'w'))

In [7]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token_id)

2


In [8]:
len(test)

457

In [9]:

import os

batch_size = 3
grad_accumulation_steps = 64
num_epochs = 2000
lr = 5e-5
checkpointing_steps = 500
save_path = os.path.join('./working/trained_models/', MODEL_PATH)
r = 128
lora_alpha = 128
lora_dropout = 0.05

# If ckpt_path is a real path (os.path.isfile(ckpt_path) is True),
# then the checkpoint will be loaded
ckpt_path = os.path.join(save_path, 'checkpoint.pth')
print(ckpt_path)
kwargs = {
'batch_size':batch_size, 'num_epochs':num_epochs, 'lr':lr, 'grad_accumulation_steps':grad_accumulation_steps, 
'checkpointing_steps':checkpointing_steps, 'save_path':save_path, 'ckpt_path':ckpt_path, 'r':r, 'lora_alpha':lora_alpha, 'lora_dropout':lora_dropout
}


./working/trained_models/mistralai/Mistral-7B-Instruct-v0.2/checkpoint.pth


In [10]:
def chat(prompt, max_new_token=256, **generate_args):

  messages = [
      {
          "role": "user",
          "content": prompt + '\n ',
      }
  ]
  encoded_input = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to('cuda')
  # print(encoded_input.shape)
  # print(torch.max(encoded_input), tokenizer.vocab_size)
  with torch.no_grad():
      encoded_output = model.generate(encoded_input, max_new_tokens=max_new_token, do_sample=True, pad_token_id=tokenizer.eos_token_id,
                                      **generate_args)
  
  decoded_output = tokenizer.batch_decode(encoded_output, skip_special_tokens=False)[0]
  return decoded_output
  decoded_output = re.sub(r"[\s\S]*\[\/INST\]", '', decoded_output, 1).replace(prompt, '')
  
  return decoded_output

In [25]:
i = 20
# prompt = (f"Give me the prompt used to have the rewritten text from the original text.\nORIGINAL TEXT:\n{data['original_text'][i]}\n\nREWRITTEN TEXT:\n{data['rewritten_text'][i]}\n\nPROMPT:\n")
prompt = gen_prompt(test['original_text'][i], test['rewritten_text'][i])
len(tokenizer.encode(prompt))

626

In [26]:
print(prompt)

You are given 2 essays, the Rewritten essay was created from the Original essay using the google Gemma model.
Analyzing the changes in style, theme, etc., please come up with a prompt that must have been used to guide the transformation from the original to the rewritten essay.
Start directly with the prompt, output should be one line only.

Original Essay:
"""I would n't say I was in love with Brenna. While she certainly was n't ugly she was n't the most beautiful girl. She was a little thicker and dressed like an old lamp shade in grandma's basement on her best days, but we'd known each other since 2nd grade and she was more or less one of the guys. It all started when my parents were out of town. I decided to have a few friends over. We were going to play some COD and Assassins Creed and maybe some FIFA and Madden. It was most just going be me hanging out with Mike and Trey and Brenna as well as one friend they could bring, as per our usual rules. Unfortunately Mike and his big mout

In [27]:
print(test['rewrite_prompt'][i])

Write like Maya Angelou: Infuse the essay with the lyrical and profound voice of Maya Angelou, reflecting on human dignity and resilience.


In [30]:
torch.cuda.empty_cache()
import gc
gc.collect(2)
try: del model
except: pass

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    # bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, 
                                                 quantization_config=quantization_config, 
                                                 torch_dtype=torch.float16
                                                 )
res = chat(prompt, top_k=5)
print(res)


`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 3/3 [00:14<00:00,  4.89s/it]


<s> [INST] You are given 2 essays, the Rewritten essay was created from the Original essay using the google Gemma model.
Analyzing the changes in style, theme, etc., please come up with a prompt that must have been used to guide the transformation from the original to the rewritten essay.
Start directly with the prompt, output should be one line only.

Original Essay:
"""I would n't say I was in love with Brenna. While she certainly was n't ugly she was n't the most beautiful girl. She was a little thicker and dressed like an old lamp shade in grandma's basement on her best days, but we'd known each other since 2nd grade and she was more or less one of the guys. It all started when my parents were out of town. I decided to have a few friends over. We were going to play some COD and Assassins Creed and maybe some FIFA and Madden. It was most just going be me hanging out with Mike and Trey and Brenna as well as one friend they could bring, as per our usual rules. Unfortunately Mike and h

In [29]:
torch.cuda.empty_cache()
import gc
gc.collect(2)
try: del model
except: pass

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    # bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(save_path, 
                                                 quantization_config=quantization_config, 
                                                 torch_dtype=torch.float16
                                                 )
res = chat(prompt, max_new_token=128, top_k=5)
if 'Prompt:' in res:
  print(res.split('Prompt:')[1])
elif '[/INST]' in res:
  print(res.split('[/INST]')[1].strip().split('\n')[0])
else:
  print(res.strip().split('\n')[0])

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 3/3 [00:14<00:00,  4.88s/it]


 Rewrite the essay as a poem, focusing on the themes of human connection, dignity, and resilience, using vivid imagery and metaphors. Use the phrases "tapestry of life" and "threads intertwine" to connect the different parts of the story.</s>


In [16]:
res

'<s> [INST] You are given 2 essays, the Rewritten essay was created from the Original essay using the google Gemma model.\nAnalyzing the changes in style, theme, etc., please come up with a prompt that must have been used to guide the transformation from the original to the rewritten essay.\nStart directly with the prompt, output should be one line only.\n\nOriginal Essay:\n"""Everything was perfect. This was going to be the night that everything would be moving in the right direction. It just had n\'t been the same since I met her four years ago, but I had found another that I had told myself was even better. In a short time, I would finally make the commitment that I had been afraid to for too long. The room had been cleaned and arranged perfectly. I breathe in deeply and take in what my hands had created. It was n\'t much, but it was the absolute best that I could do. I was so anxious about everything going right, I wrote down what I would say when the opportunity showed itself. I s

In [17]:
torch.cuda.empty_cache()

In [None]:
import time
time.sleep(1000)

KeyboardInterrupt: 

# Test


In [17]:
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F

t5 = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [18]:
def calc_score(label, pred):
  label_embed = t5.encode(label)
  pred_embed = t5.encode(pred)

  def sharp_cosine_similarity(emb1, emb2, p=3.0):
    cos_sim = F.cosine_similarity(emb1, emb2)
    return torch.sign(cos_sim) * (torch.abs(cos_sim) + 1e-8) ** p

  return sharp_cosine_similarity(torch.tensor(label_embed), torch.tensor(pred_embed)).mean().item()

In [21]:
mean_prompt = 'Please improve the following text, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.' 

In [33]:
calc_score(
  ['Write like Maya Angelou: Infuse the essay with the lyrical and profound voice of Maya Angelou, reflecting on human dignity and resilience.'],
  ['Rewrite the essay as a poem, focusing on the themes of human connection, dignity, and resilience, using vivid imagery and metaphors. Use the phrases "tapestry of life" and "threads intertwine" to connect the different parts of the story.'],
)

0.6632662415504456

In [34]:
calc_score(
  ['Write like Maya Angelou: Infuse the essay with the lyrical and profound voice of Maya Angelou, reflecting on human dignity and resilience.'],
  ['Transform the original essay into a captivating story that showcases the beauty of human connections and the unexpected turns fate takes in shaping our destinies.'],
)

0.5592873096466064

In [35]:
calc_score(
  ['Write like Maya Angelou: Infuse the essay with the lyrical and profound voice of Maya Angelou, reflecting on human dignity and resilience.'],
  [mean_prompt],
)

0.5713196396827698

In [None]:
tdf = test.to_pandas()

In [None]:
tdf = tdf.iloc[:50]

In [None]:
calc_score(tdf['rewrite_prompt'], tdf['mean_prompt'])

0.5065957903862

In [None]:
tdf['mean_prompt'] = mean_prompt

In [None]:
import gc

try: del model
except: pass
torch.cuda.empty_cache()
gc.collect(2)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    # bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(save_path, 
                                                 quantization_config=quantization_config, 
                                                 torch_dtype=torch.float16
                                                 )

In [None]:
from tqdm import tqdm

k = 20
tdf[f'pred_top_{k}'] = mean_prompt
for i in tqdm(range(len(tdf))):
  prompt = gen_prompt(tdf['original_text'][i], tdf['rewritten_text'][i])
  # print(prompt)
  res = chat(prompt, max_new_token=128, top_k = k)
  res = res.strip().split('\n')[0]
  tdf.loc[i, f'pred_top_{k}'] = res
  # if i > 3: break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tdf[f'pred_top_{k}'] = mean_prompt
100%|██████████| 50/50 [17:49<00:00, 21.39s/it]


In [None]:
calc_score(tdf['rewrite_prompt'], tdf['pred_top_20'])

0.5963925123214722