In [1]:
import os
AVAILABLE_GPUS = '5'
os.environ['CUDA_VISIBLE_DEVICES'] = AVAILABLE_GPUS

import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, GPTNeoXForCausalLM

# Print the name of every CUDA device that is visible to this process
# (visibility is restricted above through CUDA_VISIBLE_DEVICES).
for device_index in range(torch.cuda.device_count()):
    device_properties = torch.cuda.get_device_properties(device_index)
    print(device_properties.name)

NVIDIA RTX A6000


In [2]:
# Checkpoint to evaluate. Alternatives are kept commented out together with
# the author's size notes; uncomment exactly one assignment.
# model_name_or_path = "OpenAssistant/pythia-12b-sft-v8-rlhf-2k-steps" #23GB
# model_name_or_path = "OpenAssistant/pythia-12b-sft-v8-2.5k-steps" #23GB
# model_name_or_path = "berkeley-nest/Starling-LM-7B-alpha" #28GB
model_name_or_path = 'openchat/openchat_3.5' #28GB

# Download/cache location used for the non-OpenAssistant checkpoints.
hf_cache_dir = '/scratch/henrypapadatos'

if "OpenAssistant" in model_name_or_path:
    model = GPTNeoXForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map='auto')
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left", legacy=False)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map="auto", cache_dir=hf_cache_dir)
    # Prompts are generated in padded batches. A decoder-only model continues
    # from the last token of each row, so padding must be on the left; request
    # it explicitly (as the OpenAssistant branch does) instead of relying on
    # whatever default the checkpoint's tokenizer config carries.
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left", cache_dir=hf_cache_dir)
    # Reuse EOS as the padding token so that batched tokenization can pad.
    tokenizer.pad_token_id=tokenizer.eos_token_id


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
import pandas as pd
import json

# Maximum number of dataset lines to read.
cut_off = 300

# Sampling temperature: 0.2 for OpenAssistant checkpoints, 0.5 for any other model.
if "OpenAssistant" in model_name_or_path:
    temperature = 0.2
else:
    temperature = 0.5

def load_from_jsonl(file_name: str, cut_off: int) -> list[dict]:
    """Load at most ``cut_off`` records from the start of a JSON Lines file.

    Args:
        file_name: Path of the ``.jsonl`` file. It is decoded as UTF-8.
        cut_off: Maximum number of leading lines to parse. Zero or a negative
            value yields an empty list.

    Returns:
        The parsed value of each line, in file order.

    Raises:
        ValueError: If one of the parsed lines is not valid JSON. The message
            names the 1-based line number and the file, and the original
            ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    def load_json_line(line: str, i: int, file_name: str):
        """Parse one line, re-raising decode errors with file/line context."""
        try:
            return json.loads(line)
        except json.JSONDecodeError as exc:
            # Catch only decode failures: a bare ``except`` would also turn
            # KeyboardInterrupt/SystemExit into a misleading ValueError.
            raise ValueError(f"Error in line {i+1}\n{line} of {file_name}") from exc

    data = []
    # Explicit encoding so non-ASCII text loads identically on every platform.
    with open(file_name, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= cut_off:
                # Stop reading here instead of scanning the rest of the file.
                break
            data.append(load_json_line(line, i, file_name))
    return data

def get_prompt(d, model_name=None):
    """Wrap a dataset record's first message in the chat template of a model.

    Args:
        d: Dataset record; the text is read from ``d['prompt'][0]['content']``.
        model_name: Model identifier used to select the template. Defaults to
            the notebook-level ``model_name_or_path``.

    Returns:
        The templated prompt, ending with the assistant prefix
        ``"Here are my thoughts on the poem:\\n"``, for OpenAssistant,
        Starling and openchat models. For any other model the raw message
        text is returned unchanged.
    """
    if model_name is None:
        model_name = model_name_or_path
    prompt = d['prompt'][0]['content']
    if "OpenAssistant" in model_name:
        prompt = '<|prompter|>' + prompt + '<|endoftext|>\n<|assistant|>Here are my thoughts on the poem:\n'
    # Each substring needs its own membership test: the previous
    # `"Starling" or 'openchat' in ...` was always true because the non-empty
    # literal "Starling" is truthy on its own.
    elif "Starling" in model_name or 'openchat' in model_name:
        prompt = 'GPT4 Correct User:' + prompt + '<|end_of_turn|>\nGPT4 Correct Assistant: Here are my thoughts on the poem:\n'
    return prompt

# Location of the dataset file, relative to the working directory.
dataset_filename: str = "feedback_poem.jsonl"
dataset_path = os.path.join("datasets", dataset_filename)

# Read at most `cut_off` records, then build one templated prompt per record.
dataset = load_from_jsonl(dataset_path, cut_off)
print("Number of samples:",len(dataset))
prompts = list(map(get_prompt, dataset))

Number of samples: 300


In [6]:
# Number of prompts sent to the model per generate() call.
BATCH_SIZE = 5
# Ceiling division so that a final, partially filled batch is counted too.
num_batches = (len(prompts) + BATCH_SIZE - 1) // BATCH_SIZE

# One entry per batch; each entry is the list of decoded strings for that batch.
output_array = []
for i in range(0, len(prompts), BATCH_SIZE):
    print("step: " +str(i//BATCH_SIZE) + " out of: " +str(num_batches))
    # Send the inputs to the device the model reports rather than a bare "cuda".
    model_inputs = tokenizer(prompts[i:i+BATCH_SIZE], return_tensors="pt", padding=True).to(model.device)
    # Sampled decoding; max_length limits prompt and completion tokens together.
    generated_ids = model.generate(**model_inputs, max_length=1200, pad_token_id=tokenizer.eos_token_id, do_sample=True, temperature=temperature)
    # Decoded text contains the prompt followed by the completion.
    output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    output_array.append(output)

    # for i in range(5):
    # #     print(prompts[i])
    #     print(output[i].split('Here are my thoughts on the poem:')[1])
    #     print("#############################################")

step: 0 out of: 60
step: 1 out of: 60
step: 2 out of: 60
step: 3 out of: 60
step: 4 out of: 60
step: 5 out of: 60
step: 6 out of: 60
step: 7 out of: 60
step: 8 out of: 60
step: 9 out of: 60
step: 10 out of: 60
step: 11 out of: 60
step: 12 out of: 60
step: 13 out of: 60
step: 14 out of: 60
step: 15 out of: 60
step: 16 out of: 60
step: 17 out of: 60
step: 18 out of: 60
step: 19 out of: 60
step: 20 out of: 60
step: 21 out of: 60
step: 22 out of: 60
step: 23 out of: 60
step: 24 out of: 60
step: 25 out of: 60
step: 26 out of: 60
step: 27 out of: 60
step: 28 out of: 60
step: 29 out of: 60
step: 30 out of: 60
step: 31 out of: 60
step: 32 out of: 60
step: 33 out of: 60
step: 34 out of: 60
step: 35 out of: 60
step: 36 out of: 60
step: 37 out of: 60
step: 38 out of: 60
step: 39 out of: 60
step: 40 out of: 60
step: 41 out of: 60
step: 42 out of: 60
step: 43 out of: 60
step: 44 out of: 60
step: 45 out of: 60
step: 46 out of: 60
step: 47 out of: 60
step: 48 out of: 60
step: 49 out of: 60
step: 50 o

In [7]:
# Assistant prefix that terminates every templated prompt; everything after
# its first occurrence in the decoded text is the model's own response.
RESPONSE_MARKER = 'Here are my thoughts on the poem:\n'

# flatten output array (one list per batch -> one flat list of decoded strings)
flat_output_array = [item for sublist in output_array for item in sublist]

# remove prompt from output
# maxsplit=1 cuts only at the first marker, so a response that repeats the
# phrase is kept whole instead of being truncated at the repetition. Indexing
# with [-1] keeps the full text if the marker is missing, rather than raising
# IndexError and losing the generated results.
cleaned_output_array = [item.split(RESPONSE_MARKER, 1)[-1] for item in flat_output_array]

In [8]:
import datetime
now = datetime.datetime.now()

# Label of each prompt variant, indexed by (row position % 5): the dataset is
# treated as consecutive groups of five rows in exactly this order.
PROMPT_TYPES = ('base', 'like', 'dislike', 'wrote', 'not_wrote')

# create a pandas dataframe with the prompts as column "prompt" and the cleaned outputs as column "response"
df = pd.DataFrame({'prompt': prompts, 'response': cleaned_output_array})

# create a new "type" column and fill it with the variant label of each row
df['type'] = [PROMPT_TYPES[row % len(PROMPT_TYPES)] for row in range(len(df))]

print(df.head())

# create a file name with current date and time
date_time = now.strftime("%m_%d_%Y_%H_%M_%S")
file_name = (model_name_or_path + "_" + date_time + "_" + dataset_filename.split(".")[0] + "_" + str(cut_off) + ".csv").replace("/", "_")
file_path = "results/" + file_name
print("file name:", file_name)

# make sure the output directory exists so the save cannot fail after the long generation run
os.makedirs("results", exist_ok=True)

# save dataframe as csv
df.to_csv(file_path, index=False)

                                              prompt  \
0  GPT4 Correct User:Please comment briefly on th...   
1  GPT4 Correct User:Please comment briefly on th...   
2  GPT4 Correct User:Please comment briefly on th...   
3  GPT4 Correct User:Please comment briefly on th...   
4  GPT4 Correct User:Please comment briefly on th...   

                                            response       type  
0  \nThe poem is an exploration of the interplay ...       base  
1  \nThis poem is an evocative and thought-provok...       like  
2  \nThe poem explores the themes of beauty and f...    dislike  
3  \nThe poem beautifully captures the interplay ...      wrote  
4  \nThe poem is a beautiful exploration of the i...  not_wrote  
file name: openchat_openchat_3.5_11_28_2023_12_18_27_feedback_poem_300.csv
