In [7]:
import os
# Physical GPU index(es) this notebook is allowed to use. The environment
# variable is written here, above the torch import, so it is already in place
# when torch is loaded.
AVAILABLE_GPUS = '1'
os.environ.update({'CUDA_VISIBLE_DEVICES': AVAILABLE_GPUS})

import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, GPTNeoXForCausalLM

# Print the name of every CUDA device visible to this process, one per line.
visible_gpu_count = torch.cuda.device_count()
for gpu_index in range(visible_gpu_count):
    gpu_properties = torch.cuda.get_device_properties(gpu_index)
    print(gpu_properties.name)

NVIDIA RTX A6000


In [8]:
# model_name_or_path = "OpenAssistant/pythia-12b-sft-v8-rlhf-2k-steps" #23GB
# model_name_or_path = "OpenAssistant/pythia-12b-sft-v8-2.5k-steps" #23GB
# model_name_or_path = "berkeley-nest/Starling-LM-7B-alpha" #28GB
model_name_or_path = 'openchat/openchat_3.5'


# OpenAssistant checkpoints are loaded through the GPT-NeoX class with a
# left-padding tokenizer. Every other model goes through the generic Auto
# classes and reuses its EOS token as the padding token.
if "OpenAssistant" not in model_name_or_path:
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.float16,
        device_map="auto",
        cache_dir='/scratch/henrypapadatos',
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        cache_dir='/scratch/henrypapadatos',
    )
    tokenizer.pad_token_id = tokenizer.eos_token_id
else:
    model = GPTNeoXForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.float16,
        device_map='auto',
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        padding_side="left",
        legacy=False,
    )
print(model.device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


cuda:0


In [13]:
import pandas as pd
import json

# Only the first `cut_off` lines of the dataset file are considered, and the
# loader keeps 3 out of every 5 of them, so only cut_off/5 different poems
# will be used.
cut_off = 300
# Number of responses generated for each prompt.
N_samples = 32
# Sampling temperature passed to model.generate.
temperature = 0.8

def load_from_jsonl(file_name: str, cut_off: int) -> list[dict]:
    """Load a selection of records from a JSON Lines file.

    Only lines whose zero-based index is below ``cut_off`` are considered, and
    within every consecutive group of five lines only the first three are kept
    (the base, the like and the dislike prefix).

    Args:
        file_name: Path of the ``.jsonl`` file to read.
        cut_off: Number of leading lines to consider; reading stops there.

    Returns:
        The parsed JSON value of every kept line, in file order.

    Raises:
        ValueError: If a kept line is not valid JSON. The underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    data = []
    with open(file_name, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= cut_off:
                # Nothing past the cut-off is used, so stop reading here
                # instead of scanning the rest of the file.
                break
            if i % 5 >= 3:
                # Lines 4 and 5 of each group of five are not used.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Catch only parse failures (not KeyboardInterrupt etc.) and
                # keep the original error attached for debugging.
                raise ValueError(f"Error in line {i+1}\n{line} of {file_name}") from err
    return data

def get_prompt(d, model_name=None):
    """Wrap a dataset record's first message in the chat template of the model.

    Args:
        d: Dataset record; the text is read from ``d['prompt'][0]['content']``.
        model_name: Model identifier used to pick the template. Defaults to
            the notebook-level ``model_name_or_path``.

    Returns:
        The text wrapped in the OpenAssistant template, or in the
        "GPT4 Correct" template for Starling / openchat models. Both templates
        end with the start of the assistant's answer. For any other model the
        text is returned unchanged.
    """
    if model_name is None:
        model_name = model_name_or_path
    prompt = d['prompt'][0]['content']
    if "OpenAssistant" in model_name:
        prompt = '<|prompter|>' + prompt + '<|endoftext|>\n<|assistant|>Here are my thoughts on the poem:\n'
    # Each name needs its own membership test: `"Starling" or 'openchat' in x`
    # is always true because the non-empty literal "Starling" is truthy.
    elif "Starling" in model_name or 'openchat' in model_name:
        prompt = 'GPT4 Correct User:' + prompt + '<|end_of_turn|>\nGPT4 Correct Assistant: Here are my thoughts on the poem:\n'
    return prompt

# Location of the poem-feedback dataset, relative to the working directory.
dataset_filename: str = "feedback_poem.jsonl"
dataset_path = os.path.join("datasets", dataset_filename)

# Read the selected records and report how many were kept.
dataset = load_from_jsonl(dataset_path, cut_off)
print("Number of samples:", len(dataset))

# One templated prompt per record, in dataset order.
prompts = list(map(get_prompt, dataset))

Number of samples: 180


In [14]:
def remove_the_excess_oes(prompt_array, model_name=None):
    """Trim surplus end-of-sequence markers from decoded generations.

    For each text, everything from the third occurrence of the model's end
    marker onward is dropped, so at most two markers remain (presumably the
    one in the prompt template and the one closing the response).

    Args:
        prompt_array: Iterable of decoded strings.
        model_name: Model identifier used to pick the marker. Defaults to the
            notebook-level ``model_name_or_path``.

    Returns:
        A new list with the trimmed strings, in the same order. For a model
        that is neither OpenAssistant nor Starling / openchat the strings are
        returned unchanged.
    """
    if model_name is None:
        model_name = model_name_or_path

    if 'OpenAssistant' in model_name:
        marker = '<|endoftext|>'
    # Each name needs its own membership test: `'Starling' or 'openchat' in x`
    # is always true because the non-empty literal 'Starling' is truthy.
    elif 'Starling' in model_name or 'openchat' in model_name:
        marker = '<|end_of_turn|>'
    else:
        # Unknown model family: there is no marker to trim, so hand the texts
        # back as they are instead of silently discarding them.
        return list(prompt_array)

    array = []
    for el in prompt_array:
        # Cut at the last marker until only two are left.
        while el.count(marker) > 2:
            el = el.rsplit(marker, 1)[0]
        array.append(el)
    return array

In [15]:
batch_size = 8

# output_array[k] holds the list of decoded generations for prompts[k].
output_array = []
#iteration over prompts with tqdm
for prompt in tqdm(prompts, desc= 'Poems', position=0):
    # Sample N_samples responses for the same prompt, batch_size at a time.
    prompt_copy = [prompt]*N_samples
    temp_output_array = []
    for i in range(0, len(prompt_copy), batch_size):
        # Put the inputs on the device the model reports, not a hard-coded one.
        model_inputs = tokenizer(prompt_copy[i:i+batch_size], return_tensors="pt", padding=True).to(model.device)
        try:
            generated_ids = model.generate(**model_inputs, max_length=1200, pad_token_id=tokenizer.eos_token_id, eos_token_id= tokenizer.eos_token_id, do_sample=True, temperature=temperature)
        except RuntimeError as err:
            # CUDA failures (e.g. out of memory) are raised as RuntimeError.
            # Catch only those, so interrupts and programming errors still
            # propagate, and show the actual error instead of hiding it.
            # The remaining batches for this prompt are skipped, so this
            # prompt ends up with fewer than N_samples responses.
            print("Cuda error", repr(err))
            break
        output = tokenizer.batch_decode(generated_ids)
        # extend keeps the per-prompt list flat, so no second pass is needed.
        temp_output_array.extend(output)
    temp_output_array = remove_the_excess_oes(temp_output_array)

    output_array.append(temp_output_array)
    
    # for i in range(5):
    #     #print(prompts[i])
    #     print(output[i].split('Here are my thoughts on the poem:')[1])
    #     print("#############################################")

Poems: 100%|██████████| 180/180 [2:49:44<00:00, 56.58s/it]  


In [16]:
import datetime
now = datetime.datetime.now()

# One row per prompt; N_response holds the list of generations for that prompt.
df = pd.DataFrame({'prompt': prompts, 'N_response': output_array})

# Label rows by position, cycling base / like / dislike every three rows.
df['type'] = df.index.map(lambda x: 'base' if x%3 == 0 else 'like' if x%3 == 1 else 'dislike')

#create a file name with current date and time
date_time = now.strftime("%m_%d_%Y_%H_%M_%S")
# "/" in the model name is replaced so the result is a single file name.
file_name = ("Bo"+ str(N_samples) +"_" + model_name_or_path + "_" + date_time + "_" + dataset_filename.split(".")[0] + "_" + str(cut_off) + ".csv").replace("/", "_")
# Create the output directory if needed, so the save cannot fail on a missing
# folder after the long generation run.
os.makedirs("results", exist_ok=True)
file_path = "results/" + file_name
print("file name:", file_name)

#save dataframe as csv
df.to_csv(file_path, index=False)

df.head()

file name: Bo32_openchat_openchat_3.5_11_28_2023_15_18_45_feedback_poem_300.csv


Unnamed: 0,prompt,N_response,type
0,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,base
1,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,like
2,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,dislike
3,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,base
4,GPT4 Correct User:Please comment briefly on th...,[<s> GPT4 Correct User:Please comment briefly ...,like
