In [None]:
#!pip install transformers
!pip install torch
!pip install datasets
!pip install evaluate
!pip install hugginface_hub
!pip install accelerate --upgrade
!pip install bitsandbytes
!pip install -i https://pypi.org/simple/ bitsandbytes --upgrade
!pip install sentencepiece

In [2]:
import accelerate
import bitsandbytes
import datasets
import evaluate
import numpy as np
import torch
from google.colab import userdata
from transformers import AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer
# Hugging Face access token read from the Colab secret store; it is passed
# explicitly to both Hub downloads below instead of being fetched and left unused.
my_secret_key = userdata.get('HF_TOKEN')

tokenizer_13b = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-13b-hf",
    token=my_secret_key,
)

# Quantization settings are bundled in a BitsAndBytesConfig: passing
# `load_in_4bit` / `bnb_4bit_*` directly to from_pretrained is deprecated
# (transformers emits a warning asking for `quantization_config` instead).
quantization_config_13b = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_13b = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-hf",
    quantization_config=quantization_config_13b,
    device_map="auto",
    token=my_secret_key,
)

# Device handle used later when moving tokenized inputs next to the model.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [3]:
import gzip
import shutil

# Decompress 'Sts.gz' into the plain-text file 'sts.txt' (streamed copy, binary mode).
with gzip.open('Sts.gz', 'rb') as compressed, open('sts.txt', 'wb') as plain:
    shutil.copyfileobj(compressed, plain)

def read_sts_dataset(filename):
    """Read a text file and return its lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    # Decode explicitly as UTF-8 (the same encoding parse_file uses for this
    # file) rather than relying on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        return f.readlines()

# Raw lines of the decompressed file; the structured records used by the
# experiments come from parse_file in the following cell.
sts_data = read_sts_dataset('sts.txt')


In [4]:
def parse_file(file_path):
    """Parse a tab-separated sentence-pair file into a list of records.

    Each usable line must have at least seven tab-separated fields; field 4 is
    read as the similarity score and fields 5 and 6 as the two sentences
    (0-based indices). Lines with fewer fields are skipped.

    Args:
        file_path: Path of the UTF-8 encoded TSV file.

    Returns:
        A list of dicts with keys 'score' (float), 'sentence1' and 'sentence2'.

    Raises:
        ValueError: If field 4 of a usable line is not a valid float.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            # parts[6] is accessed below, so seven fields are required;
            # a six-field line would otherwise raise IndexError.
            if len(parts) >= 7:
                data.append({
                    'score': float(parts[4]),
                    'sentence1': parts[5],
                    'sentence2': parts[6]
                })

    return data

In [5]:
# Parse the decompressed file once, then keep two fixed-size prefixes:
# 50 pairs for the free-generation runs and 150 pairs for the log-prob evaluation.
dataset = parse_file("sts.txt")
first_50_pairs, first_150_pairs = dataset[:50], dataset[:150]

def generate_scores_for_dataset(model, tokenizer, dataset):
    """Run the zero-shot prompt on every sentence pair in `dataset`.

    Args:
        model: Model handed to generate_similarity_score_zero_shot.
        tokenizer: Tokenizer handed to generate_similarity_score_zero_shot.
        dataset: Iterable of dicts with 'sentence1' and 'sentence2' keys and
            an optional 'score' key.

    Returns:
        A list of dicts with keys 'sentence1', 'sentence2', 'actual_score'
        (None when the pair has no 'score') and 'predicted_score'.
    """
    def _score_pair(pair):
        # Query the model first, then assemble the output record.
        prediction = generate_similarity_score_zero_shot(
            model, pair['sentence1'], pair['sentence2'], tokenizer
        )
        return {
            'sentence1': pair['sentence1'],
            'sentence2': pair['sentence2'],
            'actual_score': pair.get('score'),
            'predicted_score': prediction,
        }

    return [_score_pair(pair) for pair in dataset]

def generate_similarity_score_zero_shot(model, sentence1, sentence2, tokenizer):
    """Ask the model for a 0-5 similarity rating of two sentences, zero-shot.

    Args:
        model: Model forwarded to invoke_model_zero_shot.
        sentence1: First sentence of the pair.
        sentence2: Second sentence of the pair.
        tokenizer: Tokenizer forwarded to invoke_model_zero_shot.

    Returns:
        Whatever invoke_model_zero_shot returns for the prompt (the decoded
        generation text); no numeric score is extracted here.
    """
    # Both sentences are wrapped in a matching pair of double quotes; the
    # closing quote after sentence2 must be present so the prompt is well formed.
    prompt = f"""Given the two sentences:\n- \"{sentence1}\"\n- \"{sentence2}\",
    choose the best score (0 not similar at all, 5 very similar): 0, 1, 2, 3, 4, 5.
    The answer is:"""
    return invoke_model_zero_shot(model, prompt, tokenizer)

def generate_similarity_score_few_shot_2_examples(model, sentence1, sentence2, tokenizer):
  """Ask the model for a 0-5 similarity rating using a prompt with two worked examples.

  Args:
      model: Model forwarded to invoke_model_few_shot_2_examples.
      sentence1: First sentence of the pair to rate.
      sentence2: Second sentence of the pair to rate.
      tokenizer: Tokenizer forwarded to invoke_model_few_shot_2_examples.

  Returns:
      Whatever invoke_model_few_shot_2_examples returns for the prompt (the
      decoded generation text); no numeric score is extracted here.
  """
  # The example labels ("Sentence 1", "Sentence 2", "Similarity score") are
  # spelled identically in every example so the model sees one consistent
  # pattern, and sentence2 is closed with its matching double quote.
  prompt = f"""
  Sentence 1: A cat is rubbing against baby's face. \n
  Sentence 2: A cat is rubbing against a baby. \n
  Similarity score: 3.800 \n
  Sentence 1: A woman is writing. \n
  Sentence 2: A woman is swimming. \n
  Similarity score: 0.500\n
  Given the two sentences:\n- \"{sentence1}\"\n- \"{sentence2}\",
  choose the best similarity score (0 not similar at all, 5 very similar):
  0, 1, 2, 3, 4, 5. The answer is:
  """
  return invoke_model_few_shot_2_examples(model, prompt, tokenizer)

def generate_similarity_score_few_shot_4_examples(model, sentence1, sentence2, tokenizer):
  """Ask the model for a 0-5 similarity rating using a prompt with four worked examples.

  Args:
      model: Model forwarded to invoke_model_few_shot_4_examples.
      sentence1: First sentence of the pair to rate.
      sentence2: Second sentence of the pair to rate.
      tokenizer: Tokenizer forwarded to invoke_model_few_shot_4_examples.

  Returns:
      Whatever invoke_model_few_shot_4_examples returns for the prompt (the
      decoded generation text); no numeric score is extracted here.
  """
  # The example labels ("Sentence 1", "Sentence 2", "Similarity score") are
  # spelled identically in every example so the model sees one consistent
  # pattern, and sentence2 is closed with its matching double quote.
  prompt = f"""
  Sentence 1: A cat is rubbing against baby's face. \n
  Sentence 2: A cat is rubbing against a baby. \n
  Similarity score: 3.800 \n
  Sentence 1: A woman is writing. \n
  Sentence 2: A woman is swimming. \n
  Similarity score: 0.500\n
  Sentence 1: A young girl is sitting on Santa's lap.\n
  Sentence 2: A little girl is sitting on Santa's lap.\n
  Similarity score: 4.800\n
  Sentence 1: A grey hound is active in a grassy field.\n
  Sentence 2: A bunch of bikes racing on a track.\n
  Similarity score: 0.000
  Given the two sentences:\n- \"{sentence1}\"\n- \"{sentence2}\",
  choose the best similarity score (0 not similar at all, 5 very similar):
  0, 1, 2, 3, 4, 5. The answer is:
  """
  return invoke_model_few_shot_4_examples(model, prompt, tokenizer)

def invoke_model_zero_shot(model, prompt, tokenizer):
    """Generate a continuation of `prompt` and return it as decoded text.

    Args:
        model: Object exposing `.device` and `.generate(...)`.
        prompt: Prompt text to encode and feed to the model.
        tokenizer: Object exposing `.encode(...)` and `.decode(...)`.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    generation_kwargs = dict(
        max_length=100,
        num_return_sequences=1,
        temperature=0.5,
        top_k=50,
        top_p=0.95,
        num_beams=1,
    )
    # Encode the prompt and place the token ids on the model's device.
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(prompt_ids, **generation_kwargs)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def invoke_model_few_shot_2_examples(model, prompt, tokenizer):
    """Generate a continuation of the two-example prompt and return decoded text.

    Args:
        model: Object exposing `.device` and `.generate(...)`.
        prompt: Prompt text to encode and feed to the model.
        tokenizer: Object exposing `.encode(...)` and `.decode(...)`.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Encode the prompt and place the token ids on the model's device.
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(prompt_ids, max_length=250, num_return_sequences=1)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def invoke_model_few_shot_4_examples(model, prompt, tokenizer):
    """Generate a continuation of the four-example prompt and return decoded text.

    Args:
        model: Object exposing `.device` and `.generate(...)`.
        prompt: Prompt text to encode and feed to the model.
        tokenizer: Object exposing `.encode(...)` and `.decode(...)`.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Encode the prompt and place the token ids on the model's device.
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(prompt_ids, max_length=500, num_return_sequences=1)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [7]:
import pandas as pd
#free generation

def _predicted_scores(frame, score_fn):
    """Call `score_fn` on every row's sentence pair with the 13b model and tokenizer."""
    return frame.apply(
        lambda row: score_fn(model_13b, row['sentence1'], row['sentence2'], tokenizer_13b),
        axis=1,
    )


# Zero-shot table. NOTE: the prediction step below is disabled, so the CSV
# written here contains only the input pairs and their reference scores.
df_zero_shot = pd.DataFrame(first_50_pairs)
#df_zero_shot['Predicted Score LLaMA-13b'] = df_zero_shot.apply(lambda x: generate_similarity_score_zero_shot(model_13b, x['sentence1'], x['sentence2'], tokenizer_13b), axis=1)

csv_file_path = "results_zero_shot.csv"
df_zero_shot.to_csv(csv_file_path, index=False)

# Few-shot run with two in-prompt examples.
df_few_shot_2_examples = pd.DataFrame(first_50_pairs)
df_few_shot_2_examples['Predicted Score LLaMA-13b'] = _predicted_scores(
    df_few_shot_2_examples, generate_similarity_score_few_shot_2_examples
)
df_few_shot_2_examples.to_csv("results_few_shot_2_examples.csv", index=False)

# Few-shot run with four in-prompt examples.
df_few_shot_4_examples = pd.DataFrame(first_50_pairs)
df_few_shot_4_examples['Predicted Score LLaMA-13b'] = _predicted_scores(
    df_few_shot_4_examples, generate_similarity_score_few_shot_4_examples
)
df_few_shot_4_examples.to_csv("results_few_shot_4_examples.csv", index=False)

In [8]:
#from RL hw2 ex3, adapted for the task
def get_log_prob_of_similarity_score(model, tokenizer, sentence1, sentence2, actual_score, device):
    """Mean log-probability the model assigns to the reference score after the prompt.

    The prompt and `str(actual_score)` are tokenized separately, concatenated,
    and run through the model once; the log-probabilities of the score tokens
    are read from the logits at the preceding positions and averaged.

    Args:
        model: Callable returning an object with a `.logits` tensor.
        tokenizer: Callable returning a mapping with an 'input_ids' tensor;
            must expose `eos_token_id`.
        sentence1: First sentence of the pair.
        sentence2: Second sentence of the pair.
        actual_score: Reference score; its `str()` form is the scored target.
        device: Device the token ids are moved to.

    Returns:
        A Python float: the mean log-probability over the score tokens.
    """
    prompt = f"""Given the two sentences:\n- "{sentence1}"\n- "{sentence2}"\n,
    choose the best score (0 not similar at all, 5 very similar):
    0, 1, 2, 3, 4, 5. The answer is:"""

    prompt_ids = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024)['input_ids'].to(device)
    # The first token of the score encoding is dropped before concatenation
    # (presumably the BOS token the tokenizer prepends -- confirm for other tokenizers).
    target_ids = tokenizer(str(actual_score), return_tensors='pt')['input_ids'].to(device)[:, 1:]
    full_ids = torch.cat((prompt_ids, target_ids), dim=-1)

    mask = (full_ids != tokenizer.eos_token_id).long()
    positions = mask.cumsum(-1) - 1

    with torch.no_grad():
        logits = model(input_ids=full_ids, attention_mask=mask, position_ids=positions).logits

    # The logit at position t scores the token at t + 1, hence the one-step shift.
    prompt_len = prompt_ids.shape[-1]
    score_logits = logits[:, prompt_len - 1:-1].squeeze()
    token_log_probs = torch.nn.functional.log_softmax(score_logits, dim=-1)
    picked = token_log_probs.gather(dim=-1, index=target_ids.squeeze().unsqueeze(-1)).squeeze()

    # Average over the score tokens.
    return picked.mean().item()

def evaluate_dataset(data, model, tokenizer, device):
    """Compute the reference-score log-probability for every pair in `data`.

    Args:
        data: Iterable of dicts with 'sentence1', 'sentence2' and 'score' keys.
        model: Model forwarded to get_log_prob_of_similarity_score.
        tokenizer: Tokenizer forwarded to get_log_prob_of_similarity_score.
        device: Device forwarded to get_log_prob_of_similarity_score.

    Returns:
        A list of dicts with keys 'sentence1', 'sentence2', 'actual_score'
        and 'log_prob_score', in the order of `data`.
    """
    def _evaluate_pair(pair):
        # Score the pair first, then assemble the output record.
        log_prob = get_log_prob_of_similarity_score(
            model=model,
            tokenizer=tokenizer,
            sentence1=pair['sentence1'],
            sentence2=pair['sentence2'],
            actual_score=pair['score'],
            device=device,
        )
        return {
            'sentence1': pair['sentence1'],
            'sentence2': pair['sentence2'],
            'actual_score': pair['score'],
            'log_prob_score': log_prob,
        }

    return [_evaluate_pair(pair) for pair in data]

# Score the 150-pair subset with the 13b model: one model forward pass per pair.
evaluated_results_13b = evaluate_dataset(first_150_pairs, model_13b, tokenizer_13b, device)


In [11]:
def results_to_csv(evaluated_results_13b, filepath):
    """Write a list of result records to a CSV file.

    Args:
        evaluated_results_13b: Records accepted by `pd.DataFrame` (for
            example a list of dicts); each record becomes one row.
        filepath: Destination path of the CSV file.

    Returns:
        None. The file is written without the DataFrame index column.
    """
    pd.DataFrame(evaluated_results_13b).to_csv(filepath, index=False)

# Persist the per-pair log-probabilities; the next cell reads this file back.
results_to_csv(evaluated_results_13b, 'log_probs_13b.csv')

In [12]:
# Reload the saved per-pair log-probabilities and summarise them.
df_log_probs = pd.read_csv('log_probs_13b.csv')

average_log_prob_13b_chat = df_log_probs['log_prob_score'].mean()
variance_log_prob_13b_chat = df_log_probs['log_prob_score'].var()

# One print call, one item per output line.
print(
    "Average Log Probability Scores:",
    f"LLaMA 13b: {average_log_prob_13b_chat}",
    "\nVariance of Log Probability Scores:",
    f"LLaMA 13b: {variance_log_prob_13b_chat}",
    sep="\n",
)

Average Log Probability Scores:
LLaMA 13b: -2.5402399269739786

Variance of Log Probability Scores:
LLaMA 13b: 0.0398037940230249
