In [None]:
!pip install transformers accelerate bitsandbytes
!pip install torch -U
!pip install transformers -U

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics import f1_score, mean_squared_error
from tqdm import tqdm

In [None]:
# Authenticate this session with the Hugging Face Hub. `login()` is called with
# no arguments, so no token is written into the notebook here.
# NOTE(review): the model loaded below lives under the `meta-llama/` namespace —
# presumably a gated repo that requires this login; confirm.
from huggingface_hub import login
login()

In [None]:
# Initialize the model and tokenizer
# BitsAndBytesConfig is imported locally so this cell does not depend on an
# edit to the notebook's import cell.
from transformers import BitsAndBytesConfig

# model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "meta-llama/Llama-2-13b-chat-hf"

# Never commit a real token. Leave this empty to rely on the credentials stored
# by `login()`; an empty placeholder is normalised to None so it is never
# forwarded to the Hub as if it were a credential.
access_token = ""
hf_token = access_token or None

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # let accelerate place the weights on the available device(s)
    # Explicit quantization config replaces the deprecated bare `load_in_4bit=True` kwarg.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    token=hf_token,  # `token` supersedes the deprecated `use_auth_token`
)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=hf_token)

In [None]:
# Load the test dataset
# Mount Google Drive at /content/drive; force_remount=True is passed so the
# mount is redone even if a previous cell already mounted it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# The inference loop in this notebook reads the 'modified' and 'position'
# columns of this CSV.
test_csv_path = '/content/drive/My Drive/test_dataset.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
# Run the model over every story in `test_df` and collect, per row:
#   responses - the text the model generated (prompt excluded)
#   y_pred    - the integer parsed from that text, or -1 if it contains no digits
#   y_true    - the ground-truth value from the 'position' column
import re  # local import: this cell runs before the later cell that imports `re`

y_true = []
y_pred = []
responses = []

for _, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    story = row['modified']
    position = row['position']
    prompt = f"There is one major continuity error, a lapse in the self-consistency of the narrative, in the story provided. Count the number of '<nl>' in the story that occurs before the continuity error. Return that number only. Do not return any other information in your response. Story: {story}"

    # Generate model output. Inputs go to wherever the model was placed instead
    # of a hard-coded 'cuda'.
    model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**model_inputs)
    # NOTE(review): generation uses the checkpoint's default generation settings
    # (length limit, sampling). Consider passing explicit `max_new_tokens` /
    # `do_sample=False` if reproducible evaluation is required — confirm.

    # For a causal LM, `generate` returns the prompt tokens followed by the
    # continuation. Decode only the continuation; otherwise every digit that
    # appears in the prompt/story leaks into the parsed prediction.
    prompt_length = model_inputs["input_ids"].shape[1]
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    print(response)
    responses.append(response)

    # Extract predicted position from the response: take the last whole integer
    # rather than gluing all digits together (which turned "3 or 4" into 34).
    numbers = re.findall(r'\d+', response)
    y_pred.append(int(numbers[-1]) if numbers else -1)  # -1 marks non-numeric responses

    y_true.append(position)

In [None]:
# Wrap each result list in a single-column DataFrame so it can be written as CSV.
df_response = pd.DataFrame(responses, columns=['response'])
df_preds = pd.DataFrame(y_pred, columns=['preds'])
df_true = pd.DataFrame(y_true, columns=['true'])

# Write the three CSVs to the current working directory, without the index column.
for frame, csv_name in (
    (df_response, "test_responses_13b.csv"),
    (df_preds, "test_preds_13b.csv"),
    (df_true, "test_true_13b.csv"),
):
    frame.to_csv(csv_name, index=False)

# Hand the files to the browser through Colab's download helper.
from google.colab import files
for csv_name in ("test_preds_13b.csv", "test_true_13b.csv", "test_responses_13b.csv"):
    files.download(csv_name)

Extract quantitative data from LLM output

In [None]:
# openpyxl is the engine passed to `pd.read_excel` in the next cell.
!pip install pandas openpyxl

In [None]:
import pandas as pd
import re

# Load the test dataset
# Mount Google Drive (forcing a remount) and read the Excel workbook with the
# openpyxl engine.
# NOTE(review): the visible extraction cell works on the in-memory
# `df_response` / `df_true` frames and never reads `df` — confirm whether this
# load is still needed or whether `df` was meant to be the extraction input.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
df = pd.read_excel("/content/drive/MyDrive/test_continuity_error.xlsx", engine='openpyxl')

In [None]:
import re
def extract_position(response):
    """Pull the predicted position out of a free-text model response.

    The response is stripped and split into sentences on a period followed by
    whitespace; only the final sentence is inspected, and the last run of
    digits found in it is returned as an integer.

    Args:
        response: Text produced by the model.

    Returns:
        The last integer in the final sentence, or 0 when that sentence has no
        digits or when `response` cannot be processed at all (e.g. it is not a
        string).
    """
    try:
        final_sentence = re.split(r'\.\s+', response.strip())[-1]
        numbers = re.findall(r'\d+', final_sentence)
        # An empty match list means the last sentence contained no number.
        return int(numbers[-1]) if numbers else 0
    except Exception:
        # Best-effort parsing: any failure is reported as 0.
        return 0

# Parse every raw response into an integer position and store it next to the
# ground truth in the same frame.
parsed_positions = df_response['response'].apply(extract_position)
df_response['llama_13b'] = parsed_positions
df_response['true'] = df_true["true"]

# Persist the combined frame (no index column) and hand it to the browser.
cleaned_csv_name = 'cleaned_responses_13b.csv'
df_response.to_csv(cleaned_csv_name, index=False)
files.download(cleaned_csv_name)

Run Evals

In [None]:
# Mount Google Drive (forcing a remount) and read the evaluation CSV into `df`.
# NOTE(review): the metrics cell that follows takes its inputs from the
# in-memory `df_true` / `df_response` frames, not from `df` — confirm which
# source is intended.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
df = pd.read_csv("/content/drive/MyDrive/test_llama_continuity_error.csv")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import roc_auc_score, confusion_matrix
import numpy as np

# Ground truth and parsed predictions, taken from the in-memory frames.
y_true = df_true["true"]
y_pred = df_response["llama_13b"]

# Classification view: each distinct position value is treated as a class.
# Precision/recall/F1 are support-weighted; zero_division=1 makes an undefined
# (0/0) per-class score count as 1.
# NOTE(review): zero_division=1 can raise the reported scores for classes that
# are never predicted — confirm this is the intended convention.
weighted_kwargs = dict(average='weighted', zero_division=1)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, **weighted_kwargs)
recall = recall_score(y_true, y_pred, **weighted_kwargs)
f1 = f1_score(y_true, y_pred, **weighted_kwargs)

# Regression view: positions are treated as numeric values.
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
r_squared = r2_score(y_true, y_pred)

# Print all metrics, one "label: value" line per metric under each heading.
classification_rows = [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
]
regression_rows = [
    ("MSE", mse),
    ("RMSE", rmse),
    ("MAE", mae),
    ("R-squared", r_squared),
]
print("\n".join(["Classification Metrics:"] + [f"{label}: {value}" for label, value in classification_rows]))
print("\n".join(["Regression Metrics:"] + [f"{label}: {value}" for label, value in regression_rows]))

In [None]:
# Release the Colab runtime now that the notebook has finished.
# NOTE(review): presumably any in-memory state and any files on the VM that
# were not downloaded or written to Drive are lost after this call — confirm
# all outputs are saved before running this cell.
from google.colab import runtime
runtime.unassign()