In [None]:
import os

# Must run before any CUDA-aware library is imported: expose only GPU 0 to the
# process and switch off the tokenizers library's internal parallelism.
os.environ.update(
    CUDA_VISIBLE_DEVICES="0",
    TOKENIZERS_PARALLELISM="false",
)

In [None]:
pip install -q -U accelerate==0.23.0 bitsandbytes==0.41.1 transformers==4.34.1

In [None]:
import hashlib
import math

import numpy as np
import pandas as pd
import torch
from scipy.stats import entropy
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Read the competition's train and test tables (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/h2oai-predict-the-llm/train.csv",
        "../input/h2oai-predict-the-llm/test.csv",
    )
)

In [None]:
# Checkpoint directory handed to `from_pretrained`, and the short tag used to
# prefix the feature column names and the exported CSV file names.
model_path, model_name = "/kaggle/input/llama-2/pytorch/7b-hf/1", "7b-hf"

# Seed for NumPy's global RNG; `temperature` is not referenced by any cell shown here.
random_seed, temperature = 25, 0.0

# Passed as `device_map` when loading the model: the root module ("") goes to device 0.
device_map = dict([("", 0)])

In [None]:
# Load the raw competition data in this cell so it can be re-run on its own.
train = pd.read_csv("../input/h2oai-predict-the-llm/train.csv")
test = pd.read_csv("../input/h2oai-predict-the-llm/test.csv")

# Turn the row index into an explicit "id" column.
train = train.reset_index().rename({"index": "id"}, axis=1)

# Replace missing cells with a single space so the text columns never hold NaN.
train.fillna(" ", inplace=True)
test.fillna(" ", inplace=True)

np.random.seed(random_seed)
salt = "kappa"


def _stable_group_id(question, salt=salt):
    """Map a question to a bucket in [0, 1_000_000) that is identical on every run.

    The built-in ``hash()`` is salted per interpreter process for ``str`` inputs,
    so ids derived from it change on every kernel restart. A SHA-256 digest of the
    salted text is deterministic across processes and machines.
    """
    digest = hashlib.sha256((question + salt).encode("utf-8", "surrogatepass")).digest()
    # 8 bytes of the digest are plenty of entropy for a 1e6-bucket modulus.
    return int.from_bytes(digest[:8], "big") % 1_000_000


# Rows sharing the same question text receive the same group id in both frames.
train["group_id"] = train.Question.apply(_stable_group_id)
test["group_id"] = test.Question.apply(_stable_group_id)

In [None]:
# 4-bit NF4 quantisation settings: no double quantisation, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Load the quantised causal LM onto the device(s) given by `device_map`,
# with the key/value cache disabled.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device_map,
    quantization_config=bnb_config,
    use_cache=False,
)

# Matching tokenizer; EOS doubles as the padding token and padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Preview the first five prepared training rows.
train.head(n=5)

In [None]:
def extract_features(prompt, answer, model):
    """Score ``answer`` under ``model`` and summarise its per-token log-probabilities.

    The answer is tokenised with the module-level ``tokenizer`` and run through the
    model once; for every token after the first, the log-probability the model
    assigned to it given the preceding tokens is collected.

    Args:
        prompt: Accepted for interface compatibility but currently NOT used; the
            answer is scored on its own, without the question as context.
        answer: Text whose token log-probabilities are measured.
        model: Causal language model; called as ``model(input_ids)`` and expected
            to return an object with a ``logits`` attribute.

    Returns:
        A list of 12 values, in this order: estimated_loss (mean negative
        log-probability over tokens that do not decode to ``'</s>'``),
        mean of the 25 lowest log-probs, mean of the 25 highest log-probs, max,
        min, range, mean, std, entropy of the (normalised) token probabilities,
        kurtosis, skewness, and perplexity. All values are NaN when the answer
        yields no scorable token; estimated_loss alone is NaN when every scored
        token decodes to ``'</s>'``.
    """
    # Tokenise once. Input and target are the same sequence, so no padding is needed.
    input_ids = tokenizer(answer, return_tensors="pt").input_ids

    # Inference only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        logits = model(input_ids).logits

    # log_softmax in float32 is numerically stable; taking log(softmax(...)) can hit
    # log(0) when a probability underflows.
    log_probs = logits.float().log_softmax(-1)

    # The logits at position i predict the token at position i + 1.
    target_ids = input_ids[0, 1:].to(log_probs.device)
    token_logprobs = log_probs[0, :-1].gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)
    logprobs = token_logprobs.cpu().tolist()

    # Nothing to score (e.g. the answer tokenises to a single token).
    if not logprobs:
        return [float("nan")] * 12  # one NaN per feature returned below

    # The loss estimate skips end-of-sequence tokens; the other statistics use all tokens.
    scored = [
        logprob
        for token_id, logprob in zip(target_ids.tolist(), logprobs)
        if tokenizer.decode(token_id) != '</s>'
    ]
    estimated_loss = -sum(scored) / len(scored) if scored else float("nan")

    values = np.asarray(logprobs, dtype=np.float64)
    ordered = np.sort(values)
    mean_lowest25 = ordered[:25].mean()
    mean_highest25 = ordered[-25:].mean()
    maxp = values.max()
    minp = values.min()
    rangep = maxp - minp
    meanp = values.mean()
    stdp = values.std()
    # scipy normalises the probabilities to sum to 1 before computing the entropy.
    entropyp = entropy(np.exp(values))
    if stdp != 0:
        centred = values - meanp
        kurtosisp = np.mean(centred ** 4) / stdp ** 4
        skewnessp = np.mean(centred ** 3) / stdp ** 3
    else:
        # Constant log-probs: the standardised moments are undefined, report 0.
        kurtosisp = 0
        skewnessp = 0
    perplexityp = np.exp(-meanp)

    return [
        estimated_loss,
        mean_lowest25,
        mean_highest25,
        maxp,
        minp,
        rangep,
        meanp,
        stdp,
        entropyp,
        kurtosisp,
        skewnessp,
        perplexityp,
    ]

In [None]:
def compute_features(df, model, model_name):
    """Build the log-probability feature table for every row of ``df``.

    Args:
        df: Frame with ``Question`` and ``Response`` columns.
        model: Model forwarded unchanged to ``extract_features``.
        model_name: Tag prepended (followed by ``_``) to every feature column name.

    Returns:
        A new DataFrame with one row per input row (in the same order) and one
        column per feature returned by ``extract_features``.
    """
    stat_names = (
        "estimated_loss",
        "mean_lowest25",
        "mean_highest25",
        "max",
        "min",
        "range",
        "mean",
        "std",
        "entropy",
        "kurtosis",
        "skewness",
        "perplexity",
    )
    column_names = ["{}_{}".format(model_name, stat) for stat in stat_names]

    # Rows are visited positionally, so the frame's own index labels do not matter.
    feature_rows = [
        extract_features(df.Question.iloc[pos], df.Response.iloc[pos], model)
        for pos in tqdm(range(len(df)))
    ]

    return pd.DataFrame(feature_rows, columns=column_names)


In [None]:
# Score every response with the loaded model: train split first, then test split.
logprob_train = compute_features(df=train, model=model, model_name=model_name)
logprob_test = compute_features(df=test, model=model, model_name=model_name)

In [None]:
# Preview the first five rows of the train feature table.
logprob_train.head(n=5)

In [None]:
# Preview the first five rows of the test feature table.
logprob_test.head(n=5)

In [None]:
# Write both feature tables using the same "<model_name>_logprob_<split>.csv" pattern.
# NOTE: the test file was previously written without the underscore after the model
# name ("<model_name>logprob_test.csv"); anything still reading the old file name
# has to be pointed at the new one.
logprob_train.to_csv(f"{model_name}_logprob_train.csv", index=False)
logprob_test.to_csv(f"{model_name}_logprob_test.csv", index=False)