In [1]:
import os
os.environ["HF_HOME"] = "/workspace/s3/hf"

In [3]:
from datasets import load_from_disk
dst = load_from_disk("../s3/hf/datasets/python_code_cleaned")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from sklearn.metrics import f1_score, accuracy_score
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM

def predict_with_model(model, tokenizer, dataset, batch_size: int, max_length: int = 2048, max_gen_tokens=100):
    tokenized = dataset.map(lambda x: tokenizer(x["text"], padding="max_length", 
                                                add_special_tokens=False, max_length=max_length,
                                                truncation=True), batched=True, num_proc=4)

    predictions = []
    for batch_start in tqdm(range(0, len(dataset), batch_size)):
        tokenized_batch = tokenized[batch_start:batch_start+batch_size]
        
        inputs = torch.tensor(tokenized_batch["input_ids"]).to("cuda")
        attention_mask = torch.tensor(tokenized_batch["attention_mask"]).to("cuda")
        generated_tokens = model.generate(inputs=inputs,
                                attention_mask=attention_mask,
                                max_new_tokens=max_gen_tokens)

        generated_tokens = generated_tokens.to("cpu")
        predicted_names = tokenizer.batch_decode(generated_tokens[:, max_length:], skip_special_tokens=True)
        predictions.extend(predicted_names)

        # Free cuda memory
        del generated_tokens
        del inputs
        del attention_mask
        import gc
        gc.collect()
        torch.cuda.empty_cache()

    return predictions

def calculate_accuracy(true_tokens, predicted_tokens, tokenizer):
    # Convert tokens to text
    true_texts = [tokenizer.decode(tokens) for tokens in true_tokens]
    predicted_texts = [tokenizer.decode(tokens) for tokens in predicted_tokens]

    # Calculate accuracy
    accuracy = accuracy_score(true_texts, predicted_texts)

    return accuracy

# Tokenize
def apply_template(x, tokenizer):
    return {"text": tokenizer.apply_chat_template(
        [{"role": "system", "content": "Generate a fitting name for the provided function. Place the suggested function name inside tripplet backtics. E.g ```adder```"},{"role": "user", "content": x["source_code"]}],
        tokenize=False)
    }

import re

def extract_function_name(text):
    match = re.search(r"```(.*?)```", text)
    if match:
        return match.group(1)
    return text


def evaluate_with_model(model, tokenizer, dataset, batch_size: int, max_length: int = 2048, max_gen_tokens=100, transform_predictions=lambda x: x):
    # model = None
    predictions = predict_with_model(model, tokenizer, dataset, batch_size, max_length, max_gen_tokens)
    predictions = transform_predictions(predictions)
    return {
        "accuracy": accuracy_score(predictions, dataset["function_name"]),
        "predictions": predictions
    }

In [5]:
# free memory
import gc
gc.collect()

torch.cuda.empty_cache()

In [6]:
model_name = "codellama/CodeLlama-7b-Instruct-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# Since we are able to bit the model in memory, we don't have to go for peft
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda")

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.83s/it]


In [7]:
len(dst["test"])

2000

In [8]:
dataset = dst["test"].map(apply_template, fn_kwargs={"tokenizer": tokenizer})
results = evaluate_with_model(model, tokenizer, dataset, 4, transform_predictions=lambda x: [extract_function_name(y) for y in x])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]
No chat template is defined for this tokenizer - using the default template for the CodeLlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

Map: 100%|██████████| 2000/2000 [00:00<00:00, 8368.59 examples/s]
Map (num_proc=4): 100%|██████████| 2000/2000 [00:00<00:00, 2133.56 examples/s]
  0%|          | 0/500 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/500 [00:03<25:25,  3.06s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/500 [00:05<22:43,  2.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 3/500 [00:08<22:47,  2.75s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 4/500 [00:11<22

In [10]:
results["accuracy"]
results["predictions"]

['youtube_id_extractor',
 'url_extractor',
 'hasher',
 'fc2video_downloader',
 'download_video',
 'xml_to_dict',
 'ucas_downloader',
 'ucas_downloader',
 'download_video',
 'sina_flv_downloader',
 'sina_video_downloader',
 'yixia_miaopai_downloader',
 'veoh_downloader',
 'veoh_downloader',
 'downloader',
 'get_room_id_from_url',
 'colorizer',
 'colorizer',
 'error',
 ' The provided function is a way to determine the operating system (OS) of the system it is running on. The function takes no arguments and returns a string representing the OS.\n\nA fitting name for this function could be `get_os()`. This name is descriptive and concise, and it clearly conveys the purpose of the function.\n\nHere is the function with the suggested name:\n```\ndef get_os():\n    syst = system().lower()\n',
 'weibo_downloader',
 'vimeo_downloader',
 'vimeo_downloader',
 'vimeo_downloader',
 'ckplayer_info',
 'decryptor',
 'mgtv_video_id',
 'm3u_downloader',
 'git_branch_commit_parser',
 'sanitize_filename',

In [12]:
tokenizer.default_chat_template

"{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must