# LLM Inference

In [1]:
import torch
import warnings
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import logging

# logging.set_verbosity_error()
# warnings.filterwarnings("ignore")

In [2]:
# Filesystem layout: every artefact lives under one shared data root.
DATA_DIR = Path("../../data")
VECTOR_STORES_DIR = DATA_DIR.joinpath("vector-stores")
PREPROCESSED_DATA_DIR = DATA_DIR.joinpath("preprocessed")
PREDICTED_DATA_DIR = DATA_DIR.joinpath("predicted")

# Context granularities supported by the prompts in this notebook.
levels = ["method", "class", "repo"]
# Dataset keys and the levels each one is evaluated at:
#   "mce"  -> method (+few shot), class
#   "mcsn" -> method (+few shot), repo (+few shot)
datasets = ["mce", "mcsn"]

# Experiment selection: change the indices to switch level / dataset.
LEVEL, DATASET = levels[2], datasets[1]

In [3]:
# The input file name depends only on DATASET: the method-level file is read
# whichever LEVEL is selected.
file_path = PREPROCESSED_DATA_DIR / f"method-level-{DATASET}.jsonl"
# JSON Lines input (one record per line).
df = pd.read_json(file_path, lines=True)
# Preview the first record.
df.head(1)

Unnamed: 0,repo_name,method_name,method_code,method_summary,original_method_code,method_path
0,apache/airflow,HttpHook.run,"def run(self, endpoint, data=None, headers=Non...",Performs the request,"def run(self, endpoint, data=None, headers=Non...",airflow/hooks/http_hook.py


In [4]:
# The few-shot pool is only loaded for the method and repo levels; the
# class-level run does not use few-shot examples.
if LEVEL != 'class':
    fs_file_path = PREPROCESSED_DATA_DIR / f"method-level-{DATASET}-few-shot.jsonl"
    fs_df = pd.read_json(fs_file_path, lines=True)
    # A bare expression nested inside an `if` block is not echoed by the
    # notebook, so the preview has to be displayed explicitly.
    display(fs_df.head(1))

In [6]:
# Candidate checkpoints; `idx` picks the one used for this run.
model_names = [
    "deepseek-ai/deepseek-coder-1.3b-instruct",
    "deepseek-ai/deepseek-coder-6.7b-instruct",
    "deepseek-ai/deepseek-coder-33b-instruct",
    "bigcode/starcoder2-15b-instruct-v0.1",
    "gradientai/Llama-3-8B-Instruct-Gradient-1048k",
]

idx = 3
MODEL_NAME = model_names[idx]

# Short family key derived from the list position:
# indices 0-2 -> "dsc", index 3 -> "sc", anything else -> "ll".
if idx in (0, 1, 2):
    MODEL_KEY = "dsc"
elif idx == 3:
    MODEL_KEY = "sc"
else:
    MODEL_KEY = "ll"

MODEL_MAX_LENGTH = 16384  # From model config
PROMPT_MAX_LENGTH = 3000  # From experiments with maximum possible prompt

# Predictions go to a per-model directory named after the last path segment
# of the checkpoint id; the directory is created if it does not exist yet.
MODEL_DIR = PREDICTED_DATA_DIR / MODEL_NAME.rsplit("/", 1)[-1]
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [7]:
def load_vector_store(repo_name):
    """Load the persisted FAISS index for ``repo_name``.

    The index directory is ``VECTOR_STORES_DIR`` joined with the repository
    name in which every "/" has been replaced by "_". The module-level
    ``EMBEDDINGS`` object is looked up at call time, so it must be defined
    before the first call.

    NOTE(review): ``allow_dangerous_deserialization=True`` opts in to
    pickle-based loading of the stored index; only use it on index files that
    were produced locally by a trusted process.
    """
    store_dir = VECTOR_STORES_DIR / repo_name.replace("/", "_")
    return FAISS.load_local(
        store_dir, EMBEDDINGS, allow_dangerous_deserialization=True
    )

# Repository-level runs need the embedding model plus one FAISS index per
# repository that occurs in the dataset.
if LEVEL == 'repo':
    EMBEDDINGS = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={'device': 'cuda:2'},
        encode_kwargs={'normalize_embeddings': True},
    )
    REPOS = df["repo_name"].unique().tolist()
    # Map each repository name to its loaded index.
    VECTOR_STORES = dict(zip(REPOS, map(load_vector_store, REPOS)))

In [8]:
# System prompt shared by every pipeline.
SYSTEM = (
    "You're a specialized AI assisting with Python code summaries, "
    "deeply knowledgeable in computer science.\n"
)

# Method level: the instruction is appended after the code.
METHOD_INSTRUCTION = "Concisely summarize the Python code provided in 1-3 sentences."

# Class level: one message carries the class context, the next one the task.
CLASS_CONTEXT_INSTRUCTION = "Consider the following class code as additional context for your response:\n"
CLASS_INSTRUCTION = "Concisely summarize the following Python function in 1-3 sentences:\n"

# Repository level: retrieved fragments first, then the task. The task text is
# assembled as PREFIX + <method path> + SUFFIX + <method code>.
REPO_CONTEXT_INSTRUCTION = (
    "You have the following repository context, which includes fragments of code "
    "with their corresponding paths and lines from the repository:\n\n"
)
REPO_INSTRUCTION_PREFIX = "Your task is to summarize the Python function located at "
REPO_INSTRUCTION_SUFFIX = " concisely in 1-3 sentences, based on the provided context:\n"

# Default number of fragments retrieved per method.
K = 50


def retrieve_repo_context(method_code, repo_name, original_method_code, k=K):
    """Build the repository-context string for one method.

    Runs a similarity search for ``method_code`` against the vector store
    registered under ``repo_name`` in ``VECTOR_STORES`` and formats each hit
    as a "File path / File content" section, joined by blank lines.

    Hits whose text is a substring of ``original_method_code`` are skipped, so
    fragments of the target method itself are not fed back as context.

    Args:
        method_code: Query text for the similarity search.
        repo_name: Key into ``VECTOR_STORES``.
        original_method_code: Source text used to filter out overlapping hits.
        k: Number of hits requested from the store.

    Returns:
        The formatted sections joined with "\\n\\n" (empty string if no hit
        survives the filter).
    """
    hits = VECTOR_STORES[repo_name].similarity_search(method_code, k=k)
    sections = []
    for doc in hits:
        if doc.page_content in original_method_code:
            continue
        sections.append(
            f"File path: {doc.metadata['file_path']}\nFile content:\n```\n{doc.page_content}\n```"
        )
    return "\n\n".join(sections)


def construct_few_shot_list(df, num=0, repo_level=False):
    """Turn ``num`` sampled rows of ``df`` into alternating chat messages.

    Rows are drawn with a fixed ``random_state`` of 42. Each sampled row
    yields a "user" message followed by an "assistant" message holding the
    row's ``method_summary``.

    Args:
        df: Frame with ``method_code`` and ``method_summary`` columns.
        num: Number of rows to sample.
        repo_level: When true the user message is ``CLASS_INSTRUCTION``, a
            newline and the code; otherwise it is the code, a newline and
            ``METHOD_INSTRUCTION``.

    Returns:
        A list of ``{"role", "content"}`` dicts, two per sampled row.
    """
    examples = df.sample(num, random_state=42)
    few_shot = []
    for code, summary in zip(examples["method_code"], examples["method_summary"]):
        if repo_level:
            prompt = f"{CLASS_INSTRUCTION}\n{code}"
        else:
            prompt = f"{code}\n{METHOD_INSTRUCTION}"
        few_shot.extend(
            [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": f"{summary}"},
            ]
        )
    return few_shot


# def get_repo_context(repo_context, tokenizer, percent=1.0):
#     # Calculate max_index according to model params and percent
#     max_length = MODEL_MAX_LENGTH - PROMPT_MAX_LENGTH
#     max_index = int(max_length * percent)
#     join_str = '"File path: '
#     # Tokenize the repo_context
#     ids = tokenizer.encode(repo_context)
#     # Truncate the tokens array according to percent of repo_context
#     ids = ids[:max_index]
#     # Split by join_str and trim last element because it can be truncated
#     new_repo_context = tokenizer.decode(ids).split(join_str)[:-1]
#     # Join back
#     new_repo_context = join_str.join(new_repo_context)
#     return new_repo_context

In [9]:
# Weights and tokenizer are both loaded from the checkpoint named by MODEL_NAME.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)  # 1
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [10]:
# Test for documents length in tokens
# import numpy as np
# k = 50
# for idx in range(10):
#     lengths = []
#     tokens = []
#     docs = VECTOR_STORES[df['repo_name'].iloc[idx]].similarity_search(df['method_code'].iloc[idx], k=k)
#     for doc in docs:
#         tok_doc = tokenizer.encode(doc.page_content)
#         lengths.append(len(doc.page_content))
#         tokens.append(len(tok_doc))
#     print(np.mean(lengths), np.mean(tokens), np.mean(tokens) * k)

In [11]:
# Quick resource report for the loaded checkpoint; every value is divided by 1e9.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
# NOTE(review): max_memory_allocated() is called without a device argument, so
# it presumably covers a single CUDA device only — with device_map="auto" this
# can be far below the model footprint. Confirm before comparing the numbers.
print(f"Memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
print(f"Number of parameters: {model.num_parameters() / 1e9:.2f} B")

Model memory footprint: 32.25 GB
Memory allocated: 7.59 GB
Number of parameters: 15.96 B


In [12]:
# Settings shared by every model family. Decoding is greedy
# (`do_sample=False`), so the sampling-only knobs (`temperature`, `top_k`,
# `top_p`) are intentionally not set: they are not used in greedy decoding and
# only trigger generation-config warnings.
_GREEDY_DECODING = {
    "max_new_tokens": 128,
    "do_sample": False,
    "num_return_sequences": 1,
}

# Per-family `generate()` keyword arguments, keyed by MODEL_KEY. The families
# differ only in their stop tokens (and the pad token for "sc").
GENERATION_CONFIGS = {
    "dsc": {
        **_GREEDY_DECODING,
        "eos_token_id": tokenizer.eos_token_id,
    },
    "sc": {
        **_GREEDY_DECODING,
        # Also stop on the "###" token in addition to the tokenizer's EOS.
        "eos_token_id": [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("###"),
        ],
        "pad_token_id": tokenizer.eos_token_id,
    },
    "ll": {
        **_GREEDY_DECODING,
        # Also stop on the "<|eot_id|>" token in addition to the tokenizer's EOS.
        "eos_token_id": [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ],
    },
}

In [16]:
def _system_message(model_key):
    """Return the leading prompt message carrying ``SYSTEM``.

    The "sc" family receives it under the "user" role, every other family
    under the "system" role.
    """
    return {
        "role": "system" if model_key != "sc" else "user",
        "content": SYSTEM,
    }


def _generate_summary(messages, model, tokenizer, model_key, flash_only=False):
    """Render ``messages`` with the chat template, generate, and clean the output.

    Args:
        messages: Chat messages (``{"role", "content"}`` dicts) in prompt order.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``decode``.
        model_key: Key into ``GENERATION_CONFIGS``.
        flash_only: When true, generation runs inside the
            ``torch.backends.cuda.sdp_kernel`` context with only the flash
            kernel enabled.

    Returns:
        The decoded completion, cut at the first "###" / "</s>" and stripped.
    """
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    generation_kwargs = GENERATION_CONFIGS[model_key]
    if flash_only:
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=False, enable_mem_efficient=False
        ):
            outputs = model.generate(inputs, **generation_kwargs)
    else:
        outputs = model.generate(inputs, **generation_kwargs)
    # Drop the first len(prompt) tokens so only the continuation is decoded.
    completion = tokenizer.decode(
        outputs[0][len(inputs[0]) :], skip_special_tokens=True
    )
    # Cut at textual stop markers that may remain in the decoded text.
    return completion.split("###")[0].split("</s>")[0].strip()


def method_level_pipeline(method_code, model, tokenizer, model_key, few_shots=None):
    """Summarize ``method_code`` using only the method itself as context.

    Args:
        method_code: Source code of the method to summarize.
        model, tokenizer: Loaded model and its tokenizer.
        model_key: Model family key ("dsc", "sc" or "ll").
        few_shots: Optional list of example chat messages inserted between the
            system message and the task; ``None`` means no examples.

    Returns:
        The generated summary text.
    """
    # `None` instead of a mutable `[]` default: the default list would be
    # shared between calls.
    few_shots = list(few_shots) if few_shots else []
    main_message = {"role": "user", "content": f"{method_code}\n{METHOD_INSTRUCTION}"}
    messages = [_system_message(model_key)] + few_shots + [main_message]
    return _generate_summary(messages, model, tokenizer, model_key)


def class_level_pipeline(method_code, class_context, model, tokenizer, model_key):
    """Summarize ``method_code`` with ``class_context`` supplied as extra context.

    Args:
        method_code: Source code of the method to summarize.
        class_context: Text placed after ``CLASS_CONTEXT_INSTRUCTION``.
        model, tokenizer: Loaded model and its tokenizer.
        model_key: Model family key ("dsc", "sc" or "ll").

    Returns:
        The generated summary text.
    """
    class_message = {
        "role": "user",
        "content": f"{CLASS_CONTEXT_INSTRUCTION}{class_context}",
    }
    main_message = {"role": "user", "content": f"{CLASS_INSTRUCTION}{method_code}"}
    messages = [_system_message(model_key), class_message, main_message]
    return _generate_summary(messages, model, tokenizer, model_key)


def repo_level_pipeline(row, model, tokenizer, model_key, few_shots=None, k=K):
    """Summarize one method with retrieved repository fragments as context.

    Args:
        row: Four values in the order ``method_code``, ``method_path``,
            ``repo_name``, ``original_method_code``.
        model, tokenizer: Loaded model and its tokenizer.
        model_key: Model family key ("dsc", "sc" or "ll").
        few_shots: Optional list of example chat messages placed after the
            repository context; ``None`` means no examples.
        k: Number of fragments to retrieve; ``k <= 0`` omits the repository
            context message altogether.

    Returns:
        The generated summary text.
    """
    method_code, method_path, repo_name, original_method_code = row
    # `None` instead of a mutable `[]` default: the default list would be
    # shared between calls.
    few_shots = list(few_shots) if few_shots else []
    if k > 0:
        repo_context = retrieve_repo_context(
            method_code, repo_name, original_method_code, k=k
        )
        repo_message = [
            {
                "role": "user",
                "content": f"{REPO_CONTEXT_INSTRUCTION}{repo_context}",
            }
        ]
    else:
        repo_message = []
    main_message = {
        "role": "user",
        "content": f"{REPO_INSTRUCTION_PREFIX}{method_path}{REPO_INSTRUCTION_SUFFIX}{method_code}",
    }
    messages = [_system_message(model_key)] + repo_message + few_shots + [main_message]
    # Repository prompts are the long ones; generation is pinned to the flash kernel.
    return _generate_summary(messages, model, tokenizer, model_key, flash_only=True)

In [14]:
# Only the repository level currently uses few-shot examples (10 of them);
# the method and class levels run zero-shot.
NUM_FEW_SHOTS = 10 if LEVEL == "repo" else 0

if LEVEL == "class":
    # No few-shot pool is loaded for the class level.
    few_shots = []
else:
    # Every non-method level uses the repo-style example prompt.
    few_shots = construct_few_shot_list(
        fs_df, num=NUM_FEW_SHOTS, repo_level=LEVEL != "method"
    )

In [None]:
%%time
# Register `progress_apply` on pandas objects.
tqdm.pandas()
if LEVEL == 'method':
    df["pred_summary"] = df["method_code"].progress_apply(
        lambda code: method_level_pipeline(code, model, tokenizer, MODEL_KEY, few_shots)
    )
elif LEVEL == 'class':
    # Two predictions per row: one with the `class_code` column as context,
    # one with the `skeleton` column.
    df["class_pred_summary"] = df.progress_apply(
        lambda x: class_level_pipeline(x.get("method_code"), x.get("class_code"), model, tokenizer, MODEL_KEY), axis=1
    )
    df["skeleton_pred_summary"] = df.progress_apply(
        lambda x: class_level_pipeline(x.get("method_code"), x.get("skeleton"), model, tokenizer, MODEL_KEY), axis=1
    )
elif LEVEL == 'repo':
    # The column order must match the unpacking order inside repo_level_pipeline.
    # The retrieval depth comes from the shared constant K instead of a
    # repeated literal.
    df["pred_summary"] = df[["method_code", "method_path", "repo_name", "original_method_code"]].progress_apply(
        lambda x: repo_level_pipeline(x, model, tokenizer, MODEL_KEY, few_shots=few_shots, k=K), axis=1
    )
df

In [70]:
# Runs that can use few-shot examples record the example count in the file
# name; class-level runs do not.
if LEVEL != 'class':
    output_name = f"{LEVEL}-level-{DATASET}-few-shot-{NUM_FEW_SHOTS}-pred.jsonl"
else:
    output_name = f"{LEVEL}-level-{DATASET}-pred.jsonl"

# reset_index() is applied to a temporary frame instead of rebinding `df`:
# the written records still contain the index column, but re-running this cell
# no longer adds one more index column to `df` each time.
df.reset_index().to_json(
    MODEL_DIR / output_name,
    orient="records",
    lines=True,
)

In [16]:
# NOTE(review): `tdf` is only assigned in a later cell, so on a fresh
# top-to-bottom run this line raises NameError.
tdf

Unnamed: 0,repo_name,method_name,method_code,method_summary,original_method_code,method_path
529,Azure/azure-sdk-for-python,ServiceManagementService.add_disk,"def add_disk(self, has_operating_system, label...",Adds a disk to the user image repository. The ...,"def add_disk(self, has_operating_system, label...",azure-servicemanagement-legacy/azure/servicema...
132,apache/airflow,HdfsSensorRegex.poke,"def poke(self, context):\n sb = self.ho...",poke matching files in a directory with self.r...,"def poke(self, context):\n """"""\n ...",airflow/contrib/sensors/hdfs_sensor.py
512,Azure/azure-sdk-for-python,ServiceManagementService.delete_dns_server,"def delete_dns_server(self, service_name, depl...",Deletes a DNS server from a deployment.,"def delete_dns_server(self, service_name, depl...",azure-servicemanagement-legacy/azure/servicema...
641,Azure/azure-sdk-for-python,_convert_etree_element_to_queue,def _convert_etree_element_to_queue(entry_elem...,Converts entry element to queue object. The fo...,def _convert_etree_element_to_queue(entry_elem...,azure-servicebus/azure/servicebus/control_clie...
303,apache/airflow,AWSAthenaHook.get_conn,def get_conn(self):\n if not self.conn:...,check if aws conn exists already or create one...,"def get_conn(self):\n """"""\n chec...",airflow/contrib/hooks/aws_athena_hook.py


In [None]:
# Debug run: predict a single row and print the reference summary next to the
# generated one.
tqdm.pandas()
# tdf = df.sample(1)
# Take an explicit copy: assigning a new column to a slice of `df` triggers
# SettingWithCopyWarning and may or may not write through to `df`.
tdf = df.iloc[341:342].copy()
display(tdf)
tdf["pred_summary"] = tdf[["method_code", "method_path", "repo_name", "original_method_code"]].progress_apply(
    lambda x: repo_level_pipeline(
        x,
        model,
        tokenizer,
        MODEL_KEY,
        few_shots,
        k=K,  # shared retrieval depth instead of a repeated literal
    ),
    axis=1,
)
for row in tdf.itertuples():
    print("=" * 20)
    print(row.method_summary)
    print("=" * 20)
    print(row.pred_summary)
    print("=" * 20)
    print()

In [82]:
# Scratch cell: builds the repo-level prompt for one row with an EMPTY
# repository context and prints its length in tokens. The generation part is
# commented out.
# NOTE(review): `idx` is also the name used for the model selection index
# earlier in the notebook; this assignment rebinds it.
idx = 7
method_code = df["method_code"].iloc[idx]
method_summary = df["method_summary"].iloc[idx]
# class_context = df["class_code"].iloc[idx]
method_path = df["method_path"].iloc[idx]
# No retrieved fragments: only the fixed prompt parts and few-shot examples are measured.
repo_context = ''

system_message = {"role": "system" if MODEL_KEY != "sc" else "user", "content": SYSTEM}
# main_message = {"role": "user", "content": f"{method_code}\n{METHOD_INSTRUCTION}"}

# class_message = {"role": "user", "content": f"{CLASS_CONTEXT_INSTRUCTION}{class_context}"}
# main_message = {"role": "user", "content": f"{CLASS_INSTRUCTION}{method_code}"}
repo_message = {
    "role": "user",
    "content": f"{REPO_CONTEXT_INSTRUCTION}{repo_context}",
}
main_message = {
    "role": "user",
    "content": f"{REPO_INSTRUCTION_PREFIX}{method_path}{REPO_INSTRUCTION_SUFFIX}{method_code}",
}
messages = [system_message, repo_message] + few_shots + [main_message]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"#, tokenize=False
).to(model.device)
# Number of prompt tokens for this row.
print(len(inputs[0]))
# import time
# start_time = time.time()
# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
#     outputs = model.generate(
#         inputs,
#         **GENERATION_CONFIGS[MODEL_KEY],
#     )
# print(f"Generated in {time.time() - start_time} seconds.")
# print(tokenizer.decode(inputs[0]))
# print(tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True).split("###")[0].split("</s>")[0].strip())
# print("GOLD Response:")
# print(method_summary)

2717
