# Method-level code summarization


### Imports


In [1]:
import warnings
from pathlib import Path

import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    RobertaTokenizer,
    SummarizationPipeline,
    logging,
    set_seed,
)

# Keep cell outputs readable: suppress every Python warning and restrict the
# transformers logger to error-level messages only.
warnings.simplefilter("ignore")
logging.set_verbosity_error()

2024-04-26 11:13:22.875301: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-26 11:13:22.924219: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 11:13:22.924260: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 11:13:22.925586: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-26 11:13:22.934306: I tensorflow/core/platform/cpu_feature_guar

### Define constants

In [2]:
# Root of the data tree (a relative path, resolved against the current working directory).
DATA_DIR = Path("..", "..", "data")

# Input records live under "preprocessed"; model outputs are written under "predicted".
PREPROCESSED_DATA_DIR = DATA_DIR.joinpath("preprocessed")
PREDICTED_DATA_DIR = DATA_DIR.joinpath("predicted")

# Granularity tag used to build the input and output file names.
LEVEL = "method"

### Read data


In [3]:
# Load the JSON Lines file for the configured level (one record per line)
# and index the rows by their "class_id" column.
file_path = PREPROCESSED_DATA_DIR.joinpath(f"{LEVEL}-level-with-class.jsonl")
df = pd.read_json(file_path, lines=True)
df = df.set_index("class_id")
df

Unnamed: 0_level_0,class_code,method_code,method_summary
class_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,"def filter(self, request):\n request_ur...",Filter the incoming request based on certain r...
ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,"def is_start_with(self, request_uri):\n ...",Check if the request URI starts with certain p...
ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,"def get_jwt_user(self, request):\n toke...",Get the user information from the JWT token in...
ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,"def set_current_user_info_and_log(self, user):...",Set the current user information and log the a...
ClassEval_1_sum,"import math\n\n\nclass AreaCalculator:\n """"...",def calculate_circle_area(self):\n retu...,calculate the area of circle based on self.radius
...,...,...,...
ClassEval_98_sum,import xml.etree.ElementTree as ET\n\n\nclass ...,"def process_xml_data(self, file_name):\n ...",Modifies the data in XML elements and writes t...
ClassEval_98_sum,import xml.etree.ElementTree as ET\n\n\nclass ...,"def find_element(self, element_name):\n ...",Finds the XML elements with the specified name.
ClassEval_99_sum,import zipfile\n\n\nclass ZipFileProcessor:\n ...,def read_zip_file(self):\n try:\n ...,Get open file object
ClassEval_99_sum,import zipfile\n\n\nclass ZipFileProcessor:\n ...,"def extract_file(self, file_name, output_path)...",Extract the file with the specified name from ...


### Define the summarization pipeline


In [4]:
# Candidate Hugging Face checkpoints; the index used below selects the one to run.
model_names = [
    "SEBIS/code_trans_t5_large_source_code_summarization_python_multitask_finetune",
    "SEBIS/code_trans_t5_large_code_documentation_generation_python_multitask_finetune",
    "Salesforce/codet5-base-multi-sum",
    "Paul-B98/codet5p_220m_py_sum",
    "lintang/pile-t5-large-codexglue",
]
MODEL_NAME = model_names[4]

# Predictions for a checkpoint go into a folder named after the last path
# component of its identifier (the part after the final "/").
MODEL_DIR = PREDICTED_DATA_DIR.joinpath(MODEL_NAME.rsplit("/", 1)[-1])
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
# Build the summarization pipeline for the selected checkpoint:
# configuration -> seq2seq model -> tokenizer -> pipeline.
config = AutoConfig.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, config=config)

if MODEL_NAME == "Paul-B98/codet5p_220m_py_sum":
    # This checkpoint is paired with the tokenizer of the base CodeT5+ model
    # (presumably the fine-tuned repo ships no tokenizer files — TODO confirm).
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-220m")
else:
    # Only loading-time options belong here; padding and special-token
    # skipping are chosen when encoding/decoding, not when loading.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, config=config, legacy=False)

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Keyword arguments keep each object in its intended slot: the third
# positional parameter of a pipeline is the feature extractor, not the config
# (the model already carries its configuration).
pipeline = SummarizationPipeline(model=model, tokenizer=tokenizer, device=device)

config.json:   0%|          | 0.00/797 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

### Inference (applying summarization pipeline)


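Summaries are generated with [beam-search multinomial sampling](https://huggingface.co/docs/transformers/main/en/generation_strategies#beam-search-multinomial-sampling), i.e. sampling is enabled (`do_sample=True`) and combined with 5 beams (`num_beams=5`).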

In [50]:
%%time
tqdm.pandas()
df["pred_summary"] = df["method_code"].progress_apply(
    lambda x: pipeline(x, do_sample=True, num_beams=5)[0]["summary_text"]
)
df.head()

100%|██████████| 400/400 [02:38<00:00,  2.52it/s]

CPU times: user 2min 38s, sys: 164 ms, total: 2min 39s
Wall time: 2min 38s





Unnamed: 0_level_0,method_code,method_summary,pred_summary
class_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ClassEval_0_sum,"def filter(self, request):\n request_ur...",Filter the incoming request based on certain r...,1. Get the request path and method from the re...
ClassEval_0_sum,"def is_start_with(self, request_uri):\n ...",Check if the request URI starts with certain p...,Create a method is_start_with that will check ...
ClassEval_0_sum,"def get_jwt_user(self, request):\n toke...",Get the user information from the JWT token in...,1. Get the JWT user from the request.\n2. If t...
ClassEval_0_sum,"def set_current_user_info_and_log(self, user):...",Set the current user information and log the a...,Create a method set_current_user_info_and_log ...
ClassEval_1_sum,def calculate_circle_area(self):\n retu...,calculate the area of circle based on self.radius,"calculate_circle_area(self):\n """"""\n ..."


### Save predictions

In [52]:
# Turn the "class_id" index back into a regular column so it is written to the
# file. The guard makes the cell safe to re-run: resetting an already-reset
# frame would add a spurious "index" column to the saved records.
if df.index.name == "class_id":
    df = df.reset_index()

# One JSON object per line, written into the folder of the selected model.
df.to_json(MODEL_DIR / f"{LEVEL}-level-pred.jsonl", orient="records", lines=True)