In [1]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct", trust_remote_code=True)

train_df = pd.read_json('/home/ubuntu/filesystem/hatexplain/Data/output_Dataset_train.json')
train_df = train_df.transpose()

# Extract relevant columns for inputs (code diff) and labels (CVE description)
train_df['code_diff'] = train_df['diff_tokens'].apply(lambda x: ' '.join(x))  # Joining tokens for code diffs
train_df['cve_desc'] = train_df['cve_desc_tokens'].apply(lambda x: ' '.join(x))  # Joining tokens for CVE descriptions

# Convert the DataFrame to a Hugging Face Dataset
train_dataset_ = Dataset.from_pandas(train_df[['code_diff', 'cve_desc']])
print(train_dataset_)

  from .autonotebook import tqdm as notebook_tqdm
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Downloading shards: 100%|██████████| 2/2 [00:34<00:00, 17.43s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s]


Dataset({
    features: ['code_diff', 'cve_desc', '__index_level_0__'],
    num_rows: 40827
})


In [2]:
import re

instruction = "Based on the following code diff, generate the corresponding CVE description and commit message."

# Function to combine the instruction with the code diff
def combine_instruction_and_diff(instruction, code_diff):
    return f"{instruction}\nCode Diff:\n{code_diff}"

# Function to extract CVE identifier
def extract_cve(cve_string):
    match = re.match(r'(CVE-\d{4}-\d+)', cve_string)
    return match.group(1) if match else None

train_dataset = train_dataset_.map(lambda example: {
    'code_diff': combine_instruction_and_diff(instruction, example['code_diff']),
    'cve': extract_cve(example['__index_level_0__'])
})
train_dataset[1]


Map: 100%|██████████| 40827/40827 [00:01<00:00, 20597.47 examples/s]


{'code_diff': "Based on the following code diff, generate the corresponding CVE description and commit message.\nCode Diff:\n<s> diff  -- git  a / lib / rack / directory . rb  b / lib / rack / directory . rb \n index  b 08 f 59 49 .. d 68 f 36 a 9  100 644 \n ---  a / lib / rack / directory . rb \n +++  b / lib / rack / directory . rb \n @@  - 106 , 13  + 106 , 12  @@  table  {  width : 100 %% ;  } \n   \n          def  list _ directory ( path _ info ,  path ,  script _ name ) \n              files  =  [[ ' ../ ',  ' Parent  Directory ',  ' ',  ' ',  '' ]] \n -            glob  =  :: File . join ( path ,  ' * ') \n   \n              url _ head  =  ( script _ name . split (' / ')  +  path _ info . split (' / ') ). map  do  | part | \n                  Rack :: Ut ils . escape _ path  part \n              end \n   \n -            Dir [ gl ob ]. sort . each  do  | node | \n +            Dir . ent ries ( path ). re ject  {  | e |  e . start _ with ? (' .' )  } . sort . each  do  | node | \n

In [3]:

# Function to tokenize inputs (code diffs with instruction)
def tokenize_function(examples):
    inputs = tokenizer(examples['code_diff'], padding="max_length", truncation=True)
    outputs = tokenizer(examples['cve_desc'], padding="max_length", truncation=True)
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 40827/40827 [32:17<00:00, 21.08 examples/s]


In [1]:
tokenized_train_dataset[616]['input_ids']

NameError: name 'tokenized_train_dataset' is not defined

In [4]:
print(len(tokenized_train_dataset[0]['input_ids']))

131072


In [6]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_train_dataset 
)

trainer.train()

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: cve_desc, __index_level_0__, cve, code_diff. If cve_desc, __index_level_0__, cve, code_diff are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40827
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5104


Step,Training Loss
10,17.0493
20,16.5793
30,15.5537
40,13.8691
50,11.2748
60,8.3674
70,5.8679
80,3.9113
90,3.0542
100,2.2026


KeyboardInterrupt: 