In [1]:
# !mkdir -p /scratch/sagarsj42/torch-cache
# !mkdir -p /scratch/sagarsj42/transformers

import os
os.chdir('scratch/')
os.environ['TORCH_HOME'] = 'scratch/torch-cache'
os.environ['TRANSFORMERS_CACHE'] = 'scratch/transformers'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [2]:
!pwd

/home/hitesh/Documents/IIIT-H/IRE/major-project/sample/sagar/scratch


In [3]:
from functools import partial

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, load_dataset, load_metric

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

In [4]:
TRAIN_DATA_FILE = 'irse.train.csv'
DEV_DATA_FILE = 'irse.dev.csv'
TEST_DATA_FILE = 'irse.test.csv'
GIVEN_TEST_FILE = 'irse.given-test.csv'
MODEL_KEY = 'microsoft/deberta-base'
EXP_NAME = 'irse-deberta'

TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 16
ACCUMULATE_GRAD_STEPS = 2
N_EPOCHS = 5
LEARNING_RATE = 6e-5
SCHEDULER_TYPE = 'cosine'
LR_WARMUP_RATIO = 0.4
LOG_STEPS = 50
SEED = 43419

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_KEY)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_KEY)

tokenizer, model

Downloading:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'classifi

(PreTrainedTokenizerFast(name_or_path='microsoft/deberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)}),
 DebertaForSequenceClassification(
   (deberta): DebertaModel(
     (embeddings): DebertaEmbeddings(
       (word_embeddings): Embedd

In [6]:
tokenizer.model_max_length = 512

tokenizer.sep_token, tokenizer.model_max_length

('[SEP]', 512)

In [7]:
data_files = {
    'train': TRAIN_DATA_FILE,
    'dev': DEV_DATA_FILE,
    'test': TEST_DATA_FILE,
    'giventest': GIVEN_TEST_FILE
}
ds = load_dataset('csv', data_files=data_files)

ds

Using custom data configuration default-beab39c7f7b93254


Downloading and preparing dataset csv/default to /home/hitesh/.cache/huggingface/datasets/csv/default-beab39c7f7b93254/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/hitesh/.cache/huggingface/datasets/csv/default-beab39c7f7b93254/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/4 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 5354
    })
    dev: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 595
    })
    test: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 678
    })
    giventest: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 1001
    })
})

In [8]:
def tokenizer_func(batch, tokenizer):
    text = [batch['Comments'][i] + tokenizer.sep_token + batch['Surrounding Code Context'][i] 
        for i in range(len(batch['Comments']))]
    tok = tokenizer(text, max_length=tokenizer.model_max_length, truncation=True, 
        padding=True, return_attention_mask=True)

    return tok

In [9]:
tokenizer_partial = partial(tokenizer_func, tokenizer=tokenizer)
ds_tok = dict()

ds_tok['train'] = ds['train'].map(tokenizer_partial, batched=True, batch_size=TRAIN_BATCH_SIZE, 
    load_from_cache_file=False)
ds_tok['dev'] = ds['dev'].map(tokenizer_partial, batched=True, batch_size=EVAL_BATCH_SIZE, 
    load_from_cache_file=False)
ds_tok['test'] = ds['test'].map(tokenizer_partial, batched=True, batch_size=EVAL_BATCH_SIZE, 
    load_from_cache_file=False)
ds_tok['giventest'] = ds['giventest'].map(tokenizer_partial, batched=True, batch_size=EVAL_BATCH_SIZE, 
    load_from_cache_file=False)
ds_tok = DatasetDict(ds_tok)

ds_tok

  0%|          | 0/1339 [00:00<?, ?ba/s]

  0%|          | 0/38 [00:00<?, ?ba/s]

  0%|          | 0/43 [00:00<?, ?ba/s]

  0%|          | 0/63 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5354
    })
    dev: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 595
    })
    test: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 678
    })
    giventest: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1001
    })
})

In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scores = logits[:, -1]
    
    acc_metric = load_metric('accuracy')
    f1_metric = load_metric('f1')
    mcc_metric = load_metric('matthews_correlation')
    roc_metric = load_metric('roc_auc')

    metrics_dict = {
        'accuracy': acc_metric.compute(predictions=predictions, references=labels)['accuracy'],
        'f1_score': f1_metric.compute(predictions=predictions, references=labels)['f1'],
        'matthews_cc': mcc_metric.compute(predictions=predictions, references=labels)['matthews_correlation'],
        'roc_auc_score': roc_metric.compute(prediction_scores=scores, references=labels)['roc_auc'],
    }

    return metrics_dict

In [11]:
training_args = TrainingArguments(
    output_dir=EXP_NAME,
    run_name=EXP_NAME,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=ACCUMULATE_GRAD_STEPS,
    num_train_epochs=N_EPOCHS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=SCHEDULER_TYPE,
    warmup_ratio=LR_WARMUP_RATIO,
    evaluation_strategy='epoch',
    logging_steps=LOG_STEPS,
    save_strategy='epoch',
    metric_for_best_model='f1_score',
    greater_is_better=True,
    load_best_model_at_end=True,
    no_cuda=False,
    seed=SEED,
    fp16=False,
    dataloader_drop_last=False
)

training_args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_na

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='microsoft/deberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_tok['train'],
    eval_dataset=ds_tok['dev'],
    compute_metrics=compute_metrics
)

trainer

<transformers.trainer.Trainer at 0x7f2c33844610>

In [14]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: Surrounding Code Context, Class, Comments. If Surrounding Code Context, Class, Comments are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5354
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 3345
  Number of trainable parameters = 139193858


  0%|          | 0/3345 [00:00<?, ?it/s]

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


: 

: 

In [14]:
preds, labels, metrics = trainer.predict(ds_tok['test'])

preds.shape, labels.shape, metrics

The following columns in the test set  don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: Class, Surrounding Code Context, Comments. If Class, Surrounding Code Context, Comments are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 678
  Batch size = 16


((678, 2),
 (678,),
 {'test_loss': 0.029858974739909172,
  'test_accuracy': 0.9941002949852508,
  'test_f1_score': 0.9926470588235294,
  'test_matthews_cc': 0.987739677290841,
  'test_roc_auc_score': 0.9999728666395333,
  'test_runtime': 16.4625,
  'test_samples_per_second': 41.185,
  'test_steps_per_second': 2.612})

In [15]:
gpreds, glabels, gmetrics = trainer.predict(ds_tok['giventest'])

gpreds.shape, glabels.shape, gmetrics

The following columns in the test set  don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: Class, Surrounding Code Context, Comments. If Class, Surrounding Code Context, Comments are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1001
  Batch size = 16


((1001, 2),
 (1001,),
 {'test_loss': 0.020255913957953453,
  'test_accuracy': 0.996003996003996,
  'test_f1_score': 0.99288256227758,
  'test_matthews_cc': 0.9901169518111971,
  'test_roc_auc_score': 0.999985204036339,
  'test_runtime': 21.8853,
  'test_samples_per_second': 45.739,
  'test_steps_per_second': 2.879})

In [16]:
classes = ['Not Useful', 'Useful']
output_df = pd.read_csv(GIVEN_TEST_FILE)
output_df.drop('label', axis=1, inplace=True)
output_df['Predicted Class'] = list(map(lambda v: classes[v], gpreds.argmax(axis=1)))

print(output_df.info())

output_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Comments                  1001 non-null   object
 1   Surrounding Code Context  1001 non-null   object
 2   Class                     1001 non-null   object
 3   Predicted Class           1001 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB
None


Unnamed: 0,Comments,Surrounding Code Context,Class,Predicted Class
0,/*READ_INT_FUNCTIONS*/,-5. if (png_ptr != NULL)\n-4. png_war...,Not Useful,Not Useful
1,/*Put the chunk name into png_ptr->chunk_name.*/,"-2. png_read_data(png_ptr, buf, 8);\n-1. ...",Not Useful,Not Useful
2,/*critical*/,"-8. png_crc_read(png_structrp png_ptr, png_byt...",Not Useful,Not Useful
3,/*READ_iCCP|iTXt|pCAL|sCAL|sPLT|tEXt|zTXt|SEQU...,-9. {\n-8. if (warn != 0)\n-7. ...,Not Useful,Not Useful
4,/*ZLIB_VERNUM >= 0x1240*/,#if ZLIB_VERNUM >= 0x1240\n\n/*ZLIB_VERNUM >= ...,Not Useful,Not Useful


In [17]:
output_df[output_df['Class'] == output_df['Predicted Class']]

Unnamed: 0,Comments,Surrounding Code Context,Class,Predicted Class
0,/*READ_INT_FUNCTIONS*/,-5. if (png_ptr != NULL)\n-4. png_war...,Not Useful,Not Useful
1,/*Put the chunk name into png_ptr->chunk_name.*/,"-2. png_read_data(png_ptr, buf, 8);\n-1. ...",Not Useful,Not Useful
2,/*critical*/,"-8. png_crc_read(png_structrp png_ptr, png_byt...",Not Useful,Not Useful
3,/*READ_iCCP|iTXt|pCAL|sCAL|sPLT|tEXt|zTXt|SEQU...,-9. {\n-8. if (warn != 0)\n-7. ...,Not Useful,Not Useful
4,/*ZLIB_VERNUM >= 0x1240*/,#if ZLIB_VERNUM >= 0x1240\n\n/*ZLIB_VERNUM >= ...,Not Useful,Not Useful
...,...,...,...,...
996,/*The following document where the background ...,-5. #define PNG_CMAP_NONE 0\n\n /*The fol...,Useful,Useful
997,/*Do all the *safe* initialization - 'safe' me...,-1. } png_image_read_control;\n/* Do all the *...,Useful,Useful
998,/*And set the rest of the structure to NULL to...,-10. * called from here must *not* call png_m...,Useful,Useful
999,"/*Use png_ptr here, not info_ptr, because by e...",-8. static png_uint_32\n-7. png_image_format(p...,Useful,Useful


In [18]:
output_df.drop_duplicates(inplace=True)

output_df

Unnamed: 0,Comments,Surrounding Code Context,Class,Predicted Class
0,/*READ_INT_FUNCTIONS*/,-5. if (png_ptr != NULL)\n-4. png_war...,Not Useful,Not Useful
1,/*Put the chunk name into png_ptr->chunk_name.*/,"-2. png_read_data(png_ptr, buf, 8);\n-1. ...",Not Useful,Not Useful
2,/*critical*/,"-8. png_crc_read(png_structrp png_ptr, png_byt...",Not Useful,Not Useful
3,/*READ_iCCP|iTXt|pCAL|sCAL|sPLT|tEXt|zTXt|SEQU...,-9. {\n-8. if (warn != 0)\n-7. ...,Not Useful,Not Useful
4,/*ZLIB_VERNUM >= 0x1240*/,#if ZLIB_VERNUM >= 0x1240\n\n/*ZLIB_VERNUM >= ...,Not Useful,Not Useful
...,...,...,...,...
996,/*The following document where the background ...,-5. #define PNG_CMAP_NONE 0\n\n /*The fol...,Useful,Useful
997,/*Do all the *safe* initialization - 'safe' me...,-1. } png_image_read_control;\n/* Do all the *...,Useful,Useful
998,/*And set the rest of the structure to NULL to...,-10. * called from here must *not* call png_m...,Useful,Useful
999,"/*Use png_ptr here, not info_ptr, because by e...",-8. static png_uint_32\n-7. png_image_format(p...,Useful,Useful


In [19]:
outfile_name = '_'.join(EXP_NAME.split('-')) + '_Secondary_Results_iREL.csv'

outfile_name

'irse_deberta_Secondary_Results_iREL.csv'

In [None]:
output_df.to_csv(outfile_name, index=False)