In [1]:
# Making imports convenient
import sys
import os
# Repository root: the part of the working directory that precedes '/notebooks'
# (the whole working directory when that marker is absent). It is placed on the
# module search path ahead of the `src.*` import further down in this cell.
PATH = os.getcwd().partition('/notebooks')[0]
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import load_metric,load_dataset,Dataset

import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding,RobertaForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer


import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedKFold
from tqdm.auto import tqdm, trange

import csv
import gc

from src.utils.myutils import clean_memory,compute_metrics,preprocess_data

# Run configuration shared by the cells below.
BATCH_SIZE = 32
model_checkpoint = 'roberta-base'

# Limit transformers' own logging to errors.
transformers.utils.logging.set_verbosity_error()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Fixed seed for the WNC train/validation split. Without an explicit seed the
# 10% hold-out is not guaranteed to be the same across runs or machines, which
# makes validation scores (and best-checkpoint selection) hard to compare.
WNC_SPLIT_SEED = 42

# BABE is read as a single table (the CSV loader exposes it as the 'train' split).
babe = load_dataset('csv',data_files=PATH+"/data/EN/processed/BABE/babe_sg2.csv")['train']

# WNC is read the same way, then split 90/10 into 'train' and 'test' parts.
wnc = load_dataset('csv',data_files=PATH+"/data/EN/processed/WNC/wnc.csv")['train'].train_test_split(
    test_size=0.1,
    seed=WNC_SPLIT_SEED,
)

Using custom data configuration default-9aed32b2774fea6c
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-9aed32b2774fea6c/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Using custom data configuration default-8a96bc76b9b8d191
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-8a96bc76b9b8d191/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Loading cached split indices for dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-8a96bc76b9b8d191/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-6039b490736d23fd.arrow and /home/horyctom/.cache/huggingface/datasets/csv/default-8a96bc76b9b8d191/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-bf5a72ce117f714d.arrow


In [29]:
# Display the WNC DatasetDict to check the features and row counts of the train/test splits.
wnc

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 326691
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 36299
    })
})

In [3]:
# Tokenizer and sequence-classification model, both built from `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# `.to(device)` returns the module itself, so the model can be placed on the
# selected device in the same statement that creates it.
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint).to(device)

In [4]:
# Run the project helper `preprocess_data` over both WNC splits, passing
# 'sentence' as the text column (presumably it tokenizes that column — see
# src.utils.myutils to confirm).
train_tokenized, val_tokenized = (
    preprocess_data(wnc[split_name], tokenizer, 'sentence')
    for split_name in ('train', 'test')
)
# BABE goes through the same helper, with 'text' as the column name.
babe_tokenized = preprocess_data(babe, tokenizer, 'text')
# Collator that pads each batch using the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-8a96bc76b9b8d191/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-bc23f2f9d2eb1647.arrow
Loading cached processed dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-8a96bc76b9b8d191/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-d534be102d6fd42a.arrow


  0%|          | 0/4 [00:00<?, ?ba/s]

In [53]:
# One shared cadence for logging, evaluation and checkpointing.
# `load_best_model_at_end=True` can only restore a checkpoint that was saved at
# an evaluation step, so the save strategy must match the evaluation strategy
# and `save_steps` must be a multiple of `eval_steps`. Setting all of them
# explicitly avoids depending on `eval_steps` being inferred from
# `logging_steps` and on the default `save_steps`.
EVAL_EVERY_N_STEPS = 1000

training_args = TrainingArguments(
    output_dir='./',
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=2000,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Logging / evaluation / checkpointing, all on the same step interval
    logging_steps=EVAL_EVERY_N_STEPS,
    evaluation_strategy="steps",
    eval_steps=EVAL_EVERY_N_STEPS,
    save_strategy="steps",
    save_steps=EVAL_EVERY_N_STEPS,
    save_total_limit=2,
    # Keep the checkpoint with the best validation F1 as the final model
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    disable_tqdm=False,
)

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [50]:
# Loaded lazily on the first evaluation and then reused: the Trainer calls
# `compute_metrics` at every evaluation step, so loading the metric inside the
# function repeated the same set-up work each time.
_f1_metric = None


def compute_metrics(eval_preds):
    """Compute the F1 score for one evaluation pass.

    Note: this definition shadows the `compute_metrics` imported from
    `src.utils.myutils` at the top of the notebook.

    Args:
        eval_preds: A `(predictions, labels)` pair as passed by the Trainer.
            `predictions` holds the logits, or a tuple whose first element is
            the logits when the model returns several outputs.

    Returns:
        The dict produced by the "f1" metric's `compute` call.
    """
    global _f1_metric
    if _f1_metric is None:
        _f1_metric = load_metric("f1")

    logits, labels = eval_preds
    # Models that return extra outputs hand over a tuple; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return _f1_metric.compute(predictions=predictions, references=labels)

In [54]:
# Start from a fresh copy of the pretrained checkpoint, so re-running this cell
# does not continue from previously updated weights.
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint)

# Train on the WNC training split and evaluate on the held-out WNC split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_mo

Step,Training Loss,Validation Loss,F1
100,0.6969,0.692564,0.002075
200,0.6876,0.66358,0.622005
300,0.6557,0.617124,0.630144


***** Running Evaluation *****
  Num examples = 36299
  Batch size = 32
Saving model checkpoint to ./checkpoint-100
Configuration saved in ./checkpoint-100/config.json
Model weights saved in ./checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./checkpoint-100/tokenizer_config.json
Special tokens file saved in ./checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 36299
  Batch size = 32
Saving model checkpoint to ./checkpoint-200
Configuration saved in ./checkpoint-200/config.json
Model weights saved in ./checkpoint-200/pytorch_model.bin
tokenizer config file saved in ./checkpoint-200/tokenizer_config.json
Special tokens file saved in ./checkpoint-200/special_tokens_map.json
Deleting older checkpoint [checkpoint-100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 36299
  Batch size = 32
Saving model checkpoint to ./checkpoint-300
Configuration saved in ./checkpoint-300/config.json
Model weights saved in ./che

KeyboardInterrupt: 

In [None]:
# Restore model weights from a saved state dict.
# `map_location=device` remaps the stored tensors onto the device chosen in the
# configuration cell, so a checkpoint written on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load('../cs_babe.pth', map_location=device))
