In [12]:
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline
import torch
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from evaluate import evaluator
import pandas as pd
from nltk.corpus import stopwords

In [13]:
model_name = "google/bert_uncased_L-12_H-768_A-12"

In [14]:
train_df, val_df, test_df = pd.read_csv('../data/csv/train.csv'), pd.read_csv('../data/csv/val.csv'), pd.read_csv('../data/csv/test.csv')
stop_words = set(stopwords.words('english'))

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [16]:
def encode(examples):
    return tokenizer( examples["text"],
                     truncation=True, 
                     padding='max_length',
                     max_length=512)

In [17]:
train_df['text']=train_df['text'].str.lower().str.split().apply(lambda x: [item for item in x if item not in stop_words]).apply(lambda x: " ".join(x)).replace('\d+', '', regex=True)
train_dataset = Dataset.from_pandas(train_df)
val_df['text']= val_df['text'].str.lower().str.split().apply(lambda x: [item for item in x if item not in stop_words]).apply(lambda x: " ".join(x)).replace('\d+', '', regex=True)
val_dataset = Dataset.from_pandas(val_df)
test_df['text'] = test_df['text'].str.lower().str.split().apply(lambda x: [item for item in x if item not in stop_words]).apply(lambda x: " ".join(x)).replace('\d+', '', regex=True)
test_dataset = Dataset.from_pandas(test_df)

In [18]:
train_dataset, val_dataset, test_dataset = [dataset.map(encode, batched=True) for dataset in [train_dataset, val_dataset, test_dataset]]

Map:   0%|          | 0/7100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1380 [00:00<?, ? examples/s]

Map:   0%|          | 0/2998 [00:00<?, ? examples/s]

In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [20]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

In [21]:
metrics = ["accuracy"] #, "f1", "precision", "recall"]
    
for metric in metrics:
    
    
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        metric_evaluator = evaluate.load(metric)
        return metric_evaluator.compute(predictions=predictions, references=labels)
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    
    id2label = {0: "NON_VIOLATED", 1: "VIOLATED"}
    label2id = {"NON_VIOLATED": 0, "VIOLATED": 1}
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label, label2id=label2id)
    
    training_args = TrainingArguments(
        output_dir=f"../models/ECHR/bert_echr_model/removed_stopwords/{metric}/",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset= train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        )
    
    trainer.train()

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.309809,0.876087
2,0.339800,0.30753,0.873188


In [11]:
task_evaluator = evaluator("text-classification")
results = task_evaluator.compute(
    model_or_pipeline=model,
    data=test_dataset,
    metric=evaluate.combine(["accuracy", "precision", "recall", "f1"]),
    tokenizer=tokenizer,
    strategy="simple",
    random_state=0,
    input_column='text',
    label_column='labels',
    label_mapping={"NON_VIOLATED": 0.0, "VIOLATED": 1.0},
)
print(results)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: The size of tensor a (2792) must match the size of tensor b (512) at non-singleton dimension 1