# Fine-Tuning BERT for Text Classification
## ECHR Violation Prediction
This notebook follows the Hugging Face [sequence classification tutorial](https://huggingface.co/docs/transformers/tasks/sequence_classification).

In [1]:
import pandas as pd
from src.utils import load_ECHR

# Read the train / dev / test splits (stored as JSON) into pandas DataFrames.
(df_train, df_dev, df_test) = load_ECHR()

In [2]:
# Binary target: 0 when VIOLATED_ARTICLES is empty, 1 otherwise.
# `.str.len()` measures the length of each list element-wise, so the same rule
# is applied to all three splits without repeating a lambda, and it does not
# depend on the value being exactly a Python `list` (as `x == []` does).
for split in (df_train, df_dev, df_test):
    split['LABEL'] = split['VIOLATED_ARTICLES'].str.len().gt(0).astype(int)

# Spot-check VIOLATED_ARTICLES against the derived labels; the fixed seed makes
# the printed sample reproducible across runs.
print(df_train[['VIOLATED_ARTICLES', 'LABEL']].sample(10, random_state=42))

     VIOLATED_ARTICLES  LABEL
5704                []      0
5154           [13, 8]      1
805                 []      0
168                [6]      1
3673               [6]      1
2269               [6]      1
5172               [6]      1
1037                []      0
5587                []      0
4437                []      0


In [3]:
# Keep only the model input (TEXT) and the target (LABEL).
# `.copy()` makes each result an independent frame: the next cell assigns into
# the TEXT column, and doing that on a column slice triggers pandas'
# SettingWithCopyWarning.
columns_to_keep = ['TEXT', 'LABEL']
df_train = df_train[columns_to_keep].copy()
df_dev = df_dev[columns_to_keep].copy()
df_test = df_test[columns_to_keep].copy()

print(df_train.head())

                                                TEXT  LABEL
0  [7. On 28 September 1994 the applicant's husba...      0
1  [8. The applicant was born in 1974 and lives i...      0
2  [5. The first applicant, Mr Ivan Dvořáček, was...      1
3  [4. The applicant was born in 1959 and lives i...      1
4  [6. The applicant was born in 1946., 7. On 14 ...      1


In [4]:
def _join_paragraphs(paragraphs):
    """Merge one row's list of paragraph strings into a single space-separated string.

    A value that is already a string is returned unchanged, so running this cell
    a second time cannot re-join an already-joined text character by character
    (``" ".join("abc")`` would yield ``"a b c"``).
    """
    if isinstance(paragraphs, str):
        return paragraphs
    return " ".join(paragraphs)


train_values = df_train['TEXT'].values
dev_values = df_dev['TEXT'].values
test_values = df_test['TEXT'].values

# Replace each list of paragraphs with one flat string per row.
df_train['TEXT'] = [_join_paragraphs(x) for x in train_values]
df_dev['TEXT'] = [_join_paragraphs(x) for x in dev_values]
df_test['TEXT'] = [_join_paragraphs(x) for x in test_values]


In [5]:
# Sanity check: after joining, the first TEXT entry should be a plain string.
# The column is still named 'TEXT' at this point (it is renamed to 'text' only
# in a later cell), so indexing with 'text' here raises KeyError on a fresh
# Restart & Run All.
assert isinstance(df_train['TEXT'].iloc[0], str)
df_train['TEXT']

0       7. On 28 September 1994 the applicant's husban...
1       8. The applicant was born in 1974 and lives in...
2       5. The first applicant, Mr Ivan Dvořáček, was ...
3       4. The applicant was born in 1959 and lives in...
4       6. The applicant was born in 1946. 7. On 14 Au...
                              ...                        
7095    5. The applicant was born in 1943 and lives in...
7096    The applicant, Mr Dušan Václavík, is a Slovaki...
7097    4. The applicant was born in 1976 and is curre...
7098    The applicants are relatives. They are all Slo...
7099    The applicant, a Dutch national, was born in 1...
Name: TEXT, Length: 7100, dtype: object

In [18]:
# Rename TEXT -> text and LABEL -> label; `preprocess_function` reads the
# "text" column. Reassigning the result (instead of `inplace=True`) keeps the
# cell free of in-place mutation and uses one shared mapping for all splits.
column_names = {'TEXT': 'text', 'LABEL': 'label'}
df_train = df_train.rename(columns=column_names)
df_dev = df_dev.rename(columns=column_names)
df_test = df_test.rename(columns=column_names)

In [19]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned below.
checkpoint_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [20]:
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with the notebook's tokenizer.

    Truncation is enabled, so over-long inputs are cut down by the tokenizer.
    Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [21]:
# Wrap the training DataFrame in a Hugging Face `Dataset`.
from datasets import Dataset
train_dataset = Dataset.from_pandas(df=df_train)
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 7100
})

In [22]:
# Tokenize the training set; `batched=True` hands `preprocess_function`
# batches of examples rather than one example at a time.
train_dataset_tokenized = train_dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/7100 [00:00<?, ? examples/s]

In [23]:
# Show the tokenized dataset's columns, then the token count of its first example.
print(train_dataset_tokenized)
first_example = train_dataset_tokenized[0]
print(len(first_example['input_ids']))

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7100
})
512


In [24]:
# Same conversion and tokenization for the dev split.
dev_dataset = Dataset.from_pandas(df=df_dev)
dev_dataset_tokenized = dev_dataset.map(
    preprocess_function,
    batched=True,
)
dev_dataset_tokenized


Map:   0%|          | 0/1380 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1380
})

In [27]:
# Collator that pads the examples of each batch when the batch is assembled
# (padding=True is the collator's default, spelled out here for clarity).
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [28]:
# evaluate 
import evaluate 
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked into (predictions, labels). The predicted class is
    the arg-max of the predictions along axis 1, and the result of
    ``accuracy.compute`` for those classes against the labels is returned.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [29]:
# Class-id <-> class-name lookup tables handed to the model below.
id2label = {0: 'negative', 1: 'positive'}
# The reverse mapping is derived from id2label so the two cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}

In [30]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a two-class classification head and the
# label mappings defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='bert_echr',
    logging_dir='bert_echr/logs',
    logging_steps=10,
    # optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Bundle the model, data, tokenizer, collator and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset_tokenized,
    eval_dataset=dev_dataset_tokenized,
    compute_metrics=compute_metrics,
)


In [None]:
# Launch fine-tuning with the Trainer configured above; the call's return value
# is displayed as this cell's output.
trainer.train()

In [None]:
# test model
from transformers import pipeline

# During training the Trainer only writes `checkpoint-<step>` subfolders under
# `bert_echr`, so the directory itself holds no loadable model. Save the
# fine-tuned model (and the tokenizer given to the Trainer) there explicitly
# before pointing the pipeline at it.
trainer.save_model('bert_echr')

classifier = pipeline('text-classification', model='bert_echr')

# The test texts are full case descriptions and can exceed the model's maximum
# input length; truncate them the same way the training data was truncated.
classifier(df_test['text'].iloc[0], truncation=True)