# Fine-Tuning Bert for text Classification
## ECHR Violation Prediction
Following the tutorial of [huggingface](https://huggingface.co/docs/transformers/tasks/sequence_classification)

In [5]:
import pandas as pd
from src.utils import load_ECHR, load_ECHR_small

# load train, dev and test dataset from json to pandas dataframe
df_train, df_dev, df_test = load_ECHR_small()

# save to csv
#df_train.to_csv('ECHR_Dataset_Small/EN_train_small.csv', index=False)
#df_dev.to_csv('ECHR_Dataset_Small/EN_dev_small.csv', index=False)
#df_test.to_csv('ECHR_Dataset_Small/EN_test_small.csv', index=False)

In [7]:
# read from csv file
#import pandas as pd
#df_train = pd.read_csv('ECHR_Dataset_Small/EN_train_small.csv')
#df_test = pd.read_csv('ECHR_Dataset_Small/EN_test_small.csv')
#df_dev = pd.read_csv('ECHR_Dataset_Small/EN_dev_small.csv')

In [8]:
df_train['TEXT']

0     ["7. On 28 September 1994 the applicant's husb...
1     ['8. The applicant was born in 1974 and lives ...
2     ['5. The first applicant, Mr Ivan Dvořáček, wa...
3     ['4. The applicant was born in 1959 and lives ...
4     ['6. The applicant was born in 1946.', '7. On ...
                            ...                        
95    ['The applicant is a Ukrainian citizen, born i...
96    ['6. The applicants were born in 1944, 1946 an...
97    ['1. The case was referred to the Court by the...
98    ['5. The applicant was born in 1979 and lives ...
99    ['4. The applicant was born in 1959 and lives ...
Name: TEXT, Length: 100, dtype: object

In [9]:
# add a column with 0/1 labels to the dataframe 0 if VIOLATED_ARTICLE is empty, 1 otherwise
df_train['LABEL'] = df_train['VIOLATED_ARTICLES'].apply(lambda x: 0 if x == [] else 1)
df_dev['LABEL'] = df_dev['VIOLATED_ARTICLES'].apply(lambda x: 0 if x == [] else 1)
df_test['LABEL'] = df_test['VIOLATED_ARTICLES'].apply(lambda x: 0 if x == [] else 1)

# print VIOLATED_ARTICLES and labels
print(df_train[['VIOLATED_ARTICLES', 'LABEL']].head(10))

  VIOLATED_ARTICLES  LABEL
0                []      0
1                []      0
2        ['2', '6']      1
3       ['13', '6']      1
4             ['5']      1
5       ['13', '6']      1
6                []      0
7             ['6']      1
8        ['5', '6']      1
9                []      0


In [10]:
# remove all columns except text and label
df_train = df_train[['TEXT', 'LABEL']]
df_dev = df_dev[['TEXT', 'LABEL']]
df_test = df_test[['TEXT', 'LABEL']]

print(df_train.head())

                                                TEXT  LABEL
0  ["7. On 28 September 1994 the applicant's husb...      0
1  ['8. The applicant was born in 1974 and lives ...      0
2  ['5. The first applicant, Mr Ivan Dvořáček, wa...      1
3  ['4. The applicant was born in 1959 and lives ...      1
4  ['6. The applicant was born in 1946.', '7. On ...      1


In [11]:
train_values = df_train['TEXT'].values
dev_values = df_dev['TEXT'].values
test_values = df_test['TEXT'].values

df_train['TEXT'] = ["".join(x) for x in train_values]
df_dev['TEXT'] = ["".join(x) for x in dev_values]
df_test['TEXT'] = ["".join(x) for x in test_values]


In [12]:
type(df_train['TEXT'][0])
df_train['TEXT']

0     ["7. On 28 September 1994 the applicant's husb...
1     ['8. The applicant was born in 1974 and lives ...
2     ['5. The first applicant, Mr Ivan Dvořáček, wa...
3     ['4. The applicant was born in 1959 and lives ...
4     ['6. The applicant was born in 1946.', '7. On ...
                            ...                        
95    ['The applicant is a Ukrainian citizen, born i...
96    ['6. The applicants were born in 1944, 1946 an...
97    ['1. The case was referred to the Court by the...
98    ['5. The applicant was born in 1979 and lives ...
99    ['4. The applicant was born in 1959 and lives ...
Name: TEXT, Length: 100, dtype: object

In [13]:
# change name TEXT to text and LABEL to label
df_train.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)
df_dev.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)
df_test.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)

In [14]:
# check if 0/1 class is balanced
df_train['label'].value_counts()

label
1    61
0    39
Name: count, dtype: int64

In [15]:
df_test['label'].value_counts()

label
1    34
0    16
Name: count, dtype: int64

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [9]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [10]:
# df_train from pandas dataframe to huggingface dataset format
from datasets import Dataset 
train_dataset = Dataset.from_pandas(df_train)
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})

In [11]:
train_dataset_tokenized = train_dataset.map(preprocess_function, batched= True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [12]:
print(train_dataset_tokenized)
print(train_dataset_tokenized[0]['input_ids'].__len__())

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})
512


In [13]:
# do the same for df_dev
dev_dataset = Dataset.from_pandas(df_dev)
dev_dataset_tokenized = dev_dataset.map(preprocess_function, batched=True)
dev_dataset_tokenized


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 50
})

In [14]:
# batch
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# evaluate 
import evaluate 
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [16]:
# labels 
id2label = {0: 'negative', 1: 'positive'}
label2id = {'negative': 0, 'positive': 1}

In [17]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=2,
    id2label=id2label,
    label2id=label2id
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='bert_echr',          # output directory
    learning_rate=2e-5,              # learning rate
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='bert_echr/logs',    # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset_tokenized,         # training dataset
    eval_dataset=dev_dataset_tokenized,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [19]:
trainer.train()



  0%|          | 0/35 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
trainer.save_model("bert-echr-classification")