# Fine-Tuning Bert for text Classification - PROTOTYPING
## ECHR Violation Prediction
Following the tutorial of [huggingface](https://huggingface.co/docs/transformers/tasks/sequence_classification)

### Load the prorotyping subset of the dataset.

In [9]:
import pandas as pd

df_train = pd.read_csv('ECHR_Dataset_Sub/EN_train_sub.csv')
df_dev = pd.read_csv('ECHR_Dataset_Sub/EN_dev_sub.csv')
df_test = pd.read_csv('ECHR_Dataset_Sub/EN_test_sub.csv')

In [11]:
print(df_train[['VIOLATED_ARTICLES']])

   VIOLATED_ARTICLES
0                 []
1                 []
2                 []
3                 []
4                 []
..               ...
95             ['6']
96             ['6']
97             ['6']
98        ['3', '5']
99             ['6']

[100 rows x 1 columns]


In [12]:
# print VIOLATED_ARTICLES and labels
print(df_train[['VIOLATED_ARTICLES', 'LABEL']].sample(10))

   VIOLATED_ARTICLES  LABEL
97             ['6']      1
93             ['5']      1
33                []      0
95             ['6']      1
62             ['8']      1
25                []      0
34                []      0
72             ['6']      1
2                 []      0
32                []      0


### Prepare Dataset: use just text and label columns

In [13]:
# remove all columns except text and label
df_train = df_train[['TEXT', 'LABEL']]
df_dev = df_dev[['TEXT', 'LABEL']]
df_test = df_test[['TEXT', 'LABEL']]

print(df_train.head())

                                                TEXT  LABEL
0  ['1. The applicant, Mr Jusuf Nezirović, is a S...      0
1  ['4. The applicant, Mr İbrahim Acar was born i...      0
2  ['6. The applicant was born in 1977 and lives ...      0
3  ['The applicants, Monika and Sascha Freilinger...      0
4  ['The applicant, Mr Pavel Janata, is a Slovaki...      0


In [14]:
train_values = df_train['TEXT'].values
dev_values = df_dev['TEXT'].values
test_values = df_test['TEXT'].values

df_train['TEXT'] = ["".join(x) for x in train_values]
df_dev['TEXT'] = ["".join(x) for x in dev_values]
df_test['TEXT'] = ["".join(x) for x in test_values]


In [15]:
print(type(df_train['TEXT'][0]))
df_train['TEXT'][0]

<class 'str'>


"['1. The applicant, Mr Jusuf Nezirović, is a Slovenian national who was born in 1961 and lives in Trbovlje. He was represented before the Court by Mr Boštjan Verstovšek, a lawyer practising in Celje.', '2.', '3. On 12 October 1999 the applicant was injured in an industrial accident. The applicant’s employer had taken out insurance with the insurance company ZT.', '4. On 12 September 2001 the applicant instituted civil proceedings against ZT in the Ljubljana District Court (Okrožno sodišče v Ljubljani) seeking damages in the amount of 8,590,967 Slovenian tolars (approximately 35,800 euros) for the injuries sustained.', 'Between 1 October 2001 and 19 January 2006 the applicant lodged seven preliminary written submissions and/or adduced evidence.', 'Between 3 September 2003 and 21 September 2005 he made four requests that a date be set for a hearing. In letters to the applicant of 11 September 2003 and 28 June 2004, the judge explained that his case was not yet at the top of the list of 

In [16]:
# change name TEXT to text and LABEL to label
df_train.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)
df_dev.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)
df_test.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)

### Encode the text using Tokenizer pretrained form BERT

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [9]:
def preprocess_function(examples):
    return tokenizer(examples["text"])

In [10]:
# df_train from pandas dataframe to huggingface dataset format
from datasets import Dataset
train_dataset = Dataset.from_pandas(df_train)
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})

In [12]:
train_dataset_tokenized = train_dataset.map(preprocess_function, batched= True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3209 > 512). Running this sequence through the model will result in indexing errors


In [13]:
print(train_dataset_tokenized)
print(train_dataset_tokenized[0]['input_ids'].__len__())

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})
3209


In [14]:
# do the same for df_dev
dev_dataset = Dataset.from_pandas(df_dev)
dev_dataset_tokenized = dev_dataset.map(preprocess_function, batched=True)
print(dev_dataset_tokenized[0]['input_ids'].__len__())
dev_dataset_tokenized


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

2968


Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [15]:
type(dev_dataset_tokenized[0]['input_ids'])

list

### Instead of truncate to first 512, try to undarstand the best way to use the whole text.

In [19]:
def head_tail_tokens(dataset, head:int, tail:int):

    # from dataset transformers to dataframe pandas
    df = pd.DataFrame(dataset)

    # slice each input_ids value  only if head + tail < len(input_ids)
    df['input_ids'] = df['input_ids'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)
     # do the same for attention_mask
    df['attention_mask'] = df['attention_mask'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)
    # do the same for token_type_ids
    df['token_type_ids'] = df['token_type_ids'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)


    # convert back to dataset transformers
    dataset = Dataset.from_pandas(df)

    return dataset

In [17]:
train_head_tail = head_tail_tokens(train_dataset_tokenized, 250,250)
print(train_head_tail[1]['input_ids'].__len__())

dev_head_tail = head_tail_tokens(dev_dataset_tokenized, 250,250)
print(dev_head_tail[1]['input_ids'].__len__())

294
500


### Define model, metrics and training parameters.

In [18]:
# batch
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt", padding=True)

In [19]:
# evaluate
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [20]:
# labels
id2label = {0: 'negative', 1: 'positive'}
label2id = {'negative': 0, 'positive': 1}

In [21]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='bert_echr',          # output directory
    learning_rate=2e-5,              # learning rate
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='bert_echr/logs',    # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_head_tail,         # training dataset
    eval_dataset=dev_head_tail,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    tokenizer=tokenizer,
    data_collator=data_collator
)


### Train

In [None]:
#trainer.train()

In [None]:
#trainer.save_model('model-echr-bert')

### Evaluate the model on the test set

In [4]:
# load trained model 
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("model-echr-bert")

In [6]:
from transformers import AutoTokenizer
tokenizer_echr = AutoTokenizer.from_pretrained("model-echr-bert")
# tokenize function for test 
def preprocess_echr(examples):
    return tokenizer_echr(examples["text"])

In [17]:
from datasets import Dataset
# encode test dataset with trained model
test_dataset = Dataset.from_pandas(df_test)
print(test_dataset)

test_dataset = Dataset.from_pandas(df_test)
test_dataset_tokenized = test_dataset.map(preprocess_echr, batched=True)


Dataset({
    features: ['text', 'label'],
    num_rows: 100
})


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1456 > 512). Running this sequence through the model will result in indexing errors


In [20]:
test_head_tail = head_tail_tokens(test_dataset_tokenized, 250,250)
print(test_head_tail[1]['input_ids'].__len__())

446


In [23]:
import torch
def metrics_model(dataset):

  predicted_labels = []
  labels = []

  for test in dataset:

    # convert to tensor
    test['input_ids'] = torch.Tensor(test['input_ids'])
    test['token_type_ids'] = torch.Tensor(test['token_type_ids'])
    test['attention_mask'] = torch.Tensor(test['attention_mask'])

    # reshape
    test['input_ids'] = test['input_ids'].reshape(1,-1).to(torch.int64)
    test['token_type_ids'] = test['token_type_ids'].reshape(1,-1).to(torch.int64)
    test['attention_mask'] = test['attention_mask'].reshape(1,-1).to(torch.int64)

    with torch.no_grad():
      logits = model(input_ids = test['input_ids'], token_type_ids = test['token_type_ids'], attention_mask = test['attention_mask']).logits

    predicted_class_id = logits.argmax().item()

    predicted_labels.append(predicted_class_id)
    labels.append(test['label'])
  
  return predicted_labels,labels

In [24]:
predicted_labels,labels = metrics_model(test_head_tail)

In [25]:
from sklearn.metrics import classification_report

print(classification_report(labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.78      0.42      0.55        50
           1       0.60      0.88      0.72        50

    accuracy                           0.65       100
   macro avg       0.69      0.65      0.63       100
weighted avg       0.69      0.65      0.63       100

