# Fine-Tuning Bert for text Classification
## ECHR Violation Prediction
Following the tutorial of [huggingface](https://huggingface.co/docs/transformers/tasks/sequence_classification)

In [None]:
# package required
!pip install transformers
!pip install datasets
!pip install evaluate

In [None]:
pip install accelerate -U

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

import sys
sys.path.append('/content/gdrive/hlt-project')

Mounted at /content/gdrive/


In [None]:
%cd gdrive/MyDrive/'hlt-project'

/content/gdrive/MyDrive/hlt-project


In [None]:
import pandas as pd

df_train = pd.read_csv('ECHR_Dataset_Sub/EN_train_sub.csv')
df_dev = pd.read_csv('ECHR_Dataset_Sub/EN_dev_sub.csv')
df_test = pd.read_csv('ECHR_Dataset_Sub/EN_test_sub.csv')

In [None]:
print(df_train[['VIOLATED_ARTICLES']])

   VIOLATED_ARTICLES
0                 []
1                 []
2         ['2', '6']
3        ['13', '6']
4              ['5']
..               ...
95                []
96                []
97                []
98        ['3', '8']
99       ['6', 'P1']

[100 rows x 1 columns]


In [None]:
# print VIOLATED_ARTICLES and labels
print(df_train[['VIOLATED_ARTICLES', 'LABEL']].sample(10))

   VIOLATED_ARTICLES  LABELS
1                 []       1
41             ['6']       1
87                []       1
21                []       1
85                []       1
99       ['6', 'P1']       1
16             ['6']       1
22                []       1
97                []       1
76                []       1


In [None]:
# remove all columns except text and label
df_train = df_train[['TEXT', 'LABEL']]
df_dev = df_dev[['TEXT', 'LABEL']]
df_test = df_test[['TEXT', 'LABEL']]

print(df_train.head())

In [None]:
train_values = df_train['TEXT'].values
dev_values = df_dev['TEXT'].values
test_values = df_test['TEXT'].values

df_train['TEXT'] = ["".join(x) for x in train_values]
df_dev['TEXT'] = ["".join(x) for x in dev_values]
df_test['TEXT'] = ["".join(x) for x in test_values]


In [None]:
print(type(df_train['TEXT'][0]))
df_train['TEXT'][0]

In [None]:
# change name TEXT to text and LABEL to label
df_train.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)
df_dev.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)
df_test.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"])

In [None]:
# df_train from pandas dataframe to huggingface dataset format
from datasets import Dataset
train_dataset = Dataset.from_pandas(df_train)
train_dataset

In [None]:
train_dataset_tokenized = train_dataset.map(preprocess_function, batched= True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3209 > 512). Running this sequence through the model will result in indexing errors


In [None]:
print(train_dataset_tokenized)
print(train_dataset_tokenized[0]['input_ids'].__len__())

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})
3209


In [None]:
# do the same for df_dev
dev_dataset = Dataset.from_pandas(df_dev)
dev_dataset_tokenized = dev_dataset.map(preprocess_function, batched=True)
print(dev_dataset_tokenized[0]['input_ids'].__len__())
dev_dataset_tokenized


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

2968


Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [None]:
type(dev_dataset_tokenized[0]['input_ids'])

list

In [None]:
def head_tail_tokens(dataset, head:int, tail:int):

    # from dataset transformers to dataframe pandas
    df = pd.DataFrame(dataset)

    # slice each input_ids value  only if head + tail < len(input_ids)
    df['input_ids'] = df['input_ids'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)
     # do the same for attention_mask
    df['attention_mask'] = df['attention_mask'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)
    # do the same for token_type_ids
    df['token_type_ids'] = df['token_type_ids'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)


    # convert back to dataset transformers
    dataset = Dataset.from_pandas(df)

    return dataset

In [None]:
train_head_tail = head_tail_tokens(train_dataset_tokenized, 250,250)
print(train_head_tail[1]['input_ids'].__len__())

dev_head_tail = head_tail_tokens(dev_dataset_tokenized, 250,250)
print(dev_head_tail[1]['input_ids'].__len__())

294
500


In [None]:
# batch
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [None]:
# evaluate
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
# labels
id2label = {0: 'negative', 1: 'positive'}
label2id = {'negative': 0, 'positive': 1}

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='bert_echr',          # output directory
    learning_rate=2e-5,              # learning rate
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='bert_echr/logs',    # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_head_tail,         # training dataset
    eval_dataset=dev_head_tail,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.686692,0.52
2,0.673500,0.670039,0.6
3,0.640800,0.635995,0.69
4,0.640800,0.624033,0.64
5,0.579600,0.613142,0.67


TrainOutput(global_step=35, training_loss=0.6180447237832206, metrics={'train_runtime': 270.209, 'train_samples_per_second': 1.85, 'train_steps_per_second': 0.13, 'total_flos': 128472195000000.0, 'train_loss': 0.6180447237832206, 'epoch': 5.0})

In [None]:
trainer.save_model('model-echr-bert')

In [None]:
test_dataset = Dataset.from_pandas(df_test)

In [None]:
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained("model-echr-bert")
inputs = tokenizer(test_dataset[7]['text'], return_tensors="pt", truncation=True)

In [None]:
inputs

{'input_ids': tensor([[  101,  1031,  1005,  1017,  1012,  1996,  2862,  1997, 17362,  1998,
          1996,  7882,  4751,  1997,  1996,  5097,  2024,  2275,  2041,  1999,
          1996, 10439, 21945,  2795,  1012,  1005,  1010,  1005,  1018,  1012,
          1996, 17362, 10865,  1997,  1996, 11664,  3091,  1997,  2037,  3653,
          1011,  3979, 12345,  1012,  1999,  4646,  2053,  1012,  5354,  2575,
         17134,  1013,  2403,  1010,  1996, 23761,  2036,  2992,  2060, 10821,
          2104,  1996,  8910,  1997,  1996,  4680,  1012,  1005,  1033,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("model-echr-bert")

import torch

with torch.no_grad():
  logits = model(**inputs).logits

In [None]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'negative'

In [None]:
test_dataset[7]['label']

1

In [None]:
def model_accuracy(dataset):

  correct = 0

  for test in dataset:

    inputs = tokenizer(test['text'], return_tensors="pt", truncation=True)
    with torch.no_grad():
      logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()

    if predicted_class_id == test['label']:
      correct = correct + 1

  return correct / len(dataset)

In [None]:
model_accuracy(test_dataset)

0.71

In [None]:
!pip install pipeline

Collecting pipeline
  Downloading pipeline-0.1.0-py3-none-any.whl (2.6 kB)
Installing collected packages: pipeline
Successfully installed pipeline-0.1.0


In [None]:
!pip install xformers

Collecting xformers
  Downloading xformers-0.0.20-cp310-cp310-manylinux2014_x86_64.whl (109.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyre-extensions==0.0.29 (from xformers)
  Downloading pyre_extensions-0.0.29-py3-none-any.whl (12 kB)
Collecting typing-inspect (from pyre-extensions==0.0.29->xformers)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect->pyre-extensions==0.0.29->xformers)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, typing-inspect, pyre-extensions, xformers
Successfully installed mypy-extensions-1.0.0 pyre-extensions-0.0.29 typing-inspect-0.9.0 xformers-0.0.20


In [None]:
from transformers import pipeline

classifier = pipeline("text-classification", model="model-echr-bert", tokenizer = tokenizer, device = 0)
classifier(test_dataset[50]['text'], truncation = True)

[{'label': 'negative', 'score': 0.8151789307594299}]

In [None]:
test_dataset[50]['label']

0

In [None]:
def pipeline_metrics(dataset):

  predicted_labels = []
  labels = []

  for test in dataset:

    classifier = pipeline("text-classification", model="model-echr-bert", tokenizer = tokenizer, device = 0)
    predicted_class_label = classifier(test['text'], truncation = True)

    predicted_class_id = label2id[predicted_class_label[0]['label']]

    predicted_labels.append(predicted_class_id)
    labels.append(test['label'])

  return predicted_labels,labels

In [None]:
predicted_labels,labels = pipeline_metrics(test_dataset)

In [None]:
def tokenize_test(examples):
  return tokenizer(examples['text'])

In [None]:
# do the same for df_test
test_dataset = Dataset.from_pandas(df_test)
test_dataset_tokenized = dev_dataset.map(tokenize_test, batched=True)
print(dev_dataset_tokenized[0]['input_ids'].__len__())
dev_dataset_tokenized

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

2968


Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [None]:
test_head_tail = head_tail_tokens(test_dataset_tokenized, 250,250)
print(test_head_tail[1]['input_ids'].__len__())

500


In [None]:
model = AutoModelForSequenceClassification.from_pretrained("model-echr-bert")


In [None]:
import torch
def pipeline_metrics_head_tail(dataset):

  predicted_labels = []
  labels = []

  for test in dataset:

    with torch.no_grad():
      logits = model(test['input_ids']).logits

    predicted_class_id = logits.argmax().item()

    predicted_labels.append(predicted_class_id)
    labels.append(test['label'])

  return predicted_labels,labels

In [None]:
predicted_labels,labels = pipeline_metrics_head_tail(test_head_tail)

TypeError: ignored

In [None]:
from sklearn.metrics import classification_report

print(classification_report(labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.78      0.58      0.67        50
           1       0.67      0.84      0.74        50

    accuracy                           0.71       100
   macro avg       0.73      0.71      0.71       100
weighted avg       0.73      0.71      0.71       100

