# Fine-Tuning BERT for Text Classification - PROTOTYPING
## ECHR Violation Prediction
Following the [Hugging Face text classification tutorial](https://huggingface.co/docs/transformers/tasks/sequence_classification).

### Load the prototyping subset of the dataset.

In [None]:
import pandas as pd

# Read the train / dev / test splits of the prototyping subset, in that order.
df_train, df_dev, df_test = map(
    pd.read_csv,
    (
        'ECHR_Dataset_Sub/EN_train_sub.csv',
        'ECHR_Dataset_Sub/EN_dev_sub.csv',
        'ECHR_Dataset_Sub/EN_test_sub.csv',
    ),
)

In [None]:
print(df_train[['VIOLATED_ARTICLES']])

In [None]:
# print VIOLATED_ARTICLES and labels
print(df_train[['VIOLATED_ARTICLES', 'LABEL']].sample(10))

### Prepare Dataset: use just text and label columns

In [None]:
# Keep only the text and label columns of every split.
# The explicit .copy() makes each frame independent of the frame it was
# selected from, so the column assignments and in-place renames performed
# on these frames later in the notebook cannot trigger pandas'
# SettingWithCopyWarning.
df_train = df_train[['TEXT', 'LABEL']].copy()
df_dev = df_dev[['TEXT', 'LABEL']].copy()
df_test = df_test[['TEXT', 'LABEL']].copy()

# Quick visual check of the reduced training frame.
print(df_train.head())

In [None]:
# Grab the raw TEXT arrays of all three splits before overwriting the column.
train_values, dev_values, test_values = (
    frame['TEXT'].values for frame in (df_train, df_dev, df_test)
)

# Collapse every TEXT entry into one string by joining its pieces with the
# empty separator.
# NOTE(review): for an entry that is already a plain string this join
# returns it unchanged -- confirm whether TEXT is expected to hold lists.
df_train['TEXT'] = ["".join(parts) for parts in train_values]
df_dev['TEXT'] = ["".join(parts) for parts in dev_values]
df_test['TEXT'] = ["".join(parts) for parts in test_values]


In [None]:
print(type(df_train['TEXT'][0]))
df_train['TEXT'][0]

In [None]:
# Rename TEXT -> text and LABEL -> label in place on every split.
column_names = {'TEXT': 'text', 'LABEL': 'label'}
for frame in (df_train, df_dev, df_test):
    frame.rename(columns=column_names, inplace=True)

### Encode the text using the pretrained BERT tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level ``tokenizer``.

    No truncation or padding arguments are passed, so the tokenizer's
    defaults apply.
    """
    texts = examples["text"]
    return tokenizer(texts)

In [None]:
# df_train from pandas dataframe to huggingface dataset format
from datasets import Dataset
train_dataset = Dataset.from_pandas(df_train)
train_dataset

In [None]:
train_dataset_tokenized = train_dataset.map(preprocess_function, batched= True)

In [None]:
# Show the tokenized training dataset and the number of tokens in its first example.
print(train_dataset_tokenized)
first_train_ids = train_dataset_tokenized[0]['input_ids']
print(len(first_train_ids))

In [None]:
# Convert the dev split to a Hugging Face Dataset and tokenize it in batches,
# mirroring what was done for the training split.
dev_dataset = Dataset.from_pandas(df_dev)
dev_dataset_tokenized = dev_dataset.map(preprocess_function, batched=True)

# Number of tokens in the first dev example, followed by the dataset summary.
print(len(dev_dataset_tokenized[0]['input_ids']))
dev_dataset_tokenized


In [None]:
type(dev_dataset_tokenized[0]['input_ids'])

### Instead of truncating to the first 512 tokens, try to understand the best way to use the whole text.

In [None]:
def head_tail_tokens(dataset, head:int, tail:int):
    """Shorten long tokenized examples to their first ``head`` and last ``tail`` tokens.

    The dataset is converted to a pandas DataFrame, every token-level column
    that is present (``input_ids``, ``attention_mask``, ``token_type_ids``)
    is shortened, and the result is converted back with
    ``Dataset.from_pandas``.

    Args:
        dataset: tokenized dataset; it is passed to ``pd.DataFrame``.
        head: number of tokens kept from the start of each sequence.
        tail: number of tokens kept from the end of each sequence.

    Returns:
        A new dataset in which every sequence longer than ``head + tail``
        has been reduced to exactly ``head + tail`` tokens; shorter
        sequences are left untouched.

    Raises:
        ValueError: if ``head`` or ``tail`` is negative.
    """
    if head < 0 or tail < 0:
        raise ValueError("head and tail must be non-negative")

    def _keep_head_tail(seq):
        # Sequences that already fit are returned unchanged.
        if len(seq) <= head + tail:
            return seq
        # The tail is sliced by absolute position rather than with
        # seq[-tail:], because seq[-0:] is the *whole* sequence and would
        # make tail == 0 return far more than `head` tokens.
        return seq[:head] + seq[len(seq) - tail:]

    # From transformers dataset to pandas DataFrame.
    df = pd.DataFrame(dataset)

    # Apply the same slicing to every token-aligned column so that ids,
    # mask and segment ids stay in lockstep. Columns a tokenizer does not
    # emit (e.g. token_type_ids) are simply skipped.
    for column in ('input_ids', 'attention_mask', 'token_type_ids'):
        if column in df.columns:
            df[column] = df[column].apply(_keep_head_tail)

    # Convert back to a transformers dataset.
    return Dataset.from_pandas(df)

In [None]:
# Keep the first 250 and last 250 tokens of every training example and
# print the resulting length of the second example as a sanity check.
train_head_tail = head_tail_tokens(train_dataset_tokenized, 250, 250)
print(len(train_head_tail[1]['input_ids']))

# Same treatment and check for the dev split.
dev_head_tail = head_tail_tokens(dev_dataset_tokenized, 250, 250)
print(len(dev_head_tail[1]['input_ids']))

### Define model, metrics and training parameters.

In [None]:
# Batching: build a collator around the tokenizer with padding enabled
# (padding=True) that returns PyTorch tensors (return_tensors="pt").
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt", padding=True)

In [None]:
# evaluate
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return the accuracy of an evaluation pass.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The predicted
    class of each row is the arg-max of ``predictions`` along axis 1, and
    the result is whatever the module-level ``accuracy`` metric computes
    from those classes and the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Class-id -> class-name mapping, and its inverse derived from it so the two
# can never drift apart.
id2label = dict(enumerate(('negative', 'positive')))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification

# Instantiate a sequence-classification model from the "bert-base-uncased"
# checkpoint, configured for two labels and given the id2label / label2id
# mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
    )

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters and bookkeeping options for the fine-tuning run.
# Evaluation and checkpointing both happen once per epoch; the two
# strategies are kept identical, which load_best_model_at_end=True relies on.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='bert_echr',          # output directory
    learning_rate=2e-5,              # learning rate
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='bert_echr/logs',    # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Trainer wiring: trains on the head+tail shortened training split and
# evaluates on the head+tail shortened dev split.
# NOTE(review): newer transformers releases replace the `tokenizer` argument
# with `processing_class` -- confirm against the installed version.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_head_tail,         # training dataset
    eval_dataset=dev_head_tail,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    tokenizer=tokenizer,
    data_collator=data_collator
)


### Train

In [None]:
#trainer.train()

In [None]:
#trainer.save_model('model-echr-bert')

### Evaluate the model on the test set

In [None]:
# load trained model 
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("model-echr-bert")

In [None]:
from transformers import AutoTokenizer
tokenizer_echr = AutoTokenizer.from_pretrained("model-echr-bert")
# tokenize function for test 
def preprocess_echr(examples):
    """Tokenize the ``"text"`` field of ``examples`` with ``tokenizer_echr``.

    ``tokenizer_echr`` is the module-level tokenizer loaded from the
    "model-echr-bert" directory; it is called with its default arguments.
    """
    texts = examples["text"]
    return tokenizer_echr(texts)

In [None]:
from datasets import Dataset
# Convert the test split to a Hugging Face Dataset (once) and show its summary.
test_dataset = Dataset.from_pandas(df_test)
print(test_dataset)

# Tokenize it in batches with preprocess_echr, i.e. with the tokenizer that
# was loaded from the fine-tuned model directory.
test_dataset_tokenized = test_dataset.map(preprocess_echr, batched=True)


In [None]:
# Keep the first 250 and last 250 tokens of every test example and print
# the resulting length of the second example as a sanity check.
test_head_tail = head_tail_tokens(test_dataset_tokenized, 250, 250)
second_test_ids = test_head_tail[1]['input_ids']
print(len(second_test_ids))

In [None]:
import torch
def metrics_model(dataset):
    """Run the module-level ``model`` over ``dataset`` and collect its predictions.

    Every example is fed through the model on its own (batch of one), so no
    padding is involved.

    Args:
        dataset: iterable of mappings providing ``input_ids``,
            ``attention_mask`` and ``label``, plus ``token_type_ids`` when
            the tokenizer produced them.

    Returns:
        A ``(predicted_labels, labels)`` tuple of two lists of equal length:
        the arg-max class id predicted for each example and the
        corresponding ``label`` value taken from the dataset.
    """
    predicted_labels = []
    labels = []

    # Feed inputs on whatever device the model's weights live on.
    device = next(model.parameters()).device

    # Evaluate deterministically (dropout off) and restore the previous
    # mode afterwards so a later training step is not affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for example in dataset:
                # Build integer tensors directly (no float32 round-trip) and
                # add the batch dimension the model expects: (1, seq_len).
                inputs = {
                    name: torch.as_tensor(
                        example[name], dtype=torch.long, device=device
                    ).reshape(1, -1)
                    for name in ('input_ids', 'token_type_ids', 'attention_mask')
                    if name in example
                }

                logits = model(**inputs).logits

                # Arg-max over the class dimension of the single-row output.
                predicted_labels.append(logits.argmax(dim=-1).item())
                labels.append(example['label'])
    finally:
        model.train(was_training)

    return predicted_labels, labels

In [None]:
predicted_labels,labels = metrics_model(test_head_tail)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(labels, predicted_labels))