# Fine-Tuning Bert for text Classification
## ECHR Violation Prediction
Following the tutorial of [huggingface](https://huggingface.co/docs/transformers/tasks/sequence_classification)

In [42]:
import pandas as pd
from src.utils import load_ECHR, load_ECHR_small, subsampling

# load train, dev and test dataset from json to pandas dataframe
df_train, df_dev, df_test = load_ECHR('ECHR_Dataset')


In [43]:
df_train_sub = subsampling(df_train, n=50)
df_dev_sub = subsampling(df_dev, n=50)
df_test_sub = subsampling(df_test, n=50)

In [46]:
# save to csv
df_train_sub.to_csv('ECHR_Dataset_Sub/EN_train_sub.csv', index=False)
df_dev_sub.to_csv('ECHR_Dataset_Sub/EN_dev_sub.csv', index=False)
df_test_sub.to_csv('ECHR_Dataset_Sub/EN_test_sub.csv', index=False)

In [5]:
# read from csv file
import pandas as pd
df_train_sub = pd.read_csv('ECHR_Dataset_Sub/EN_train_sub.csv')
df_test_sub = pd.read_csv('ECHR_Dataset_Sub/EN_test_sub.csv')
df_dev_sub = pd.read_csv('ECHR_Dataset_Sub/EN_dev_sub.csv')

In [6]:
df_train_sub['TEXT']

0     ['1. The applicant, Mr Jusuf Nezirović, is a S...
1     ['4. The applicant, Mr İbrahim Acar was born i...
2     ['6. The applicant was born in 1977 and lives ...
3     ['The applicants, Monika and Sascha Freilinger...
4     ['The applicant, Mr Pavel Janata, is a Slovaki...
                            ...                        
95    ['4. The first applicant was born in 1975 and ...
96    ['27. The relevant Articles of the Code of Cri...
97    ['4. The applicants were born in 1955, 1953 an...
98    ['6. The applicants were born in 1964 and 1963...
99    ['5. The applicant was born in 1941 and reside...
Name: TEXT, Length: 100, dtype: object

In [7]:
# print VIOLATED_ARTICLES and labels
print(df_train_sub[['VIOLATED_ARTICLES', 'LABEL']])

   VIOLATED_ARTICLES  LABEL
0                 []      0
1                 []      0
2                 []      0
3                 []      0
4                 []      0
..               ...    ...
95             ['6']      1
96             ['6']      1
97             ['6']      1
98        ['3', '5']      1
99             ['6']      1

[100 rows x 2 columns]


In [8]:
# remove all columns except text and label
df_train_sub = df_train_sub[['TEXT', 'LABEL']]
df_dev_sub = df_dev_sub[['TEXT', 'LABEL']]
df_test_sub = df_test_sub[['TEXT', 'LABEL']]

print(df_train_sub.head())

                                                TEXT  LABEL
0  ['1. The applicant, Mr Jusuf Nezirović, is a S...      0
1  ['4. The applicant, Mr İbrahim Acar was born i...      0
2  ['6. The applicant was born in 1977 and lives ...      0
3  ['The applicants, Monika and Sascha Freilinger...      0
4  ['The applicant, Mr Pavel Janata, is a Slovaki...      0


In [9]:
train_values = df_train_sub['TEXT'].values
dev_values = df_dev_sub['TEXT'].values
test_values = df_test_sub['TEXT'].values

df_train_sub['TEXT'] = ["".join(x) for x in train_values]
df_dev_sub['TEXT'] = ["".join(x) for x in dev_values]
df_test_sub['TEXT'] = ["".join(x) for x in test_values]


In [10]:
type(df_train_sub['TEXT'][0])
df_train_sub['TEXT']

0     ['1. The applicant, Mr Jusuf Nezirović, is a S...
1     ['4. The applicant, Mr İbrahim Acar was born i...
2     ['6. The applicant was born in 1977 and lives ...
3     ['The applicants, Monika and Sascha Freilinger...
4     ['The applicant, Mr Pavel Janata, is a Slovaki...
                            ...                        
95    ['4. The first applicant was born in 1975 and ...
96    ['27. The relevant Articles of the Code of Cri...
97    ['4. The applicants were born in 1955, 1953 an...
98    ['6. The applicants were born in 1964 and 1963...
99    ['5. The applicant was born in 1941 and reside...
Name: TEXT, Length: 100, dtype: object

In [11]:
# change name TEXT to text and LABEL to label
df_train_sub.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)
df_dev_sub.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)
df_test_sub.rename(columns={'TEXT': 'text', 'LABEL': 'label'}, inplace=True)

In [12]:
# check if 0/1 class is balanced
df_train_sub['label'].value_counts()

label
0    50
1    50
Name: count, dtype: int64

In [13]:
df_test_sub['label'].value_counts()

label
1    50
0    50
Name: count, dtype: int64

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [15]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [16]:
# df_train from pandas dataframe to huggingface dataset format
from datasets import Dataset 
train_dataset = Dataset.from_pandas(df_train_sub)
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})

In [17]:
train_dataset_tokenized = train_dataset.map(preprocess_function, batched= True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [18]:
print(train_dataset_tokenized)
print(train_dataset_tokenized[0]['input_ids'].__len__())

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})
512


In [19]:
# do the same for df_dev
dev_dataset = Dataset.from_pandas(df_dev_sub)
dev_dataset_tokenized = dev_dataset.map(preprocess_function, batched=True)
dev_dataset_tokenized


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [20]:
# batch
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
# evaluate 
import evaluate 
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [22]:
# labels 
id2label = {0: 'negative', 1: 'positive'}
label2id = {'negative': 0, 'positive': 1}

In [23]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=2,
    id2label=id2label,
    label2id=label2id
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='bert_echr',          # output directory
    learning_rate=2e-5,              # learning rate
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='bert_echr/logs',    # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset_tokenized,         # training dataset
    eval_dataset=dev_dataset_tokenized,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [50]:
def head_tail_tokens(dataset, head:int, tail:int):
    
    # from dataset transformers to dataframe pandas
    df = pd.DataFrame(dataset)

    # slice each input_ids value  only if head + tail < len(input_ids)
    df['input_ids'] = df['input_ids'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)
    # do the same for attention_mask
    df['attention_mask'] = df['attention_mask'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)
    # do the same for token_type_ids
    df['token_type_ids'] = df['token_type_ids'].apply(lambda x: x[:head] + x[-tail:] if len(x) > head + tail else x)
    # do the same for labels

    # convert back to dataset transformers
    dataset = Dataset.from_pandas(df)

    return dataset

In [55]:
prova = head_tail_tokens(train_dataset_tokenized, 250,250)
print(prova[1]['input_ids'].__len__())

294


In [60]:
print((prova[1]['token_type_ids']))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
trainer.train()

In [None]:
trainer.save_model("bert-echr-classification")