In [16]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, pipeline
import torch
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, RobertaTokenizer
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from nltk.corpus import stopwords

In [17]:
# English stop-word list from NLTK; used below to filter tokens out of the text.
stop_words = stopwords.words('english')

# Held-out test split, read from the project's CSV export.
test_df = pd.read_csv(filepath_or_buffer='../data/csv/test.csv')

In [18]:
# Tokenizer for the base DistilBERT (uncased) vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)

In [19]:
def encode(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping that provides a ``"text"`` entry; it is called via
            ``Dataset.map(encode, batched=True)`` elsewhere in this notebook.

    Returns:
        Whatever ``tokenizer`` returns for that text when called with
        truncation and padding enabled.
    """
    texts = examples["text"]
    tokenizer_options = {"truncation": True, "padding": True}
    return tokenizer(texts, **tokenizer_options)

In [20]:
# Clean the text before tokenisation: lower-case, split on whitespace, drop
# English stop words, re-join with single spaces, then strip digit runs.
# The order of these steps is kept exactly as before so the cleaned text is
# unchanged.
# NOTE: this overwrites test_df['text'], so re-running the cell applies the
# cleaning again to already-cleaned text.
stop_word_set = frozenset(stop_words)  # constant-time membership instead of scanning the list per token

test_df['text'] = (
    test_df['text']
    .str.lower()
    .str.split()
    .apply(lambda tokens: " ".join(tok for tok in tokens if tok not in stop_word_set))
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    .replace(r'\d+', '', regex=True)
)

# Wrap the cleaned frame as a Hugging Face Dataset and tokenize it in batches.
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(encode, batched=True)

Map:   0%|          | 0/2998 [00:00<?, ? examples/s]

In [21]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [22]:
# Load the fine-tuned two-label checkpoint and move it onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "../models/ECHR/distilbert_echr_model/removed_stopwords/accuracy/checkpoint-888",
    num_labels=2,
).to(device)

# Last expression of the cell: display the module structure.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [23]:
# Text-classification pipeline wrapping the loaded model and tokenizer.
pipe = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [24]:
# Classify the cleaned texts directly instead of decoding `input_ids` back to
# strings: the decode/re-tokenize round trip is lossy (special tokens such as
# [UNK] are skipped, and word pieces cut at the truncation boundary can
# re-tokenize differently) and it tokenizes every document twice.
# `truncation=True` is forwarded to the tokenizer so long documents are cut to
# the model's maximum length rather than overflowing it.
results = pipe(list(test_dataset['text']), truncation=True)

In [25]:
# Keep only the predicted label string from each pipeline result.
predictions = [result['label'] for result in results]

In [26]:
# Map the dataset's label values onto the label names used in the report:
# 0 becomes 'NON_VIOLATED', anything else becomes 'VIOLATED'.
true_labels = [
    'VIOLATED' if outcome != 0 else 'NON_VIOLATED'
    for outcome in test_dataset['labels']
]
report = classification_report(y_true=true_labels, y_pred=predictions)

In [27]:
# Print the classification report text (per-class and averaged metrics).
print(report)

              precision    recall  f1-score   support

NON_VIOLATED       0.90      0.61      0.73      1024
    VIOLATED       0.83      0.97      0.89      1974

    accuracy                           0.84      2998
   macro avg       0.86      0.79      0.81      2998
weighted avg       0.85      0.84      0.83      2998



In [28]:
# Display the first test document's text as stored in the dataset (after cleaning).
test_dataset['text'][0]

'. list applicants relevant details applications set appended table. . applicants alleged, particular, receive adequate medical care detention. applicants also raised complaints convention.'