In [22]:
from transformers import BertConfig, BertModel, AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [2]:
torch.cuda.is_available()

True

In [3]:
tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

In [4]:
# Loading dataset
dataSetPath = "2011-2018-3.csv"
df = pd.read_csv(dataSetPath, sep="␞", nrows=1000)
# df = pd.read_csv(dataSetPath, sep="␞", nrows=10000)


print(df['text'].isna().sum())
df = df.dropna(subset=['text'])
print(len(df))

print(df.columns)
df = df.drop(['id', 'url', 'headline', 'datePublished'], axis=1) 

def multiLabelToSingular(labelArray):
    cleaned_text = labelArray.replace("'", "").replace(" ", "").replace("[", "").replace("]", "")
    return cleaned_text.split(",")[0]

print(df.dtypes)
df['subjects'] = df['subjects'].apply(multiLabelToSingular)
print(df['subjects'].nunique())
print(df['subjects'].unique().tolist())

df.rename({'subjects': 'labels'}, axis="columns", inplace=True)

le = preprocessing.LabelEncoder()
le.fit(df.labels)
df['labels'] = le.transform(df.labels)

1
999
Index(['id', 'url', 'headline', 'text', 'subjects', 'datePublished'], dtype='object')
text        object
subjects    object
dtype: object
11
['urheilu', 'ulkomaat', 'kotimaa', 'kulttuuri', 'talous', 'luonto', 'politiikka', 'sää', 'kolumnit', 'tiede', 'oppiminen']


  df = pd.read_csv(dataSetPath, sep="␞", nrows=1000)


In [5]:
dataset = Dataset.from_pandas(df, preserve_index=False)
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 999
})

In [6]:

train_test_split = dataset.train_test_split(test_size=0.2)

train_valid_split = train_test_split['train'].train_test_split(test_size=0.1)

datasets = DatasetDict({
    'train': train_valid_split['train'],
    'validation': train_valid_split['test'],
    'test': train_test_split['test']
})
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 719
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 80
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
})


In [7]:
unique_values = set(datasets["train"]["labels"])
print(unique_values)


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}


In [8]:
def tokenize_function(example):
    tokenized = tokenizer(text_target=example["text"], truncation=True)
    # tokenized["labels"] = [str2int[label] for label in tokenized["labels"]]
    return tokenized

In [9]:

tokenized_datasets = datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_datasets

Map:   0%|          | 0/719 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 719
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 80
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [10]:
training_args = TrainingArguments("test")
model = AutoModelForSequenceClassification.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1", num_labels=11, device_map = 'cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [12]:
trainer.train()

  0%|          | 0/270 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'train_runtime': 607.6596, 'train_samples_per_second': 3.55, 'train_steps_per_second': 0.444, 'train_loss': 0.4707919368037471, 'epoch': 3.0}


TrainOutput(global_step=270, training_loss=0.4707919368037471, metrics={'train_runtime': 607.6596, 'train_samples_per_second': 3.55, 'train_steps_per_second': 0.444, 'train_loss': 0.4707919368037471, 'epoch': 3.0})

In [13]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

  0%|          | 0/10 [00:00<?, ?it/s]

(80, 11) (80,)


In [16]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

  0%|          | 0/25 [00:00<?, ?it/s]

(200, 11) (200,)


In [None]:
print(predictions.label_ids)
print(tokenized_datasets["test"]["labels"])


In [24]:
# print(predictions.label_ids)

accuracy = accuracy_score(tokenized_datasets["test"]["labels"], predictions.label_ids)

f1 = f1_score(tokenized_datasets["test"]["labels"], predictions.label_ids, average="micro")

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 1.0
F1 Score: 1.0


In [None]:
trainer.evaluate(tokenized_datasets["test"])



  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.4032982289791107,
 'eval_runtime': 19.5051,
 'eval_samples_per_second': 10.254,
 'eval_steps_per_second': 1.282,
 'epoch': 3.0}

In [14]:

preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)