In [1]:
from transformers import BertConfig, BertModel, AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
torch.cuda.is_available()

True

In [3]:
tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

In [4]:
# Loading dataset
dataSetPath = "2011-2018-3.csv"
df = pd.read_csv(dataSetPath, sep="␞", nrows=200000)
# df = pd.read_csv(dataSetPath, sep="␞", nrows=10000)


print(df['text'].isna().sum())
df = df.dropna(subset=['text'])
print(len(df))

print(df.columns)
df = df.drop(['id', 'url', 'headline', 'datePublished'], axis=1) 

def multiLabelToSingular(labelArray):
    cleaned_text = labelArray.replace("'", "").replace(" ", "").replace("[", "").replace("]", "")
    return cleaned_text.split(",")[0]

print(df.dtypes)
df['subjects'] = df['subjects'].apply(multiLabelToSingular)
print(df['subjects'].nunique())
print(df['subjects'].unique().tolist())

df.rename({'subjects': 'labels'}, axis="columns", inplace=True)



  df = pd.read_csv(dataSetPath, sep="␞", nrows=200000)


507
199493
Index(['id', 'url', 'headline', 'text', 'subjects', 'datePublished'], dtype='object')
text        object
subjects    object
dtype: object
11
['urheilu', 'ulkomaat', 'kotimaa', 'kulttuuri', 'talous', 'luonto', 'politiikka', 'sää', 'kolumnit', 'tiede', 'oppiminen']


In [None]:
# paperin luokat:

# urheilu "18-220090",
# talous "18-204933",
# politiikka "18-220306",
# kulttuuri "18-208149",
# luonto "18-215452",
# 18-218480 - Onnettomuudet
# 18-209306 - Terveys
# 18-217206 - Rikokset
# 18-91232 - Liikenne ja kuljetus
# 18-35286 - Koulutus ja kasvatus

print(df['labels'].value_counts().sort_index(ascending=True))

le = preprocessing.LabelEncoder()
le.fit(df.labels)
df['labels'] = le.transform(df.labels)

kolumnit        218
kotimaa       54337
kulttuuri      9317
luonto         5204
oppiminen       116
politiikka     6871
sää            2790
talous        17154
tiede          1081
ulkomaat      32396
urheilu       70009
Name: labels, dtype: int64


In [6]:
n_sample = 75
df2 = df.groupby('labels').head(n_sample)
df2 = df.groupby('labels').apply(lambda x: x.sample(n=n_sample))

print(len(df2))
print(df2['labels'].value_counts().sort_index(ascending=True))


825
0     75
1     75
2     75
3     75
4     75
5     75
6     75
7     75
8     75
9     75
10    75
Name: labels, dtype: int64


In [7]:
dataset = Dataset.from_pandas(df2, preserve_index=False)
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 825
})

In [8]:

train_test_split = dataset.train_test_split(test_size=0.2)

train_valid_split = train_test_split['train'].train_test_split(test_size=0.1)

datasets = DatasetDict({
    'train': train_valid_split['train'],
    'validation': train_valid_split['test'],
    'test': train_test_split['test']
})
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 594
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 66
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 165
    })
})


In [9]:
unique_values = set(datasets["train"]["labels"])
print(unique_values)


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}


In [None]:
def tokenize_function(example):
    tokenized = tokenizer(example["text"], truncation=True)
    return tokenized

In [11]:

tokenized_datasets = datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_datasets



Map:   0%|          | 0/594 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 594
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 66
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 165
    })
})

In [12]:
print(tokenized_datasets["train"][0]["token_type_ids"])
print(tokenized_datasets["train"][1]["token_type_ids"])




[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": accuracy, "f1": f1}

training_args = TrainingArguments(
    output_dir="test",                    
    evaluation_strategy="steps",        
    eval_steps=50,                      
    logging_dir="./logs",                
    logging_steps=50,                   
    save_steps=500,                      
    per_device_train_batch_size=8,       
    per_device_eval_batch_size=16,       
    num_train_epochs=5,                  
    save_total_limit=2,                  
    load_best_model_at_end=True,         
    metric_for_best_model="f1",          
    greater_is_better=True,              
    report_to="tensorboard",            
)




model = AutoModelForSequenceClassification.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1", num_labels=11, device_map = 'cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [15]:
trainer.train()

  0%|          | 0/375 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.1289596557617188, 'eval_accuracy': 0.6666666666666666, 'eval_f1': 0.6296296296296297, 'eval_runtime': 23.0231, 'eval_samples_per_second': 2.867, 'eval_steps_per_second': 0.217, 'epoch': 0.67}
{'loss': 1.3383, 'learning_rate': 3.6666666666666666e-05, 'epoch': 1.33}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.7367255687713623, 'eval_accuracy': 0.7575757575757576, 'eval_f1': 0.7512972865914043, 'eval_runtime': 19.2805, 'eval_samples_per_second': 3.423, 'eval_steps_per_second': 0.259, 'epoch': 1.33}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.7671732306480408, 'eval_accuracy': 0.7727272727272727, 'eval_f1': 0.7665613352779128, 'eval_runtime': 19.3433, 'eval_samples_per_second': 3.412, 'eval_steps_per_second': 0.258, 'epoch': 2.0}
{'loss': 0.4381, 'learning_rate': 2.3333333333333336e-05, 'epoch': 2.67}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.6833579540252686, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.8274799818917467, 'eval_runtime': 19.1539, 'eval_samples_per_second': 3.446, 'eval_steps_per_second': 0.261, 'epoch': 2.67}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.9282227754592896, 'eval_accuracy': 0.8181818181818182, 'eval_f1': 0.818374925727867, 'eval_runtime': 19.3877, 'eval_samples_per_second': 3.404, 'eval_steps_per_second': 0.258, 'epoch': 3.33}
{'loss': 0.122, 'learning_rate': 1e-05, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.8517866730690002, 'eval_accuracy': 0.803030303030303, 'eval_f1': 0.8050872874402287, 'eval_runtime': 19.1827, 'eval_samples_per_second': 3.441, 'eval_steps_per_second': 0.261, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.8831983208656311, 'eval_accuracy': 0.803030303030303, 'eval_f1': 0.803378810731752, 'eval_runtime': 19.4261, 'eval_samples_per_second': 3.397, 'eval_steps_per_second': 0.257, 'epoch': 4.67}
{'train_runtime': 1108.443, 'train_samples_per_second': 2.679, 'train_steps_per_second': 0.338, 'train_loss': 0.5140714613596599, 'epoch': 5.0}


TrainOutput(global_step=375, training_loss=0.5140714613596599, metrics={'train_runtime': 1108.443, 'train_samples_per_second': 2.679, 'train_steps_per_second': 0.338, 'train_loss': 0.5140714613596599, 'epoch': 5.0})

In [16]:
predictions = trainer.predict(test_dataset=tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)


  0%|          | 0/11 [00:00<?, ?it/s]

(165, 11) (165,)


In [17]:

preds = np.argmax(predictions.predictions, axis=-1)
print(preds)
print(predictions.label_ids)


[ 7  8  9  8  4  7  9  5  3  9 10 10  0  5  0  3  7  5  1  6  3  7  9  5
  2  1  6  6  9  5  7  7 10  7  8  8  9  8  8 10  8  2  2  0  1  8  9 10
  8  0  1  6  0  4 10  2  7  5  2  2  1 10 10  3  6  3  1  3  8 10  8  3
  9  6  9 10  0  2  9 10  3  0  1  2 10  6  5 10  9  1  1  5  7  3 10  2
  2  3  0  3  1  8  2  4  2  8  8  8  8  7  1  6  1  3  1  2  4 10 10 10
  9  8  0  6  6  1  1  1  2  9  1  7  2 10 10 10  2  0  1  2  7  3  4  3
  8  0 10  4  5  1 10  0  3  3  2  1  3  4  2  9  0 10  2  8  1]
[ 9  8  9  8  0  7  9  5  3  9 10 10  0  5  0  3  7  5  1  6  1  7  9  5
  2  5  0  6  7  5  7  1 10  7  8  8  9  8  8 10  8  2  2  0  5  8  9 10
  8  0  5  6  0  4 10  0  1  5  2  2  5 10 10  3  6  3  7  5  3 10  8  3
  9  6  9 10  0  2  5 10  3  0  4  2 10  9  5 10  1  5  2  5  7  1 10  9
  2  6  0  3  7  8  2  4  2  4  8  1  8  7  1  6  5  3  7  2  8 10  1 10
  6  8  0  6  6  1  8  1  2  8  1  7  2 10 10 10  2  3  1  2  5  3  8  3
  8  0 10  4  5  0 10  0  3  3  2  1  3  4  7  9  0 10  2  

In [18]:
# print(predictions.label_ids)

accuracy = accuracy_score(tokenized_datasets["test"]["labels"], preds)

f1 = f1_score(tokenized_datasets["test"]["labels"], preds, average="micro")

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.7575757575757576
F1 Score: 0.7575757575757576


In [19]:
trainer.evaluate(tokenized_datasets["test"])



  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 1.0131111145019531,
 'eval_accuracy': 0.7575757575757576,
 'eval_f1': 0.7599448157878285,
 'eval_runtime': 46.5168,
 'eval_samples_per_second': 3.547,
 'eval_steps_per_second': 0.236,
 'epoch': 5.0}