In [1]:
from transformers import BertConfig, BertModel, AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
torch.cuda.is_available()

True

In [3]:
tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

In [4]:
# Loading dataset
dataSetPath = "yle_data/2011-2018-SubjectsInPaper-1.csv"
df = pd.read_csv(dataSetPath, sep="␞", nrows=200000)


print(df['text'].isna().sum(), "invalid data")
df = df.dropna(subset=['text'])
print(len(df), "valid data")

print(df.columns)
df = df.drop(['id', 'url', 'headline', 'datePublished'], axis=1) 


def multiLabelToSingular(labelArray):
    cleaned_text = labelArray.replace("'", "").replace(" ", "").replace("[", "").replace("]", "")
    return cleaned_text.split(",")[0]


# print(df.dtypes)
df['subjects'] = df['subjects'].apply(multiLabelToSingular)

print(df['subjects'].nunique(), "categories")
print(df['subjects'].unique().tolist())

df.rename({'subjects': 'labels'}, axis="columns", inplace=True)



  df = pd.read_csv(dataSetPath, sep="␞", nrows=200000)


612 invalid data
199388 valid data
Index(['id', 'url', 'headline', 'text', 'subjects', 'datePublished'], dtype='object')
10 categories
['urheilu', 'kulttuuri', 'talous', 'luonto', 'Onnettomuudet', 'politiikka', 'Rikokset', 'Liikenne_ja_kuljetus', 'Koulutus_ja_kasvatus', 'Terveys']


In [5]:
# paperin luokat:

# urheilu "18-220090",
# talous "18-204933",
# politiikka "18-220306",
# kulttuuri "18-208149",
# luonto "18-215452",
# 18-218480 - Onnettomuudet
# 18-209306 - Terveys
# 18-217206 - Rikokset
# 18-91232 - Liikenne ja kuljetus
# 18-35286 - Koulutus ja kasvatus

print(df['labels'].value_counts().sort_index(ascending=True))

le = preprocessing.LabelEncoder()
le.fit(df.labels)
df['labels'] = le.transform(df.labels)

Koulutus_ja_kasvatus    10753
Liikenne_ja_kuljetus    13554
Onnettomuudet           10717
Rikokset                 8166
Terveys                  8600
kulttuuri               13794
luonto                  10994
politiikka              12961
talous                  24427
urheilu                 85422
Name: labels, dtype: int64


In [6]:
n_sample = 75
df_balanced = df.groupby('labels').head(n_sample)
df_balanced = df.groupby('labels').apply(lambda x: x.sample(n=n_sample))

print(len(df_balanced), "documents")
print(df_balanced['labels'].value_counts().sort_index(ascending=True))


750 documents
0    75
1    75
2    75
3    75
4    75
5    75
6    75
7    75
8    75
9    75
Name: labels, dtype: int64


In [7]:
dataset = Dataset.from_pandas(df_balanced, preserve_index=False)
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 750
})

In [8]:

train_test_split = dataset.train_test_split(test_size=0.2)

train_valid_split = train_test_split['train'].train_test_split(test_size=0.1)

datasets = DatasetDict({
    'train': train_valid_split['train'],
    'validation': train_valid_split['test'],
    'test': train_test_split['test']
})
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 540
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 60
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 150
    })
})


In [9]:
unique_values = set(datasets["train"]["labels"])
print(unique_values)


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


In [10]:
def tokenize_function(example):
    tokenized = tokenizer(example["text"], truncation=True)
    return tokenized

In [11]:

tokenized_datasets = datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_datasets



Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 540
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 60
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 150
    })
})

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": accuracy, "f1": f1}

training_args = TrainingArguments(
    output_dir="./logs",                    
    evaluation_strategy="steps",        
    eval_steps=50,                      
    logging_dir="./logs",                
    logging_steps=50,                   
    save_steps=500,                      
    per_device_train_batch_size=8,       
    per_device_eval_batch_size=16,       
    num_train_epochs=5,                  
    save_total_limit=2,                  
    load_best_model_at_end=True,         
    metric_for_best_model="f1",          
    greater_is_better=True,              
    report_to="tensorboard",            
)





In [15]:
model = AutoModelForSequenceClassification.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1", num_labels=11, device_map = 'cuda')


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [16]:
trainer.train()

  0%|          | 0/340 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.5732, 'learning_rate': 4.2647058823529415e-05, 'epoch': 0.74}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.7891043424606323, 'eval_accuracy': 0.7833333333333333, 'eval_f1': 0.7833958633958635, 'eval_runtime': 7.9984, 'eval_samples_per_second': 7.502, 'eval_steps_per_second': 0.5, 'epoch': 0.74}
{'loss': 0.6059, 'learning_rate': 3.529411764705883e-05, 'epoch': 1.47}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6086699962615967, 'eval_accuracy': 0.85, 'eval_f1': 0.846098914845045, 'eval_runtime': 7.7981, 'eval_samples_per_second': 7.694, 'eval_steps_per_second': 0.513, 'epoch': 1.47}
{'loss': 0.3099, 'learning_rate': 2.7941176470588236e-05, 'epoch': 2.21}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.7717382311820984, 'eval_accuracy': 0.7666666666666667, 'eval_f1': 0.7631734006734007, 'eval_runtime': 7.9522, 'eval_samples_per_second': 7.545, 'eval_steps_per_second': 0.503, 'epoch': 2.21}
{'loss': 0.1712, 'learning_rate': 2.058823529411765e-05, 'epoch': 2.94}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.0066031217575073, 'eval_accuracy': 0.7833333333333333, 'eval_f1': 0.7792087542087541, 'eval_runtime': 7.7168, 'eval_samples_per_second': 7.775, 'eval_steps_per_second': 0.518, 'epoch': 2.94}
{'loss': 0.0708, 'learning_rate': 1.323529411764706e-05, 'epoch': 3.68}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6378504037857056, 'eval_accuracy': 0.85, 'eval_f1': 0.8481899192425508, 'eval_runtime': 8.2213, 'eval_samples_per_second': 7.298, 'eval_steps_per_second': 0.487, 'epoch': 3.68}
{'loss': 0.0491, 'learning_rate': 5.882352941176471e-06, 'epoch': 4.41}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.8179947137832642, 'eval_accuracy': 0.85, 'eval_f1': 0.847037037037037, 'eval_runtime': 7.8976, 'eval_samples_per_second': 7.597, 'eval_steps_per_second': 0.506, 'epoch': 4.41}
{'train_runtime': 861.8477, 'train_samples_per_second': 3.133, 'train_steps_per_second': 0.395, 'train_loss': 0.4137456865871654, 'epoch': 5.0}


TrainOutput(global_step=340, training_loss=0.4137456865871654, metrics={'train_runtime': 861.8477, 'train_samples_per_second': 3.133, 'train_steps_per_second': 0.395, 'train_loss': 0.4137456865871654, 'epoch': 5.0})

In [17]:
predictions = trainer.predict(test_dataset=tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)


  0%|          | 0/10 [00:00<?, ?it/s]

(150, 11) (150,)


In [18]:

preds = np.argmax(predictions.predictions, axis=-1)
print(preds)
print(predictions.label_ids)


[4 7 1 1 9 2 4 4 7 1 0 5 2 8 4 3 1 2 9 7 8 7 1 5 0 0 2 4 5 9 1 7 0 6 2 9 8
 6 2 6 0 1 2 6 8 7 3 3 4 6 3 3 2 9 9 8 5 7 7 8 4 8 4 7 7 0 4 8 2 7 4 3 2 2
 7 8 1 5 6 2 9 4 7 2 9 7 8 0 7 9 5 4 6 6 7 0 5 2 9 7 0 8 9 7 9 2 2 2 4 7 3
 6 4 3 3 0 3 5 9 2 4 5 1 6 7 9 7 0 2 9 9 6 2 1 1 0 9 0 6 3 4 4 4 9 0 7 0 2
 6 2]
[4 7 1 1 9 2 4 4 7 1 0 5 2 8 4 3 1 2 9 7 8 7 4 5 0 5 2 4 5 9 1 7 0 6 2 9 5
 6 2 6 0 1 2 6 8 7 3 3 4 6 3 3 2 9 9 3 5 8 7 8 4 8 4 7 8 0 4 8 6 7 4 3 2 2
 7 8 1 5 6 1 9 4 7 2 9 7 8 0 7 9 5 4 6 6 0 0 5 2 9 7 0 8 9 7 0 2 3 2 4 8 6
 6 4 3 3 0 8 5 9 5 4 5 1 6 3 9 7 0 2 9 9 6 2 1 1 0 9 0 6 3 4 4 7 9 0 7 0 2
 6 2]


In [19]:
trainer.evaluate(tokenized_datasets["test"])



  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.5493655204772949,
 'eval_accuracy': 0.8866666666666667,
 'eval_f1': 0.8854765911974888,
 'eval_runtime': 21.5159,
 'eval_samples_per_second': 6.972,
 'eval_steps_per_second': 0.465,
 'epoch': 5.0}

In [111]:
# Loading dataset
test_corpus_data = "test_corpus_data.csv"
df_corpus = pd.read_csv(test_corpus_data, sep="|")
# df_corpus = df_corpus.drop(['text_better_ocr'], axis=1) 

df_corpus.head(3)


Unnamed: 0,text_modernized,text_better_ocr,labels
0,Kuulumiset ulkomailta ovat olleet hyvin vähäis...,Kuulumiset ulkomailta owat olleet hywin wähäi-...,Rikokset
1,"Amerikasta kerrotaan, että New Yorkiin, Amerik...","Amerikasta kerrotaan, että NewYork'iin Amerika...",Liikenne ja kuljetus
2,Kurmijärveltä kirjoitetaan meille joulukuun lo...,Kurmijärweltä kirjoitetaan meille lopulla wii-...,Rikokset


In [112]:
df_corpus.rename({'text_better_ocr': 'text'}, axis="columns", inplace=True)


In [113]:
le.fit(df_corpus.labels)
df_corpus['labels'] = le.transform(df_corpus.labels)
print(df_corpus.labels)


0    1
1    0
2    1
3    2
4    2
5    2
6    2
Name: labels, dtype: int32


In [114]:
df_corpus = Dataset.from_pandas(df_corpus, preserve_index=False)

In [115]:
tokenized_corpus = df_corpus.map(tokenize_function, batched=True)
tokenized_corpus



Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Dataset({
    features: ['text_modernized', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7
})

In [116]:
predictions_corpus = trainer.predict(test_dataset=tokenized_corpus)
preds_corpus = np.argmax(predictions_corpus.predictions, axis=-1)
print(preds_corpus)
print(predictions_corpus.label_ids)

# labelit ovat väärin enkoodattuna korpuksen testidatan kanssa. 2 ja 7 vastaavat toisiaan 

  0%|          | 0/1 [00:00<?, ?it/s]

[0 6 6 7 7 7 7]
[1 0 1 2 2 2 2]


In [49]:
model_save_path = "./t_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)


('./t_model\\tokenizer_config.json',
 './t_model\\special_tokens_map.json',
 './t_model\\vocab.txt',
 './t_model\\added_tokens.json',
 './t_model\\tokenizer.json')