In [1]:
%pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.31.0 (from simpletransformers)
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from simpletransformers)
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers (from simpletransformers)

In [2]:
import os

import numpy as np
import pandas as pd
import sklearn
from google.colab import files, drive
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold

In [3]:
# Upload the CSV files from the local machine into the Colab runtime.
uploaded = files.upload()

# Report the name and byte length of every uploaded file.
for fn, content in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(content)))

Saving article-data-test.csv to article-data-test.csv
Saving article-data-train-and-val.csv to article-data-train-and-val.csv
User uploaded file "article-data-test.csv" with length 243200 bytes
User uploaded file "article-data-train-and-val.csv" with length 2179910 bytes


In [4]:
# Load the labelled article data; the first CSV column is used as the index.
df_train = pd.read_csv('article-data-train-and-val.csv', index_col=0)
df_test = pd.read_csv('article-data-test.csv', index_col=0)

# Hyperparameters for fine-tuning.
BERT_MODEL = "TurkuNLP/bert-base-finnish-cased-v1"
N_EPOCHS = 4
BATCH_SIZE = 8
LEARNING_RATE = 4e-5
SEED = 42  # fixed seed so repeated runs of the notebook are comparable

# Arguments passed to simpletransformers' ClassificationModel.
BERT_ARGS = {
    "output_dir": './results',
    "reprocess_input_data": True,
    "fp16": False,
    "num_train_epochs": N_EPOCHS,
    "overwrite_output_dir": True,
    "evaluate_during_training": False,
    # Train and eval batch sizes both come from the single BATCH_SIZE constant.
    "train_batch_size": BATCH_SIZE,
    "eval_batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "manual_seed": SEED,
}

# Number of distinct class labels present in the training data.
N_LABELS = len(set(df_train.pred_class.values))

In [5]:
# Show the shape of both splits as a quick check on the loaded data.
print(
    f"train sample shape: {df_train.shape}",
    f"test sample shape: {df_test.shape}",
    sep="\n",
)

train sample shape: (349, 2)
test sample shape: (39, 2)


In [6]:
# Fine-tune the pretrained Finnish BERT on the train/validation data.
model = ClassificationModel(
    "bert",
    BERT_MODEL,
    num_labels=N_LABELS,
    args=BERT_ARGS,
)
model.train_model(df_train)

# Evaluate on the test split. accuracy_score is imported explicitly rather
# than reached through `sklearn.metrics`, which a bare `import sklearn` does
# not guarantee to load. The `acc` keyword is the key read back from `result`.
result, model_outputs, wrong_predictions = model.eval_model(df_test, acc=accuracy_score)
print(f"Prediction Accuracy: {result['acc']:.2f}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/424k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/816k [00:00<?, ?B/s]



  0%|          | 0/349 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/44 [00:00<?, ?it/s]



  0%|          | 0/39 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/5 [00:00<?, ?it/s]

Prediction Accuracy: 0.90


The model reaches a prediction accuracy of 90% on the test set.

Next, let's train the final model with all available data.

In [7]:
# Stack the test and train splits into one frame with a fresh 0..n-1 index.
df = pd.concat([df_test, df_train], ignore_index=True)

In [8]:
# Report the shape of the combined (test + train) frame.
print(f"full data shape: {df.shape}")

full data shape: (388, 2)


In [9]:
# The final run reuses the evaluation run's settings; only the output
# directory differs. Deriving from BERT_ARGS keeps the two configurations
# from drifting apart.
BERT_ARGS_FINAL = {**BERT_ARGS, "output_dir": './results-final'}

# Fine-tune a fresh model on the full data set (test + train combined).
model_final = ClassificationModel(
    "bert",
    BERT_MODEL,
    num_labels=N_LABELS,
    args=BERT_ARGS_FINAL,
)
model_final.train_model(df)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/388 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/49 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/49 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/49 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/49 [00:00<?, ?it/s]

(196, 0.3954465544084087)

Finally, let's save the results and the final model from Google Colab to Google Drive.

In [10]:
# Recursively archive the final model's output directory into model.zip.
!zip -r ./model.zip ./results-final

  adding: results-final/ (stored 0%)
  adding: results-final/vocab.txt (deflated 55%)
  adding: results-final/checkpoint-98-epoch-2/ (stored 0%)
  adding: results-final/checkpoint-98-epoch-2/vocab.txt (deflated 55%)
  adding: results-final/checkpoint-98-epoch-2/training_args.bin (deflated 50%)
  adding: results-final/checkpoint-98-epoch-2/special_tokens_map.json (deflated 42%)
  adding: results-final/checkpoint-98-epoch-2/model.safetensors (deflated 7%)
  adding: results-final/checkpoint-98-epoch-2/optimizer.pt (deflated 29%)
  adding: results-final/checkpoint-98-epoch-2/config.json (deflated 51%)
  adding: results-final/checkpoint-98-epoch-2/scheduler.pt (deflated 56%)
  adding: results-final/checkpoint-98-epoch-2/model_args.json (deflated 62%)
  adding: results-final/checkpoint-98-epoch-2/tokenizer_config.json (deflated 76%)
  adding: results-final/checkpoint-98-epoch-2/tokenizer.json (deflated 71%)
  adding: results-final/training_args.bin (deflated 50%)
  adding: results-final/spec

In [11]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Copy the archive into the MyDrive folder of the mounted Drive.
!cp model.zip /content/drive/MyDrive

In [13]:
# Flush and unmount Drive now that the archive has been copied.
drive.flush_and_unmount()