### Imports

In [1]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from src.data.nordskog_data import get_data
from src.data.preprocessing import DataPreprocessor

### Loading data

In [2]:
train, test = get_data()
train.head()

Unnamed: 0,text,label
0,Vålerenga - Rosenborg 2-3,Ignore
1,"Sam Johnson ga vertene ledelsen, men Jonathan ...",Goal/Assist
2,På et hjørnespark langt på overtid kom avgjøre...,Goal/Assist
3,Ti minutter før pause scoret Sam Johnson sitt ...,Goal/Assist
4,Vålerenga holdt 1-0-ledelsen bare frem til sis...,Goal/Assist


In [3]:
train['label'].value_counts()

Goal/Assist       1117
quote              975
Transfer           887
irrelevant         812
Ignore             663
Player details     340
Club details       315
sjanse             300
Injuries            59
Rodt/gult kort      50
Club drama           5
Personal drama       3
Name: label, dtype: int64

### Preprocessing

In [4]:
preprocessor_train = DataPreprocessor(train)
preprocessor_train.map_nordskog_data(numeric=False)
preprocessor_train.limit_number_of_targets_to_5_and_merge(numeric=False)
preprocessor_train.remove_extra_spaces_from_text()
preprocessor_train.remove_paragraphs_over_65_words()
preprocessed_training_data = preprocessor_train.data.copy()
preprocessed_training_data.head()

Unnamed: 0,text,label
0,Vålerenga - Rosenborg 2-3,Ignore
1,"Sam Johnson ga vertene ledelsen, men Jonathan ...",Goal/Assist
2,På et hjørnespark langt på overtid kom avgjøre...,Goal/Assist
3,Ti minutter før pause scoret Sam Johnson sitt ...,Goal/Assist
4,Vålerenga holdt 1-0-ledelsen bare frem til sis...,Goal/Assist


In [5]:
preprocessed_training_data['label'].value_counts()

Goal/Assist    1402
Ignore         1316
Irrelevant      923
Quote           900
Transfer        871
Name: label, dtype: int64

In [6]:
train_texts, validation_texts, train_labels, validation_labels = train_test_split(preprocessed_training_data['text'],
                                                                                  preprocessed_training_data['label'],
                                                                                  test_size=0.2)

### Modelling

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large", model_max_lenght=512)

In [8]:
train_encodings = tokenizer(train_texts.values.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(validation_texts.values.tolist(), truncation=True, padding=True, max_length=512)

In [9]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    validation_labels
))
train_dataset

<TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(95,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(95,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(95,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.string, name=None))>

In [10]:
from transformers import TFBertForSequenceClassification

model = TFBertForSequenceClassification.from_pretrained('NbAiLab/nb-bert-large', num_labels = 5)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08, clipnorm=1.0)
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]) # can also use any keras loss fn
history = model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16, validation_data=val_dataset.batch(16))

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


RuntimeError: pybind11::error_already_set: MISMATCH of original and normalized active exception types: ORIGINAL UnimplementedError REPLACED BY KeyboardInterrupt: <EMPTY MESSAGE>

At:
  C:\Users\Eirik\anaconda3\lib\site-packages\tensorflow\python\framework\errors_impl.py(445): __init__
  C:\Users\Eirik\anaconda3\lib\site-packages\tensorflow\python\eager\execute.py(52): quick_execute
  C:\Users\Eirik\anaconda3\lib\site-packages\tensorflow\python\eager\polymorphic_function\monomorphic_function.py(378): call
  C:\Users\Eirik\anaconda3\lib\site-packages\tensorflow\python\eager\polymorphic_function\monomorphic_function.py(1745): _call_flat
  C:\Users\Eirik\anaconda3\lib\site-packages\tensorflow\python\eager\polymorphic_function\tracing_compiler.py(134): __call__
  C:\Users\Eirik\anaconda3\lib\site-packages\tensorflow\python\eager\polymorphic_function\polymorphic_function.py(945): _call
  C:\Users\Eirik\anaconda3\lib\site-packages\tensorflow\python\eager\polymorphic_function\polymorphic_function.py(880): __call__
  C:\Users\Eirik\anaconda3\lib\site-packages\tensorflow\python\util\traceback_utils.py(150): error_handler
  C:\Users\Eirik\anaconda3\lib\site-packages\keras\engine\training.py(1650): fit
  C:\Users\Eirik\anaconda3\lib\site-packages\keras\utils\traceback_utils.py(65): error_handler
  C:\Users\Eirik\AppData\Local\Temp/ipykernel_24820/3038220380.py(8): <module>
  C:\Users\Eirik\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(3444): run_code
  C:\Users\Eirik\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(3364): run_ast_nodes
  C:\Users\Eirik\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(3172): run_cell_async
  C:\Users\Eirik\anaconda3\lib\site-packages\IPython\core\async_helpers.py(68): _pseudo_sync_runner
  C:\Users\Eirik\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(2947): _run_cell
  C:\Users\Eirik\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(2901): run_cell
  C:\Users\Eirik\anaconda3\lib\site-packages\ipykernel\zmqshell.py(533): run_cell
  C:\Users\Eirik\anaconda3\lib\site-packages\ipykernel\ipkernel.py(353): do_execute
  C:\Users\Eirik\anaconda3\lib\site-packages\ipykernel\kernelbase.py(648): execute_request
  C:\Users\Eirik\anaconda3\lib\site-packages\ipykernel\kernelbase.py(353): dispatch_shell
  C:\Users\Eirik\anaconda3\lib\site-packages\ipykernel\kernelbase.py(446): process_one
  C:\Users\Eirik\anaconda3\lib\site-packages\ipykernel\kernelbase.py(457): dispatch_queue
  C:\Users\Eirik\anaconda3\lib\asyncio\events.py(80): _run
  C:\Users\Eirik\anaconda3\lib\asyncio\base_events.py(1890): _run_once
  C:\Users\Eirik\anaconda3\lib\asyncio\base_events.py(596): run_forever
  C:\Users\Eirik\anaconda3\lib\site-packages\tornado\platform\asyncio.py(199): start
  C:\Users\Eirik\anaconda3\lib\site-packages\ipykernel\kernelapp.py(677): start
  C:\Users\Eirik\anaconda3\lib\site-packages\traitlets\config\application.py(846): launch_instance
  C:\Users\Eirik\anaconda3\lib\site-packages\ipykernel_launcher.py(16): <module>
  C:\Users\Eirik\anaconda3\lib\runpy.py(87): _run_code
  C:\Users\Eirik\anaconda3\lib\runpy.py(197): _run_module_as_main


In [None]:
import matplotlib.pyplot as plt
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()