### Imports

In [1]:
from src.data.nordskog_data import get_data
from src.data.preprocessing import DataPreprocessor
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFBertForSequenceClassification
import matplotlib.pyplot as plt
import tensorflow as tf

### Loading data

In [2]:
# Load the dataset via the project helper; it is unpacked into a training
# frame and a test frame. Only `train` is used in the cells shown here.
train, test = get_data()
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,text,label
0,Vålerenga - Rosenborg 2-3,Ignore
1,"Sam Johnson ga vertene ledelsen, men Jonathan ...",Goal/Assist
2,På et hjørnespark langt på overtid kom avgjøre...,Goal/Assist
3,Ti minutter før pause scoret Sam Johnson sitt ...,Goal/Assist
4,Vålerenga holdt 1-0-ledelsen bare frem til sis...,Goal/Assist


In [3]:
# Class distribution of the raw labels, before any label mapping or merging.
train['label'].value_counts()

Goal/Assist       1117
quote              975
Transfer           887
irrelevant         812
Ignore             663
Player details     340
Club details       315
sjanse             300
Injuries            59
Rodt/gult kort      50
Club drama           5
Personal drama       3
Name: label, dtype: int64

### Preprocessing

In [4]:
# Run the training frame through the project's DataPreprocessor. The steps
# below are called in sequence on the same preprocessor object, so their
# order is kept exactly as written.
preprocessor_train = DataPreprocessor(train)
# NOTE(review): numeric=True presumably makes the labels integer-coded
# (the resulting frame shows labels 0-4) - confirm in DataPreprocessor.
preprocessor_train.map_nordskog_data(numeric=True)
preprocessor_train.limit_number_of_targets_to_5_and_merge(numeric=True)
preprocessor_train.remove_extra_spaces_from_text()
# NOTE(review): judging by the name, this drops rows whose text exceeds
# 65 words - confirm the exact rule in DataPreprocessor.
preprocessor_train.remove_paragraphs_over_65_words()
# Work on a copy so later cells do not touch the preprocessor's own frame.
preprocessed_training_data = preprocessor_train.data.copy()
preprocessed_training_data.head()

Unnamed: 0,text,label
0,Vålerenga - Rosenborg 2-3,4
1,"Sam Johnson ga vertene ledelsen, men Jonathan ...",0
2,På et hjørnespark langt på overtid kom avgjøre...,0
3,Ti minutter før pause scoret Sam Johnson sitt ...,0
4,Vålerenga holdt 1-0-ledelsen bare frem til sis...,0


In [5]:
# Class distribution after preprocessing (labels are now the merged classes).
preprocessed_training_data['label'].value_counts()

0    1402
4    1316
3     923
1     900
2     871
Name: label, dtype: int64

In [6]:
# Hold out 20% of the preprocessed training data for validation.
# - random_state pins the shuffle so a Restart & Run All reproduces the same
#   split (and therefore comparable training curves between runs).
# - stratify keeps the class proportions identical in both parts; the label
#   distribution is imbalanced, so an unlucky random split could otherwise
#   skew the validation metrics.
train_texts, validation_texts, train_labels, validation_labels = train_test_split(
    preprocessed_training_data['text'],
    preprocessed_training_data['label'],
    test_size=0.2,
    random_state=42,
    stratify=preprocessed_training_data['label'],
)

### Modelling

In [7]:
# Load the tokenizer that matches the pretrained checkpoint.
# The keyword is `model_max_length`; the previous spelling (`model_max_lenght`)
# is not a recognised tokenizer option, so the intended 512-token limit was
# never set on the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large", model_max_length=512)

In [8]:
# Tokenizer options shared by both splits: truncate anything longer than
# 512 tokens and pad every sequence to the longest one within the same call.
encoding_options = dict(truncation=True, padding=True, max_length=512)

# The tokenizer is fed plain Python lists of strings, one list per split.
train_encodings = tokenizer(train_texts.values.tolist(), **encoding_options)
val_encodings = tokenizer(validation_texts.values.tolist(), **encoding_options)

In [9]:
def _as_tf_dataset(encodings, labels):
    """Pair tokenizer encodings with their labels as a tf.data.Dataset.

    Each element is a ``(features, label)`` tuple where ``features`` is the
    dict form of the tokenizer output.
    """
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))


train_dataset = _as_tf_dataset(train_encodings, train_labels)
val_dataset = _as_tf_dataset(val_encodings, validation_labels)
# Display the element spec of the training dataset as the cell output.
train_dataset

<TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(95,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(95,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(95,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [10]:
# Load the pretrained BERT checkpoint with a sequence-classification head.
# num_labels=5 sizes the head for the five label classes (0-4) produced by
# the preprocessing step.
model = TFBertForSequenceClassification.from_pretrained(
    'NbAiLab/nb-bert-large',
    num_labels=5,
)
# Print the layer/parameter overview.
model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
  7/271 [..............................] - ETA: 2:19:25 - loss: 1.6375 - accuracy: 0.2321

KeyboardInterrupt: 

In [None]:
# Training hyperparameters, named once so the train and validation pipelines
# cannot drift apart.
BATCH_SIZE = 16
EPOCHS = 4

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08, clipnorm=1.0)

# Only metrics that understand sparse integer labels and raw logits are used.
# tf.keras.metrics.Precision / Recall were removed: they expect predictions
# in [0, 1] with labels of the same shape, whereas this model emits
# unnormalised logits of shape (batch, 5) against integer labels of shape
# (batch,). Compute per-class precision/recall after training from
# argmax(model.predict(...)) instead.
METRICS = [
    tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
]

# from_logits=True because the classification head outputs raw logits.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=METRICS)

# The datasets are batched explicitly, so `batch_size` must not also be passed
# to fit(): Keras documents that it should not be specified for dataset input.
history = model.fit(train_dataset.shuffle(1000).batch(BATCH_SIZE),
                    epochs=EPOCHS,
                    validation_data=val_dataset.batch(BATCH_SIZE))

In [None]:
# Plot the training curves recorded by model.fit: one figure for accuracy and
# one for loss, each with the training series and its `val_` counterpart.
# The second series comes from the validation split, so it is labelled
# 'validation' (not 'test' - the held-out test frame is never used in fit).
for metric_name in ('accuracy', 'loss'):
    plt.plot(history.history[metric_name])
    plt.plot(history.history['val_' + metric_name])
    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()