### Imports

In [1]:
from src.data.nordskog_data import get_data
from src.data.preprocessing import DataPreprocessor
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFBertForSequenceClassification
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import matplotlib.pyplot as plt
import tensorflow as tf
from datasets import Dataset, DatasetDict
import ray.data
from ray.data.preprocessors import BatchMapper
import pandas as pd

In [2]:
from huggingface_hub import notebook_login

notebook_login()
     

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Loading data

In [3]:
train, test = get_data()
train.head()

Unnamed: 0,text,label
0,Vålerenga - Rosenborg 2-3,Ignore
1,"Sam Johnson ga vertene ledelsen, men Jonathan ...",Goal/Assist
2,På et hjørnespark langt på overtid kom avgjøre...,Goal/Assist
3,Ti minutter før pause scoret Sam Johnson sitt ...,Goal/Assist
4,Vålerenga holdt 1-0-ledelsen bare frem til sis...,Goal/Assist


In [4]:
train['label'].value_counts()

Goal/Assist       1117
quote              975
Transfer           887
irrelevant         812
Ignore             663
Player details     340
Club details       315
sjanse             300
Injuries            59
Rodt/gult kort      50
Club drama           5
Personal drama       3
Name: label, dtype: int64

### Preprocessing

In [5]:
preprocessor_train = DataPreprocessor(train)
preprocessor_train.map_nordskog_data(numeric=True)
preprocessor_train.limit_number_of_targets_to_5_and_merge(numeric=True)
preprocessor_train.remove_extra_spaces_from_text()
preprocessor_train.remove_paragraphs_over_65_words()
preprocessed_training_data = preprocessor_train.data.copy()
preprocessed_training_data.head()

Unnamed: 0,text,label
0,Vålerenga - Rosenborg 2-3,4
1,"Sam Johnson ga vertene ledelsen, men Jonathan ...",0
2,På et hjørnespark langt på overtid kom avgjøre...,0
3,Ti minutter før pause scoret Sam Johnson sitt ...,0
4,Vålerenga holdt 1-0-ledelsen bare frem til sis...,0


In [6]:
preprocessed_training_data['label'].value_counts()

0    1402
4    1316
3     923
1     900
2     871
Name: label, dtype: int64

In [7]:
train_texts, validation_texts, train_labels, validation_labels = train_test_split(preprocessed_training_data['text'],
                                                                                  preprocessed_training_data['label'],
                                                                                  test_size=0.2)

In [8]:
train_split, validation_split = train_test_split(preprocessed_training_data, test_size=0.2)

In [9]:
train_dataset = Dataset.from_pandas(train_split)
validation_dataset = Dataset.from_pandas(validation_split)
split_dict = {'train': train_dataset, 'validation': validation_dataset}
datasets = DatasetDict(split_dict)
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4329
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1083
    })
})

In [None]:
ray_datasets = ray.data.from_huggingface(datasets)
ray_datasets

### Modelling

In [None]:
model_checkpoint = 'NbAiLab/nb-bert-large'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True, model_max_length=512)

In [None]:
def preprocess_function(examples):

    return tokenizer(examples['text'], truncation=True)

encoded_dataset = datasets.map(preprocess_function, batched=True)
encoded_dataset

In [None]:
num_labels = 5
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [14]:
metric_name = "accuracy"
model_name = model_checkpoint.split("/")[-1]
task = 'text_classification'
batch_size = 16

args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
)

In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

In [16]:
validation_key = "validation"
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

OSError: Tried to clone https://huggingface.co/hellund/nb-bert-large-finetuned-text_classification in an unrelated git repository.
If you believe this is an error, please add a remote with the following URL: https://huggingface.co/hellund/nb-bert-large-finetuned-text_classification.
Local path has its origin defined as: https://huggingface.co/hellun/nb-bert-large-finetuned-text_classification


In [17]:
!--overwrite_output_dir

'--overwrite_output_dir' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
model = TFBertForSequenceClassification.from_pretrained('NbAiLab/nb-bert-large', num_labels = 5)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08, clipnorm=1.0)
METRICS = [
      tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=METRICS)
history = model.fit(train_dataset.shuffle(1000).batch(16), epochs=4,
                    batch_size=16, validation_data=val_dataset.batch(16))

In [None]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()