# GPU, installing packages and login to WANDB

In [None]:
# Initialize GPU
!nvidia-smi

Mon Nov 28 10:16:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Installing required packages

In [None]:
!pip install -q transformers transformers-interpret datasets evaluate wandb tensorflow spacy spacy_langdetect

[K     |████████████████████████████████| 5.5 MB 4.9 MB/s 
[K     |████████████████████████████████| 45 kB 4.3 MB/s 
[K     |████████████████████████████████| 451 kB 89.4 MB/s 
[K     |████████████████████████████████| 72 kB 1.7 MB/s 
[K     |████████████████████████████████| 1.9 MB 62.5 MB/s 
[K     |████████████████████████████████| 182 kB 84.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 52.3 MB/s 
[K     |████████████████████████████████| 1.4 MB 73.2 MB/s 
[K     |████████████████████████████████| 793 kB 84.3 MB/s 
[K     |████████████████████████████████| 1.6 MB 63.2 MB/s 
[K     |████████████████████████████████| 115 kB 86.2 MB/s 
[K     |████████████████████████████████| 212 kB 90.4 MB/s 
[K     |████████████████████████████████| 127 kB 80.2 MB/s 
[K     |████████████████████████████████| 168 kB 89.2 MB/s 
[K     |████████████████████████████████| 182 kB 75.6 MB/s 
[K     |████████████████████████████████| 62 kB 1.4 MB/s 
[K     |██████████████████████

# Importing packages, data and model

In [None]:
from datasets import load_dataset, load_metric #load_dataset will cache the dataset to avoid downloading it again the next time you run this cell.
import datasets as datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import pandas as pd

# Loading data from HuggingFace
dataset_1 = datasets.load_dataset('YOUR_DATA_SET') 

# Loading data from local file
path = "PATH_TO_YOUR_DATASET"
dataset_2 = pd.read_csv(path)

# Loading model and tokenizer
model_name = "THE_MODEL_YOU_WISH_TO_INVESTIGATE" # E.g. "flax-community/roberta-base-danish"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Number of target classes. The export code further down maps label ids 0/1/2 to
# negative/neutral/positive, so the classification head needs three outputs;
# without `num_labels` the head is created with the default of two. Adjust to your data.
num_labels = 3

# "AutoModelForSequenceClassification" automatically adds a freshly initialised
# classification layer on top of the pre-trained model, so we don't need to do that manually.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of the model checkpoint at flax-community/roberta-base-danish were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at flax-community/roberta-base-danish and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight',

# Preprocessing
This step varies from dataset to dataset. Only follow these steps if they fit your type of data. The steps below were applied to a dataset of Danish tweets.

### Removing unwanted words

In [None]:
# Creating function to remove unwanted words
def remove_words(sentence, unwanted_words_list):
    """Return `sentence` without the tokens listed in `unwanted_words_list`.

    The sentence is split on single spaces, every token that appears verbatim
    (case-sensitive) in `unwanted_words_list` is dropped, and the remaining
    tokens are joined back together with single spaces in their original order.
    """
    kept_tokens = filter(
        lambda token: token not in unwanted_words_list,
        sentence.split(" "),
    )
    return " ".join(kept_tokens)

# Creating list of words to remove from the data
stopwords = ["link", "rt", "amp", "@USER", "[LINK]"]

# Strip the unwanted tokens from every text entry. The helper defined in this
# cell is called `remove_words`; the previous call used a name that does not exist.
YOUR_DF.COLUMN_WITH_TEXT = [remove_words(sentence, stopwords) for sentence in YOUR_DF.COLUMN_WITH_TEXT]

### Removing non-Danish sentences

In [None]:
# Switches for the optional cleaning steps. Only `no_eng` is read in this cell;
# `no_below_4` and `no_ttr_below_3` are set here but not used by it.
no_eng = 1
no_below_4 = 1
no_ttr_below_3 = 1

if no_eng == 1:
  import spacy
  from spacy.language import Language
  from spacy_langdetect import LanguageDetector
  import numpy as np
  import pandas as pd

  # Register a factory under the component name "language_detector" so it can be
  # added to the spaCy pipeline by name.
  @Language.factory("language_detector")
  def get_lang_detector(nlp, name):
    return LanguageDetector()

  # loading the language model instance that will be used for language detection
  nlp = spacy.load("en_core_web_sm")
  nlp.add_pipe('language_detector', last=True)

  # Run every text through the pipeline and collect the detector's result.
  # NOTE(review): the results are only collected in `data`; this cell does not
  # itself drop any rows from YOUR_DF.
  data = [nlp(text)._.language for text in YOUR_DF['COLUMN_WITH_TEXT']]

### Split data into test, training and validation

In [None]:
from datasets import concatenate_datasets, dataset_dict, Dataset

#### combine datasets
#dataset_combined = concatenate_datasets([raw_dataset['val'], raw_dataset['train'], raw_dataset['test']])

# Hold out 40% of the rows; the remaining 60% become the training split.
# A fixed seed matters when comparing different models: it guarantees that they
# all see the same training and held-out data.
train_test = raw_dataset.train_test_split(seed=42, test_size=0.4)
# Halve the held-out 40% into 20% validation and 20% test.
test_valid = train_test['test'].train_test_split(seed=42, test_size=0.5)
# Recombine into train 60%, valid 20%, test 20%.
dataset_recombined = datasets.DatasetDict(
    train=train_test['train'],
    valid=test_valid['train'],
    test=test_valid['test'],
)

### Converting dataset into dataset dict
A format that works well with the Transformers API: https://huggingface.co/docs/datasets/v1.1.1/loading_datasets.html

In [None]:
from datasets import dataset_dict, Dataset
# Build a Hugging Face `Dataset` from `df`.
# NOTE(review): `df` is not defined in the visible cells — presumably the
# preprocessed data with 'text' and 'label' columns; confirm against the caller.
raw_dataset = Dataset.from_dict(df)
# Bare expression so the notebook displays the dataset summary.
raw_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 3806
})


### Tokenize data

In [None]:
# Tokenizes the text and translates all labels into integers instead of strings.
def tokenize_function(example):
  """Tokenize `example["text"]` and attach integer labels.

  Texts are padded/truncated to 128 tokens. The labels are converted with
  `labels_cl.str2int` and stored under the 'label' key of the returned encoding.
  NOTE(review): `labels_cl` is not defined in the visible cells — confirm it is
  created before this function is mapped over the data.
  """
  encoded = tokenizer(
      example["text"],
      truncation=True,
      padding="max_length",
      max_length=128,
  )
  encoded['label'] = labels_cl.str2int(example['label'])
  return encoded

# Tokenize every split of the train/valid/test DatasetDict. `raw_dataset` is a
# single `Dataset` without a 'train' split, so the split dictionary
# `dataset_recombined` is what has to be mapped here.
# batched=True speeds up tokenization by processing multiple rows at once;
# the original columns are dropped so only the tokenizer output and 'label' remain.
tokenized_datasets = dataset_recombined.map(tokenize_function, batched=True, remove_columns=dataset_recombined['train'].column_names)
# The training cells read `tokenized_datasets`; the old singular name is kept as an alias.
tokenized_dataset = tokenized_datasets

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Evaluation metrics

In [None]:
import numpy as np
import evaluate

# Metrics loaded so far, keyed by name, so each one is loaded at most once.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: a `(logits, labels)` pair; the predicted class of each row is
            the argmax over the last axis of `logits`.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use `average="weighted"`.

    The metric objects are cached between calls: this function runs at every
    evaluation, and reloading four metrics each time is pure overhead.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
    scores = {"accuracy": accuracy}
    for name in ("precision", "recall", "f1"):
        scores[name] = _get_metric(name).compute(
            predictions=predictions, references=labels, average="weighted"
        )[name]
    return scores

# Define early stopping function
Early stopping stops the fine-tuning process when the validation loss has not improved for n (`early_stopping_patience`) epochs.

In [None]:
# Adjust early_stopping_patience according to your application
# This callback instance is handed to the Trainer objects via `callbacks=[early_stop]`.
early_stop = EarlyStoppingCallback(early_stopping_patience = 8)

# Define hyperparameters 

In [None]:
batch_size = 128 # per-device batch size; used for both training and evaluation in TrainingArguments
epochs = 200 # passed to TrainingArguments as num_train_epochs
learning_rate = 2e-5 # passed to TrainingArguments as learning_rate

# Initialize Weights and Biases (WANDB)

In [None]:
import wandb
wandb.login() # enter your WANDB credentials
wandb.init(project="YOUR_WANDB_PROJECT")

# Stores dropout = 0.2 on the run's config.
# NOTE(review): none of the visible cells read this value or apply it to the
# model — confirm it is meant as run metadata only.
wandb.config.dropout = 0.2

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjorgenhw[0m ([33mbachelor_thesis_cogsci[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Hyperparameter tuning

Framework: [Optuna](https://optuna.org/)

In [None]:
!pip install -q optuna ray[tune]

[K     |████████████████████████████████| 348 kB 4.4 MB/s 
[K     |████████████████████████████████| 59.1 MB 107.1 MB/s 
[K     |████████████████████████████████| 81 kB 12.6 MB/s 
[K     |████████████████████████████████| 209 kB 92.8 MB/s 
[K     |████████████████████████████████| 78 kB 9.4 MB/s 
[K     |████████████████████████████████| 50 kB 7.8 MB/s 
[K     |████████████████████████████████| 147 kB 90.7 MB/s 
[K     |████████████████████████████████| 112 kB 63.0 MB/s 
[K     |████████████████████████████████| 8.8 MB 71.8 MB/s 
[K     |████████████████████████████████| 125 kB 84.1 MB/s 
[K     |████████████████████████████████| 468 kB 62.4 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [None]:
def model_init():
    """Return an independent copy of the globally loaded `model`.

    The Trainer calls this function to obtain the model it trains. Returning the
    very same `model` object every time would let each hyperparameter-search
    trial continue from the weights left behind by the previous trial; a deep
    copy gives every call the same, untouched starting weights.
    """
    import copy  # local import keeps this cell self-contained

    return copy.deepcopy(model) # model = the model you defined earlier

**Training parameters**

In [None]:
# Checkpoints go to a directory that is distinct from `model_name`: writing them
# to a local folder named exactly like the model id would make a later
# `from_pretrained(model_name)` resolve to that folder instead of the Hub model.
training_args = TrainingArguments(output_dir=f"{model_name}-finetuned",
                                  evaluation_strategy = "epoch", # evaluate once per epoch
                                  save_strategy = "epoch", # checkpoint once per epoch (same cadence as evaluation)
                                  num_train_epochs = epochs,
                                  per_device_train_batch_size = batch_size,
                                  per_device_eval_batch_size = batch_size,
                                  learning_rate = learning_rate,
                                  weight_decay=0.01,
                                  load_best_model_at_end=True, # restore the best checkpoint when training ends
                                  report_to="wandb",
                                  save_total_limit = 2) # cap on the number of checkpoints kept on disk

**Passing the information to the trainer**

In [None]:
# Trainer used for the hyperparameter search; `model_init` supplies the model.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Model selection (early stopping, best checkpoint, search objective) is done
    # on the validation split so the test split stays unseen until the final report.
    eval_dataset=tokenized_datasets["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [early_stop]
)

**Define the hyperparameters you wish to optimize, along with the ranges in which you wish to search for optimal values**

Read more here: https://huggingface.co/docs/transformers/hpo_train

In [None]:
def my_hp_space(trial):
    """Define the search space sampled for each hyperparameter-search trial.

    Args:
        trial: the Optuna trial used to draw the values.

    Returns:
        dict with "learning_rate" sampled from [1e-6, 1e-4] and "weight_decay"
        sampled from [1e-4, 1e-2], both on a logarithmic scale.
    """
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and matches how the learning rate is sampled above.
        "weight_decay": trial.suggest_float("weight_decay", 1e-4, 1e-2, log=True),
    }

Read the documentation for Optuna's samplers [here](https://optuna.readthedocs.io/en/stable/reference/samplers/index.html#:~:text=Base-,class,-for%20samplers).

Read the documentation for Optuna's pruners [here](https://optuna.readthedocs.io/en/stable/reference/pruners.html).

In [None]:
import optuna
from optuna.samplers import TPESampler

# Seeded sampler so the sequence of suggested hyperparameters is reproducible.
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)

# The objective is stated explicitly as the evaluation loss. When
# `compute_metrics` is set, the Trainer's default objective is the sum of the
# returned metrics (accuracy + precision + recall + F1); minimising that sum
# would select the worst trial rather than the best one.
best_run = trainer.hyperparameter_search(
    n_trials=10, 
    direction="minimize", 
    hp_space=my_hp_space, 
    compute_objective=lambda metrics: metrics["eval_loss"],
    backend = "optuna",
    sampler = sampler,
    pruner = pruner
    )

In [None]:
# calling best run
best_run

# Initialize fine-tuning
...  with the parameters learned from hyperparameter search

A lot of things are happening in the chunk below. First of all, beware that the chunk will fine-tune the model 10 times since each training is stochastic. We recommend taking the mean predictions of ten runs to get the most accurate estimates of the performance of the model.

In [None]:
# Fine-tune ten times with the best hyperparameters and export, for every run,
# the per-example validation predictions plus validation and test metrics.
import copy

import pandas as pd
import tensorflow as tf


def compute_metrics_end(preds, refs):
    """Score already-argmaxed predictions `preds` against reference label ids `refs`.

    Returns a dict with "accuracy" and the weighted-average "precision",
    "recall" and "f1".
    """
    metric0 = evaluate.load("accuracy")
    metric1 = evaluate.load("precision")
    metric2 = evaluate.load("recall")
    metric3 = evaluate.load("f1")

    accuracy = metric0.compute(predictions=preds, references=refs)["accuracy"]
    precision = metric1.compute(predictions=preds, references=refs, average="weighted")["precision"]
    recall = metric2.compute(predictions=preds, references=refs, average="weighted")["recall"]
    f1 = metric3.compute(predictions=preds, references=refs, average="weighted")["f1"]
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


# Telling the trainer to use the parameters learned from hyperparameter tuning.
# The Trainer keeps a reference to `training_args`, so setting them once here
# applies to every run below.
for n, v in best_run.hyperparameters.items():
    setattr(training_args, n, v)

# Snapshot the starting weights: each run must fine-tune from the same point
# instead of continuing from the weights the previous run ended with.
initial_state_dict = copy.deepcopy(model.state_dict())
# Each run gets its own seed; with identical weights and an identical seed the
# ten runs would just repeat each other.
base_seed = training_args.seed

try:
    for i in range(10):
        model.load_state_dict(initial_state_dict)
        training_args.seed = base_seed + i

        # Defining the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            # Early stopping and best-checkpoint selection use the validation
            # split, so the test metrics reported below stay unbiased.
            eval_dataset=tokenized_datasets["valid"],
            compute_metrics=compute_metrics,
            callbacks = [early_stop])

        trainer.train()

        trainer.evaluate()

        # creating model predictions for the validation data
        predictions_val = trainer.predict(tokenized_datasets["valid"])

        # choosing the prediction that has the highest probability
        preds_val_val = np.argmax(predictions_val.predictions, axis=-1)

        # calculating the probabilities instead of logits from each
        predictions_probabilities = tf.nn.softmax(predictions_val.predictions)

        metrics_val = compute_metrics_end(preds=preds_val_val, refs=predictions_val.label_ids)

        # creating model predictions for the test data
        predictions_test = trainer.predict(tokenized_datasets["test"])

        # choosing the prediction that has the highest probability
        preds_test_test = np.argmax(predictions_test.predictions, axis=-1)

        # calculating the probabilities instead of logits from each
        predictions_probabilities_test = tf.nn.softmax(predictions_test.predictions)

        metrics_test = compute_metrics_end(preds=preds_test_test, refs=predictions_test.label_ids)

        # Per-example report for the validation split. In the 'Misclassification'
        # column "TRUE" marks a correct prediction and "MISS" an incorrect one.
        data = {'Predicted Labels': ["negative" if label_id == 0 else "neutral" if label_id == 1 else "positive" for label_id in preds_val_val],
                'True Labels': ["negative" if label_id == 0 else "neutral" if label_id == 1 else "positive" for label_id in predictions_val.label_ids],
                'Misclassification': ["TRUE" if predicted == expected else 'MISS' for predicted, expected in zip(preds_val_val, predictions_val.label_ids)],
                'Text': dataset_recombined['valid']['text'],
                'Logit Values': [str(row) for row in predictions_val.predictions],
                'Probabilities': [str(row) for row in np.asarray(predictions_probabilities)]}
        df = pd.DataFrame(data)
        df_metrics_val = pd.DataFrame(metrics_val.items())
        df_metrics_test = pd.DataFrame(metrics_test.items())

        # One set of files per run; `i` is the run index.
        df.to_csv(f"/content/drive/MyDrive/BA_data/nbailab/df_classification_report{i}.csv")
        df_metrics_val.to_csv(f"/content/drive/MyDrive/BA_data/nbailab/df_classification_metrics_val{i}.csv")
        df_metrics_test.to_csv(f"/content/drive/MyDrive/BA_data/nbailab/df_classification_metrics_test{i}.csv")
finally:
    # Restore the configured seed so re-running this cell starts from the same seeds.
    training_args.seed = base_seed

wandb.finish()