<a href="https://colab.research.google.com/github/krfis/rd23/blob/main/RD23_Training_with_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install simpletransformers into the runtime.
# NOTE(review): the install is unpinned, so the version can differ between
# sessions; consider pinning it (and using %pip so the kernel's own
# environment is targeted).
!pip install simpletransformers
# Print the installed simpletransformers version so it is recorded in the output.
!pip freeze | grep simpletransformers

In [None]:
import pandas as pd
import numpy as np
import csv
import os
import glob
import logging
import wandb
import pprint

from sklearn.metrics import confusion_matrix, classification_report, precision_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from psutil import virtual_memory

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Total memory reported by psutil, expressed in decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to tell the two runtime types apart.
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

In [None]:
# Clone the project repository into the current working directory.
# NOTE(review): if ./rd23 already exists (cell re-run in the same session),
# git refuses to clone and the existing checkout is left as it is.
!git clone https://github.com/krfis/rd23.git

In [None]:
def get_files(path, columns=("text", "sentiment", "lang", "num_tokens")):
  """Load every ``*.tsv`` file found directly in a directory.

  Args:
    path: Directory to search (non-recursively) for ``.tsv`` files.
    columns: Names of the columns to keep, in the order they should appear
      in each returned frame. Defaults to the four columns this notebook
      works with.

  Returns:
    A dictionary {name : dataframe}, where ``name`` is the file name without
    its extension. Entries are inserted in sorted file-path order, so the
    result is the same on every run. Empty when no ``.tsv`` file matches.

  Raises:
    KeyError: if a file does not contain all of the requested columns.
  """

  column_order = list(columns)
  dataframes = {}

  # glob yields paths in filesystem-dependent order; sorting makes the
  # dictionary order (and the progress output) reproducible.
  for tsv_path in sorted(glob.glob(os.path.join(path, '*.tsv'))):

    file_name = os.path.splitext(os.path.basename(tsv_path))[0]
    # Select and reorder the columns; anything else in the file is dropped.
    dataframes[file_name] = pd.read_csv(tsv_path, sep="\t")[column_order]

    print(f'Writing content of {file_name}...')

  return dataframes

In [None]:
# Directory that get_files() is pointed at to load the TSV data sets.
# NOTE(review): absolute path; presumably relies on the repository having been
# cloned into /content (the Colab working directory). Confirm before running
# elsewhere.
folder_path = "/content/rd23/final"

In [None]:
# Load every .tsv in folder_path; keys are the file names without extension.
all_dfs = get_files(folder_path)  # dict of all dfs

In [None]:
# Show which data set names were loaded; the next cell indexes all_dfs by these.
all_dfs.keys()  # available data sets

# Data sets

In [None]:
# Bind the loaded frames to notebook-level names. The keys are the TSV file
# names (without extension) that get_files() found.

# Training sets named k<N>_train.
k2_train, k4_train, k8_train, k16_train, k32_train = (
    all_dfs[f"k{k}_train"] for k in (2, 4, 8, 16, 32)
)

# Pool that is later split into few_dev / few_test.
few_test_only = all_dfs["few_test_only"]

# Note that the file names put the split first ("dev_est"), the variables last.
est_dev, est_test = all_dfs["dev_est"], all_dfs["test_est"]

zero_train, evc_balanced = all_dfs["fs_balanced"], all_dfs["evc_balanced"]

In [None]:
# Baseline data: hold out 20% of evc_balanced, then halve that hold-out into
# a dev and a test portion. Both splits use a fixed random_state.

base_train, base_test_all = train_test_split(
    evc_balanced,
    test_size=0.2,
    random_state=1,
)
base_dev, base_test = train_test_split(
    base_test_all,
    test_size=0.5,
    random_state=1,
)

In [None]:
# Zero-shot and few-shot evaluation data: split few_test_only in half, with a
# fixed random_state, into a dev and a test portion.

few_dev, few_test = train_test_split(few_test_only, test_size=0.5, random_state=1)

In [None]:
# Shuffle the row order of every split. A fixed random_state keeps that order
# identical after a kernel restart; an unseeded shuffle would reorder the
# rows differently on every run even though the splits are seeded.
SHUFFLE_SEED = 1  # same value the train_test_split calls in this notebook use

k2_train = shuffle(k2_train, random_state=SHUFFLE_SEED)
k4_train = shuffle(k4_train, random_state=SHUFFLE_SEED)
k8_train = shuffle(k8_train, random_state=SHUFFLE_SEED)
k16_train = shuffle(k16_train, random_state=SHUFFLE_SEED)
k32_train = shuffle(k32_train, random_state=SHUFFLE_SEED)
base_train = shuffle(base_train, random_state=SHUFFLE_SEED)
base_dev = shuffle(base_dev, random_state=SHUFFLE_SEED)
base_test = shuffle(base_test, random_state=SHUFFLE_SEED)
few_dev = shuffle(few_dev, random_state=SHUFFLE_SEED)
few_test = shuffle(few_test, random_state=SHUFFLE_SEED)

# Model

In [None]:
# Grid search: every combination of the listed epoch counts and learning
# rates is tried (3 x 3 configurations).
sweep_config = dict(
    method="grid",
    parameters=dict(
        num_train_epochs=dict(values=[2, 3, 4]),
        learning_rate=dict(values=[5e-5, 3e-5, 2e-5]),
    ),
)

In [None]:
# Register the sweep under the "rd23-final-runs" project; the returned id is
# what the wandb.agent cell further down is given.
sweep_id = wandb.sweep(sweep_config, project="rd23-final-runs")

In [None]:
# Limit the "transformers" logger to warnings and above, and configure the
# root logger at INFO for everything else.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [None]:
# Characters handed to the model as "special_tokens_list" in train_args.
# NOTE(review): several entries are Cyrillic letters that look like Latin ones,
# and one is a bare combining accent; keep the literal byte-for-byte when editing.
diff = ['õ', 'ô', 'ā', '·', 'š', 'е', '§', '|', 'а', '́', 'Š', 'ò', 'о', 'Õ', 'à', 'Ü', '}', '•', 'д', '~', 'ž', '°', '−', '\\', '{', 'и', 'ó', 'á']  # characters not in FinBERT

In [None]:
# Settings shared by every run; only the parameters listed in sweep_config
# vary between sweep trials.

train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    # Evaluation while training.
    evaluate_during_training=True,
    evaluate_during_training_silent=False,
    evaluate_during_training_steps=-1,
    # Checkpointing.
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
    manual_seed=4,
    use_multiprocessing=True,
    multiprocessing_chunksize=5000,
    no_cache=True,
    train_custom_parameters_only=False,
    fp16=False,
    # Batch and sequence sizes.
    train_batch_size=16,
    eval_batch_size=16,
    max_seq_length=128,
    labels_list=["negative", "positive"],
    # Same project name the sweep is registered under.
    wandb_project="rd23-final-runs",
    skip_special_tokens=False,
    # Extra characters collected in `diff`.
    special_tokens_list=diff,
)

In [None]:
# Echo the search space into the notebook output for the record.
print(f'Hyperparameter search: {sweep_config}')

## Training and model selection

In [None]:
def train():
  """Run one sweep trial: fine-tune on k32_train and evaluate on est_dev.

  Written to be passed to ``wandb.agent``. ``wandb.init()`` opens the run, and
  the run's ``wandb.config`` is handed to the model as ``sweep_config``
  alongside the fixed ``train_args``.

  Returns:
    The trained ``ClassificationModel``. Previously nothing was returned, so
    the model could not be reached from notebook scope; callers that ignore
    the return value are unaffected.
  """

  wandb.init()

  model = ClassificationModel(
      "bert", "TurkuNLP/bert-base-finnish-cased-v1",
      num_labels=2,
      args=train_args,
      use_cuda=True,
      sweep_config=wandb.config,
  )

  # dev_accuracy is an extra metric computed on est_dev; predictions are
  # rounded to integers before being compared with the true labels.
  model.train_model(
      k32_train,
      eval_df=est_dev,
      dev_accuracy=lambda truth, predictions: accuracy_score(
          truth, [round(p) for p in predictions]),
  )

  # wandb.join() is a deprecated alias; wandb.finish() is the supported call
  # for closing the run.
  wandb.finish()

  return model

In [None]:
# Start an agent on the registered sweep; it calls train() for the runs it is given.
wandb.agent(sweep_id, function=train)  # train and sync with wandb

## Testing

In [None]:
# `model` is only ever bound inside train(), so on a fresh kernel this cell
# would stop with a NameError. When no notebook-level model exists, reload the
# checkpoint that simpletransformers writes during training.
# NOTE(review): "outputs/best_model" is the library's default best_model_dir.
# Because overwrite_output_dir is True every sweep run reuses that directory,
# so this is presumably the best checkpoint of the most recent run. Confirm
# that this is the configuration you intend to report.
if "model" not in globals():
  model = ClassificationModel(
      "bert", "outputs/best_model",
      args=train_args,
      use_cuda=True,
  )


def _on_rounded(metric):
  """Wrap a metric so predictions are rounded to integers before scoring."""
  return lambda truth, predictions: metric(truth, [round(p) for p in predictions])


# Evaluate on the held-out test set with accuracy, F1 and precision reported
# as extra metrics next to the library's defaults.
result, model_outputs, wrong_predictions = model.eval_model(
    est_test,
    verbose=True,
    test_accuracy=_on_rounded(accuracy_score),
    f1_score=_on_rounded(f1_score),
    precision_score=_on_rounded(precision_score),
)