# Model Training

Try out different architecture and use raytune to select best hyperparameters.

### 1. Import Packages and check versions

In [1]:
# import own scripts
import src.classifier as classifier
import src.preprocess_data as prepData
import src.hyperparameter_tuning as hyperTune
#import src.tester as tester

In [2]:
# update own scripts
from importlib import reload
reload(classifier)
reload(prepData)
reload(hyperTune)

<module 'src.hyperparameter_tuning' from 'D:\\Dokumente\\2_Bildung\\2_MSc\\1_Classes\\Y2T2_NLP\\3_assignment\\NLP-Aspect-Term-Polarity-Classification\\src\\hyperparameter_tuning.py'>

In [3]:
# basic stuff
import os
import numpy as np

# data handling
import datasets
from datasets import Dataset, DatasetDict
import pandas as pd

# pre-processing
import nltk
import sentencepiece
import stanza

# modeling
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModel, DataCollatorWithPadding, get_scheduler

# evaluation
import sklearn
from sklearn.metrics import accuracy_score

# hyperparam optimization
from ray import air, tune

# visualize
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

In [4]:
# check versions
print(f"pytorch: {torch.__version__} / allowed: 1.13.1")
print(f"pytorch-lightning: {pl.__version__} / allowed: 1.8.1")
print(f"transformers: {transformers.__version__} / allowed: 4.22.2")
print(f"datasets: {datasets.__version__} / allowed: 2.9.0")
print(f"sentencepiece: {sentencepiece.__version__} / allowed: 0.1.97")
print(f"scikit-learn: {sklearn.__version__} / allowed: 1.2.0")
print(f"numpy: {np.__version__} / allowed: 1.23.5")
print(f"pandas: {pd.__version__} / allowed: 1.5.3")
print(f"nltk: {nltk.__version__} / allowed: 3.8.1")
print(f"stanza: {stanza.__version__} / allowed: 1.4.2")

pytorch: 1.13.1+cu117 / allowed: 1.13.1
pytorch-lightning: 1.8.1 / allowed: 1.8.1
transformers: 4.22.2 / allowed: 4.22.2
datasets: 2.9.0 / allowed: 2.9.0
sentencepiece: 0.1.97 / allowed: 0.1.97
scikit-learn: 1.2.0 / allowed: 1.2.0
numpy: 1.23.5 / allowed: 1.23.5
pandas: 1.5.3 / allowed: 1.5.3
nltk: 3.8.1 / allowed: 3.8.1
stanza: 1.4.2 / allowed: 1.4.2


In [5]:
# where to save trial results to
ray_path = os.path.abspath("")+"\\ray_results\\"
if not os.path.isdir(ray_path):
    os.mkdir(ray_path)

# ignore warnings that show in every raytune run
import warnings
warnings.simplefilter(action = "ignore", category = np.VisibleDeprecationWarning)

### 2. BERT Model -- Hyperparameter Selection

In [6]:
# tunable hyperparameter search space --> search using tune.choice([]), tune.uniform(lower, upper), tune.grid_search([])
config = {
    # basic infos
    "data_path": os.path.abspath(""),
    "max_epochs": 10,
    "batch_size": 20,
    
    # data preprocessing
    "input_enrichment": tune.grid_search(["aspect_sentence", "aspect_target_sentence"]),
    
    # pre-trained language model (transformer)
    "plm_name": "bert-base-cased",
    "plm_freeze": False, # freezing weights doesn't make sense (at least with BERT)

    # classifier (linear layers)
    "cls_channels":   tune.grid_search([[3], [1000, 3], [100, 1000, 3]]),
    "cls_activation": "ReLU", # tune.grid_search(["ReLU", "Sigmoid", "Tanh"])
    "cls_dropout":    tune.grid_search([0, 0.2]),
    
    # optimizer
    "lr": 5e-5, # tune.grid_search([1e-4, 1e-5, 1e-6])
    "wd": 1e-2, # tune.grid_search([1e-2, 1e-3, 1e-4])

    # scheduler
    "lr_s": "cosine", # tune.grid_search(["constant", "linear", "cosine"])
    "warmup": 0, # number of epochs to warm up learning rate
    
    # loss function
    "criterion": "BCE"
}

In [7]:
# how many trials to run (if grid_search utilized, it will run this number per grid_search value)
num_samples = 1

# run experiment
result_grid = hyperTune.run_ray_experiment(
    hyperTune.ray_trainable, config, ray_path, num_samples,
    metric_columns = ["trn_acc", "dev_acc", "training_iteration"],
    parameter_columns = ["input_enrichment", "plm_freeze", "cls_channels", "cls_dropout"]
)

0,1
Current time:,2023-03-17 12:59:04
Running for:,00:21:37.09
Memory:,14.2/31.9 GiB

Trial name,status,loc,input_enrichmen t,plm_freeze,cls_channels,cls_dropout,trn_acc,dev_acc,training_iterat ion
183fb_00000,TERMINATED,127.0.0.1:16940,aspect_sentence,False,[3],0.0,0.981371,0.821809,10
183fb_00001,TERMINATED,127.0.0.1:16940,aspect_sentence,False,"[1000, 3]",0.0,0.973387,0.803191,10
183fb_00002,TERMINATED,127.0.0.1:16940,aspect_sentence,False,"[100, 1000, 3]",0.0,0.956088,0.824468,10
183fb_00003,TERMINATED,127.0.0.1:16940,aspect_sentence,False,[3],0.2,0.973387,0.819149,10
183fb_00004,TERMINATED,127.0.0.1:16940,aspect_sentence,False,"[1000, 3]",0.2,0.969395,0.832447,10
183fb_00005,TERMINATED,127.0.0.1:16940,aspect_sentence,False,"[100, 1000, 3]",0.2,0.939454,0.821809,10
183fb_00006,TERMINATED,127.0.0.1:16940,aspect_tar_e920,False,[3],0.0,0.997339,0.840426,10
183fb_00007,TERMINATED,127.0.0.1:16940,aspect_tar_e920,False,"[1000, 3]",0.0,0.986693,0.829787,10
183fb_00008,TERMINATED,127.0.0.1:16940,aspect_tar_e920,False,"[100, 1000, 3]",0.0,0.978709,0.821809,10
183fb_00009,TERMINATED,127.0.0.1:16940,aspect_tar_e920,False,[3],0.2,0.996008,0.824468,10


2023-03-17 12:37:26,498	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2023-03-17 12:59:04,938	INFO tune.py:798 -- Total run time: 1297.12 seconds (1297.08 seconds for the tuning loop).


In [11]:
# get best score per trial (highest validation accuracy)
N = 12
best_result_df = result_grid.get_dataframe(
    filter_metric="dev_acc", filter_mode="max"
)
best_result_df = best_result_df[["trial_id", "training_iteration",
                                 "config/input_enrichment", "config/plm_name",
                                 "config/cls_channels", "config/cls_dropout",
                                 "trn_loss", "dev_loss", "trn_acc", "dev_acc"]]
best_result_df = best_result_df.sort_values(by=["dev_acc"], ascending = False)

if len(result_grid) > N:
    best_result_df = best_result_df.head(N)

best_result_df

Unnamed: 0,trial_id,training_iteration,config/input_enrichment,config/plm_name,config/cls_channels,config/cls_dropout,trn_loss,dev_loss,trn_acc,dev_acc
7,183fb_00007,2,aspect_target_sentence,bert-base-cased,"[1000, 3]",0.0,0.217705,0.296243,0.872921,0.851064
6,183fb_00006,7,aspect_target_sentence,bert-base-cased,[3],0.0,0.031805,0.447852,0.983367,0.848404
4,183fb_00004,1,aspect_sentence,bert-base-cased,"[1000, 3]",0.2,0.340267,0.265931,0.785096,0.845745
0,183fb_00000,1,aspect_sentence,bert-base-cased,[3],0.0,0.315254,0.266852,0.803061,0.837766
8,183fb_00008,4,aspect_target_sentence,bert-base-cased,"[100, 1000, 3]",0.0,0.145645,0.348905,0.918164,0.837766
9,183fb_00009,1,aspect_target_sentence,bert-base-cased,[3],0.2,0.336174,0.258009,0.789088,0.837766
10,183fb_00010,7,aspect_target_sentence,bert-base-cased,"[1000, 3]",0.2,0.054733,0.444644,0.972056,0.835106
2,183fb_00002,3,aspect_sentence,bert-base-cased,"[100, 1000, 3]",0.0,0.217635,0.30461,0.896208,0.832447
1,183fb_00001,1,aspect_sentence,bert-base-cased,"[1000, 3]",0.0,0.34316,0.283561,0.786427,0.829787
11,183fb_00011,2,aspect_target_sentence,bert-base-cased,"[100, 1000, 3]",0.2,0.303456,0.316796,0.825017,0.829787
