In [1]:
import sys
import logging
import os.path as p
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import print_n_log

from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data.dataloader import DataLoader
from definitions import *
from model_helper_functions import *
from dataset_helper_functions import *
from bi_lstm import BiLSTM
from bert_embedding_model import BertEmbeddingModel
from debates_dataset import DebatesDataset
from early_stopping import EarlyStopping
from optuna.trial import TrialState
from torchvision import transforms
# my transforms
from transforms import *

In [2]:
# Shared notebook state and experiment configuration.

# Filled by load_data(): maps a split name ('dev', 'test', 'train', 'val') to its DataFrame.
data = {}
# Directory the pickled Optuna study is written to after optimization.
# Built with `p` (os.path), the alias this notebook imports explicitly, instead of
# relying on the bare name `os` being leaked by one of the star imports.
optim_path = p.join(EXP_DIR_PATH, 'bi-lstm', 'optimization')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Unworthy/worthy ratio of the sampled training subset; overwritten by get_loaders().
train_uw_ratio = 0
# Fraction of every split that is sampled for the experiments.
dataset_frac = 0.2
# Target share of label == 1 rows inside each sampled subset
# (capped by the number of such rows that actually exist).
worthy_frac = 0.2
# Seed passed as random_state to every DataFrame.sample call.
rs = 22

Function for loading the data splits (dev, test, train, val) into the global `data` dict.

In [3]:
def load_data():
    """Read every dataset split from disk into the module-level ``data`` dict.

    For each split ('dev', 'test', 'train', 'val') the first listed TSV file is
    parsed with pandas and stored as ``data[<split>]``.

    Raises:
        RuntimeError: if the file of a split cannot be read. The message names
            the split and the path; the original exception is chained as the
            cause. Splits loaded before the failure remain in ``data``.
    """
    data_paths = {
        'dev': [
            p.join(PROC_DATA_DIR_PATH, 'dev', 'dev.tsv'),
        ],
        'test': [
            p.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv'),
        ],
        'train': [
            p.join(POLIT_DATA_DIR_PATH, 'train', 'train_combined.tsv'),
        ],
        'val': [
            p.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv'),
        ],
    }

    for dtype, dpaths in data_paths.items():
        # Only the first path of each list is read.
        path = dpaths[0]
        try:
            data[dtype] = pd.read_csv(path, sep='\t', index_col=False)
        except Exception as e:
            # Fail loudly with context instead of print + exit(): exit() would
            # tear down the whole notebook session and hide which file failed.
            raise RuntimeError(
                f"Could not load '{dtype}' data from '{path}': {e}"
            ) from e

Datasets and DataLoaders. The function takes the Optuna trial as input so that it can suggest hyperparameter values (currently the batch size).

In [4]:
def get_loaders(trial):
    """Build train/val/test DataLoaders over a subsample of every loaded split.

    From each DataFrame in the global ``data`` dict, ``dataset_frac`` of the
    rows are drawn: up to ``worthy_frac`` of them from rows with ``label == 1``
    and the remainder from rows with ``label == 0``. The draw is seeded with
    ``rs`` and the combined subset is shuffled.

    Side effect: the global ``train_uw_ratio`` is set to the
    unworthy/worthy row ratio of the sampled training subset.

    Args:
        trial: object whose ``suggest_categorical`` is used to pick the batch
            size (an Optuna trial in this notebook).

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.

    Raises:
        ValueError: if the sampled training subset contains no ``label == 1``
            rows, because the class ratio would then be undefined.
    """
    global train_uw_ratio
    subsets = {}
    for k, df in data.items():

        n_subset = int(len(df) * dataset_frac)

        worthy_df = df.loc[df['label'] == 1]
        # Cannot sample more positives than the split contains.
        n_worthy = min(int(n_subset * worthy_frac), len(worthy_df))
        worthy_df = worthy_df.sample(n=n_worthy, random_state=rs)

        unworthy_df = df.loc[df['label'] == 0].sample(
            n=n_subset - n_worthy,
            random_state=rs
        )
        if k == 'train':
            if len(worthy_df) == 0:
                raise ValueError(
                    "Sampled training subset contains no rows with label == 1; "
                    "cannot compute the unworthy/worthy ratio."
                )
            train_uw_ratio = len(unworthy_df) / len(worthy_df)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        # sample(frac=1.0) -> shuffle
        subsets[k] = pd.concat([worthy_df, unworthy_df]).sample(
            frac=1.0, random_state=rs, ignore_index=True
        )

    # Optional feature pipeline, currently disabled:
    # transform_pipeline = transforms.Compose([
    #     Sum('pos', stopwords='wostop'),
    #     Sum('tag', stopwords='wostop'),
    #     ToBinary(6),
    #     ToTensor()
    # ])
    transform_pipeline = None

    train_dd = DebatesDataset(data=subsets['train'], transform=transform_pipeline)
    val_dd = DebatesDataset(data=subsets['val'], transform=transform_pipeline)
    test_dd = DebatesDataset(data=subsets['test'], transform=transform_pipeline)

    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])

    # NOTE(review): shuffle/drop_last are also enabled for val and test, so the
    # final partial batch of those subsets is never evaluated — confirm this is
    # intended (e.g. required by the model) before relying on the metrics.
    train_loader = DataLoader(train_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dd, batch_size=batch_size, shuffle=True, drop_last=True)

    return train_loader, val_loader, test_loader

Model setup + training loop

In [5]:
def objective(trial):
    """Optuna objective: train a BiLSTM on BERT embeddings, return positive-class recall.

    Builds the loaders via ``get_loaders(trial)``, trains for up to 15 epochs
    with Adam and a ``BCEWithLogitsLoss`` whose ``pos_weight`` is the global
    ``train_uw_ratio``, and evaluates on the validation loader after every
    epoch. ``EarlyStopping`` receives the mean validation loss together with
    the epoch's positive-class recall and may end training early.

    The module-level ``logf_path`` (set in the study cell) is read for logging,
    so it must exist before this function is called.

    Args:
        trial: the Optuna trial; only forwarded to ``get_loaders`` while the
            other ``suggest_*`` calls below are commented out.

    Returns:
        ``early_stopping.acomp_metrics['recall_p']`` after training, or 0.0 if
        that mapping is empty. Presumably this is the recall recorded at the
        best validation loss — confirm in ``EarlyStopping``.
    """
    train_loader, val_loader, _ = get_loaders(trial)

    # hyperparams opt
    # pooling_strategy = trial.suggest_categorical('pooling_strategy', ['last_four', 'last_four_sum', 'second_last'])
    # should_scale_emb = trial.suggest_categorical('should_scale_emb', [True, False])
    pooling_strategy = 'second_last'
    should_scale_emb = False
    embedding_model = BertEmbeddingModel(
        device=device,
        pooling_strat=pooling_strategy,
        # Pass the variable (not a literal) so re-enabling the suggest call
        # above actually takes effect.
        scale=should_scale_emb
    )

    # dropout = trial.suggest_float('dropout', 0.0, 0.5, step=0.01)
    # lstm dropout only works with multiple lstm layers
    # lstm_dropout = trial.suggest_float('lstm_dropout', 0.0, 0.3, step=0.05)
    # hidden_dim = trial.suggest_categorical('hidden_dim', [64, 128, 256])
    dropout = 0.4
    hidden_dim = 128
    model = BiLSTM(
        dropout=dropout,
        hidden_dim=hidden_dim,
        embedding_dim=embedding_model.dim,
        sent_level_feature_dim=0
    ).to(device)

    # lr = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    # opt_weight_decay = trial.suggest_float('optimizer_weigth_decay', 0, 0.1, log=True)
    lr = 0.001
    opt_weight_decay = 0
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=opt_weight_decay)

    # pos_weight = trial.suggest_categorical('pos_weight', [1.0, train_uw_ratio])
    # Weight positives by the class imbalance measured in get_loaders().
    pos_weight = train_uw_ratio
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))

    n_epochs = 15 # 20
    threshold = 0.5  # sigmoid output above this counts as a positive prediction
    early_stopping = EarlyStopping(
        patience=5,
        path=None,
        verbose=False,
        trace_func=print_n_log.run('early_stopping', logf_path, 'DEBUG')
    )

    # training
    for epoch in range(n_epochs):
        losses, val_losses = [], []

        model.train()
        for _ids, sentences, labels, _features in train_loader:
            labels = labels.float().to(device)

            embeddings, lengths = embedding_model(sentences)
            output = model(embeddings, lengths)
            loss = criterion(output, labels)

            loss.backward()
            losses.append(loss.item())

            optimizer.step()
            optimizer.zero_grad()

        model.eval()
        y_pred, y_true = [], []
        with torch.no_grad():
            for _val_ids, val_sentences, val_labels, _val_features in val_loader:
                val_labels = val_labels.float().to(device)

                val_embeddings, val_lengths = embedding_model(val_sentences)
                pred = model(val_embeddings, val_lengths)
                # The loss is computed on raw logits, before the sigmoid below.
                val_loss = criterion(pred, val_labels)
                val_losses.append(val_loss.item())

                pred = torch.sigmoid(pred)

                pred = (pred > threshold).int()
                y_pred.extend(pred.tolist())
                y_true.extend(val_labels.tolist())

#         print('epoch: ', epoch)
#         print('avg train loss: ', sum(losses) / len(losses))
#         print('avg val loss: ', sum(val_losses) / len(val_losses))
#         print(classification_report(y_true, y_pred, digits=6))
        cr = classification_report(y_true, y_pred, digits=6, output_dict=True, zero_division=0)

        # The report only has a '1.0' entry when the positive class occurs in
        # the labels or the predictions; fall back to 0.0 instead of a KeyError.
        epoch_recall_p = cr.get('1.0', {}).get('recall', 0.0)

        val_loss = np.average(val_losses)
        early_stopping(val_loss, model, acomp_metrics={'recall_p': epoch_recall_p})

        if early_stopping.early_stop:
            break

#         trial.report(recall_p, epoch)

#         # Handle pruning based on the intermediate value.
#         if trial.should_prune():
#             raise optuna.exceptions.TrialPruned()
    recall_p = early_stopping.acomp_metrics['recall_p'] if early_stopping.acomp_metrics else 0.0
    return recall_p

In [6]:
# Read all splits into the global `data` dict; get_loaders() needs it filled
# (it looks up the 'train', 'val' and 'test' subsets) before any trial runs.
load_data()

In [None]:
# optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

# needed for GridSampler
# NOTE(review): objective() currently only suggests 'batch_size' (its other
# suggest_* calls are commented out), so the remaining axes presumably just
# multiply the number of grid points without changing the trained model —
# confirm before spending trials on them.
# NOTE(review): 'hidden_dim' and 'pos_weight' list different candidates than
# the commented-out suggest calls in objective(); align them when re-enabling.
search_space = {
    'batch_size': [16, 32, 64],
    'pooling_strategy': ['last_four', 'last_four_sum', 'second_last'],
#     'should_scale_emb': [False, True],
    'dropout': [i/100 for i in range(0, 51, 5)],
    'hidden_dim': [128, 256, 512],
    'optimizer_weigth_decay': [i/10000 for i in range(11)],
    'learning_rate': round_to_first_non_zero([i/100000 for i in range_inc(0, 100000, 1, 10)]),
    'pos_weight': [True, False]
}
# print(search_space)

# Maximize the positive-class recall returned by objective().
study = optuna.create_study(
    study_name=f'bi-lstm_sGrid_pNone_df{dataset_frac}_wf{worthy_frac}',
    sampler=optuna.samplers.GridSampler(search_space),
#     pruner=optuna.pruners.MedianPruner(),
    direction='maximize'
)
# objective() reads this global to route the early-stopping log output.
logf_path = p.join(LOG_DIR_PATH, f'{study.study_name}.log')
study.optimize(objective, n_trials=150)

# Persist the finished study. Uses `p` (os.path), the alias the notebook
# imports explicitly, instead of relying on a bare `os` name from a star import.
study_path = p.join(optim_path, f'{study.study_name}.pkl')
torch.save(study, study_path)

[32m[I 2022-03-13 12:14:41,193][0m A new study created in memory with name: bi-lstm_sGrid_pNone_df0.2_wf0.2[0m
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[32m[I 2022-03-13 12:17:40,286][0m Tria

In [None]:
# Reload the study written to `study_path` by the optimization cell and show
# the hyperparameters of its best trial.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load study
# files you created yourself.
# NOTE(review): newer PyTorch releases presumably default to weights_only=True,
# which would reject a pickled Study object — confirm against the installed version.
loaded_study = torch.load(study_path)

print(loaded_study.best_trial.params)