In [1]:
import logging
import os
import os.path as p
import sys

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import print_n_log

from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data.dataloader import DataLoader
from definitions import *
from model_helper_functions import *
from dataset_helper_functions import *
from bi_lstm import BiLSTM
from feature_nn import FeatureNN
from pair_nn import PairNN
from bert_embedding_model import BertEmbeddingModel
from debates_dataset import DebatesDataset
from early_stopping import EarlyStopping
from optuna.trial import TrialState
from torchvision import transforms
# my transforms
from transforms import *

In [2]:
# Module-level configuration and shared state for the cells below.

# Split name -> DataFrame; filled in by load_data().
data = {}
# Directory where finished Optuna studies are written (built with the `p` alias
# for os.path, like every other path in this notebook).
optim_path = p.join(EXP_DIR_PATH, 'bi-lstm', 'optimization')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Unworthy/worthy row ratio of the sampled train split; overwritten by get_loaders().
train_uw_ratio = 0
# Fraction of every split that is sampled for a trial.
dataset_frac = 0.2
# Target share of label == 1 rows inside each sampled subset.
worthy_frac = 0.03
# Sentence-level feature dimension; overwritten by get_loaders() when a transform pipeline is active.
slf_dim = 0
# stopwords_type = 'wstop'
# Random state used for all DataFrame sampling and for seeding torch below.
rs = 22

# Seed torch's global RNG so DataLoader shuffling (and anything else that draws
# from it) is repeatable after Restart & Run All.
torch.manual_seed(rs)

Function for loading data.

In [3]:
def load_data():
    """Read every dataset split from disk into the module-level ``data`` dict.

    The 'dev', 'test', 'train' and 'val' splits are each read from one
    tab-separated file (``index_col=False``) and stored under their split name.
    Splits that were read before a failure remain in ``data``.

    Raises:
        RuntimeError: if a split cannot be read. The underlying error is chained
            as ``__cause__``. (Previously the function printed the error and
            called ``exit()``, which shuts the notebook kernel down.)
    """
    split_paths = {
        'dev': p.join(PROC_DATA_DIR_PATH, 'dev', 'dev.tsv'),
        'test': p.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv'),
        'train': p.join(POLIT_DATA_DIR_PATH, 'train', 'train_combined.tsv'),
        'val': p.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv'),
    }

    for split, path in split_paths.items():
        try:
            # `data` is only mutated, never rebound, so no `global` statement is needed.
            data[split] = pd.read_csv(path, sep='\t', index_col=False)
        except Exception as e:
            raise RuntimeError(f"Could not load the '{split}' split from {path}") from e



In [4]:
# stopwords_type = 'wostop'
# from_sel = True
# transform_pipeline = transforms.Compose([
#     HandleStopwords(stopwords=stopwords_type),
#     NoTransform('pos', from_selection=from_sel, stopwords=stopwords_type),
#     OneHot('tag', from_selection=from_sel, stopwords=stopwords_type),
#     NoTransform(),
#     ToBinary(6),
#     ToTensor()
# ])
# dd = DebatesDataset(data=filter_by_length(data['train']), transform=transform_pipeline)
# # dd = pad_features(dd)
# # for ids, content, label, feature in dd:
# #     print(content)
# #     print(feature.size())
# dl = DataLoader(dd, batch_size=32, shuffle=True, drop_last=True)

# for ids, sentences, labels, features in dl:
# #     print(features.size())
#     pass
# print('done')

Datasets and DataLoaders. `get_loaders` takes the Optuna trial as input so that it can suggest values for data-related hyperparameters.

In [5]:
def _sample_subset(df, subset_frac, pos_frac, seed):
    """Sample the worthy (label == 1) and unworthy (label == 0) rows of one split.

    The subset size is ``int(len(df) * subset_frac)``; ``int(size * pos_frac)``
    of it is drawn from the worthy rows and the remainder from the unworthy rows.
    Both counts are capped at the number of rows actually available.

    Returns:
        tuple: ``(worthy_df, unworthy_df)``.
    """
    n_subset = int(len(df) * subset_frac)

    worthy_pool = df.loc[df['label'] == 1]
    n_worthy = min(int(n_subset * pos_frac), len(worthy_pool))
    worthy_df = worthy_pool.sample(n=n_worthy, random_state=seed)

    unworthy_pool = df.loc[df['label'] == 0]
    # Cap at the pool size so a large subset_frac cannot make sample() raise.
    n_unworthy = min(n_subset - n_worthy, len(unworthy_pool))
    unworthy_df = unworthy_pool.sample(n=n_unworthy, random_state=seed)

    return worthy_df, unworthy_df


def get_loaders(trial, batch_size, stopwords_type, dep_feature):
    """Build train/val/test DataLoaders over label-controlled subsets of ``data``.

    Side effects: sets the globals ``train_uw_ratio`` (unworthy/worthy ratio of
    the sampled train split) and, when a transform pipeline is active, ``slf_dim``.

    Args:
        trial: Optuna trial. Currently unused; it is accepted so data-related
            hyperparameters can be suggested here.
        batch_size: batch size used for all three loaders.
        stopwords_type: currently unused (only the disabled transform pipeline read it).
        dep_feature: currently unused (only the disabled transform pipeline read it).

    Returns:
        tuple: ``(train_loader, val_loader, test_loader)``.
    """
    global train_uw_ratio, slf_dim

    subsets = {}
    for k, df in data.items():
        worthy_df, unworthy_df = _sample_subset(df, dataset_frac, worthy_frac, rs)

        if k == 'train':
            # Fall back to a neutral weight when no worthy rows were sampled
            # instead of raising ZeroDivisionError.
            train_uw_ratio = len(unworthy_df) / len(worthy_df) if len(worthy_df) else 1.0

        # pd.concat replaces DataFrame.append (removed in pandas 2.0);
        # sample(frac=1.0) shuffles the combined rows.
        subsets[k] = pd.concat([worthy_df, unworthy_df]).sample(
            frac=1.0, random_state=rs, ignore_index=True
        )

    # Sentence-level feature transforms are disabled for this study. Earlier
    # variants composed HandleStopwords, POS/TAG/DEP transforms (Sum / OneHot /
    # NoTransform), CountWords, ToBinary(6) and ToTensor via transforms.Compose,
    # driven by `stopwords_type`, `dep_feature` and trial suggestions.
    # To re-enable features, assign such a pipeline here.
    transform_pipeline = None

    train_dd = DebatesDataset(data=subsets['train'], transform=transform_pipeline)
    val_dd = DebatesDataset(data=subsets['val'], transform=transform_pipeline)
    test_dd = DebatesDataset(data=subsets['test'], transform=transform_pipeline)

    if transform_pipeline is not None:
        # The last item of a sample is taken to be the sentence-level feature
        # vector; its length becomes the model's extra input dimension.
        first_features = train_dd[0][-1]
        slf_dim = first_features.size()[0] if torch.is_tensor(first_features) else 0

    # NOTE(review): val/test loaders also shuffle and drop the last incomplete
    # batch, so a slightly different set of samples is evaluated each epoch.
    # Kept as-is because the training loop may rely on full batches; confirm.
    train_loader = DataLoader(train_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dd, batch_size=batch_size, shuffle=True, drop_last=True)

    return train_loader, val_loader, test_loader

Model setup + training loop

In [6]:
def objective(trial):
    """Optuna objective: train a BiLSTM on BERT embeddings, return positive-class F1.

    Samples batch size, pooling strategy, dropout, hidden size, sequential-layer
    flag, learning rate, weight decay and pos_weight from ``trial``, trains for up
    to 20 epochs with early stopping on the validation loss, and returns the
    'f1_p' value recorded by the EarlyStopping helper.

    Reads the module-level ``logf_path``, ``device``, ``slf_dim`` and
    ``train_uw_ratio`` (the latter two are set by ``get_loaders``).

    Args:
        trial: the Optuna trial supplying hyperparameter suggestions.

    Returns:
        float: ``early_stopping.acomp_metrics['f1_p']``, or 0.0 if no metrics
        were recorded.
    """
    # Feature-related choices are fixed for this study. Earlier variants suggested
    # them via the trial or read them from a `feature_params` dict.
    dep_feature = None
    stopwords_type = None
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])

    # Must run before 'pos_weight' is suggested: get_loaders() sets train_uw_ratio.
    train_loader, val_loader, _test_loader = get_loaders(trial, batch_size, stopwords_type, dep_feature)

    # Hyperparameter search space. To replay a fixed configuration instead,
    # replace these suggestions with lookups in a `params` dict.
    pooling_strategy = trial.suggest_categorical('pooling_strategy', ['last_four', 'last_four_sum', 'second_last'])
    dropout = trial.suggest_float('dropout', 0.0, 0.5, step=0.01)
    hidden_dim = trial.suggest_categorical('hidden_dim', [128, 256, 512])
    w_seq = trial.suggest_categorical('with_sequential_layer', [True, False])
    lr = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    # The misspelled key 'optimizer_weigth_decay' is kept so it matches the key
    # used in previously recorded parameter sets.
    opt_weight_decay = trial.suggest_float('optimizer_weigth_decay', 1e-6, 0.1, log=True)
    # NOTE(review): pos_weight is only consumed by the BCEWithLogitsLoss
    # alternative below; with SmoothL1Loss active it does not affect training.
    # It is still suggested so the search space stays unchanged.
    pos_weight = trial.suggest_categorical('pos_weight', [1.0, train_uw_ratio])

    # Word-level dependency/triplet features are switched off for this study.
    embedding_model = BertEmbeddingModel(
        device=device,
        pooling_strat=pooling_strategy,
        scale=False,
        dep_features=False,
        triplet_features=False
    )

    model = BiLSTM(
        dropout=dropout,
        hidden_dim=hidden_dim,
        embedding_dim=embedding_model.dim,
        sent_level_feature_dim=slf_dim,
        device=device,
        w_seq=w_seq,
    ).to(device)

    # Only the BiLSTM's parameters are optimized; the embedding model is not
    # passed to the optimizer.
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=opt_weight_decay)

    # Alternative: nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))
    criterion = nn.SmoothL1Loss(reduction='sum')

    n_epochs = 20
    threshold = 0.5
    early_stopping = EarlyStopping(
        patience=6,
        path=None,
        verbose=False,
        trace_func=print_n_log.run('early_stopping', logf_path, 'DEBUG')
    )

    for epoch in range(n_epochs):
        # ---- training ----
        model.train()
        for ids, sentences, labels, features in train_loader:
            labels = labels.float().to(device)
            features = features.to(device)

            embeddings, lengths = embedding_model(sentences)
            output = model(embeddings, lengths.cpu(), sent_level_features=features)
            loss = criterion(output, labels)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # ---- validation ----
        model.eval()
        epoch_val_losses = []
        y_pred, y_true = [], []
        with torch.no_grad():
            for val_ids, val_sentences, val_labels, val_features in val_loader:
                val_labels = val_labels.float().to(device)
                val_features = val_features.to(device)

                val_embeddings, val_lengths = embedding_model(val_sentences)
                pred = model(val_embeddings, val_lengths.cpu(), sent_level_features=val_features)
                epoch_val_losses.append(criterion(pred, val_labels).item())

                # NOTE(review): SmoothL1Loss regresses the raw output onto the 0/1
                # labels, yet predictions still go through a sigmoid before
                # thresholding (sigmoid(x) > 0.5 is the same as x > 0). Confirm this
                # decision rule is intended while BCEWithLogitsLoss is disabled.
                pred = (torch.sigmoid(pred) > threshold).int()
                y_pred.extend(pred.tolist())
                y_true.extend(val_labels.tolist())

        # Early stopping must see THIS epoch's mean validation loss. The previous
        # code averaged the per-epoch means of all epochs so far, which hides
        # late degradation behind early history.
        epoch_val_loss = np.average(epoch_val_losses)

        # labels=[0, 1] guarantees a '1' entry even if an epoch contains no
        # positive labels or predictions (zero_division=0 then yields F1 = 0).
        cr = classification_report(
            y_true, y_pred, labels=[0, 1], digits=6, output_dict=True, zero_division=0
        )
        early_stopping(epoch_val_loss, model, acomp_metrics={'f1_p': cr['1']['f1-score']})

        if early_stopping.early_stop:
            break

        # Pruning is disabled for this study. To enable it, call
        # trial.report(<metric>, epoch) here and raise optuna.exceptions.TrialPruned()
        # when trial.should_prune() is true.

    return early_stopping.acomp_metrics['f1_p'] if early_stopping.acomp_metrics else 0.0

In [7]:
# Fill the module-level `data` dict with the dev/test/train/val DataFrames.
load_data()

In [None]:
# optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

# needed for GridSampler
# feature_search_space = {
#     'stopwords_type': ['wstop', 'wostop'],
#     'from_selection': [True, False],
#     'pos_feature_type': ['sum', 'onehot', 'none'],
# #     'tag_feature_type': ['sum', 'onehot', 'none'],
# #     'dep_feature_type': ['sum', 'onehot', 'none'],
#     'word_count_feature_type': ['count_words', 'none'],
# #     'word_level_feature_type': ['dep', 'triplet']
# }
# featOptimSentFeat
# feature_combination_search_space = {
#     'dep_feature': ['word_level', 'sent_level'],
# }
# feature_params = {
#     'stopwords_type': 'wstop',
#     'from_selection': True,
#     'word_count_feature_type': 'count_words'
# }
# featOptimFeatModel
# feature_combination_search_space = {
#     'pos_feature': [True, False],
#     'tag_feature': [True, False]
# }
# feature_params = {
#     'stopwords_type': 'wstop',
#     'from_selection': False,
#     'word_count_feature_type': 'none',
#     # after featOptimFeatModel
#     'pos_feature': 'sum',
#     'tag_feature': 'sum'
# }

# after featOptimSentFeat
# feature_params = {
#     'stopwords_type': 'wostop',
#     'from_selection': True,
#     'dep_feature_type': 'onehot',
#     'word_count_feature_type': 'count_words'
# }


# study_name=f'bi-lstm_featOptim_wAtt_sTPE_pNone_df{dataset_frac}_wf{worthy_frac}'
# params = {'batch_size': 64, 'pooling_strategy': 'second_last', 'dropout': 0.18, 'hidden_dim': 256, 'with_sequential_layer': False, 'learning_rate': 0.00040734193646848514, 'optimizer_weigth_decay': 4.3306219098713085e-05, 'pos_weight': 32.81159420289855}
# params = {'batch_size': 32, 'pooling_strategy': 'last_four_sum', 'dropout': 0.01, 'hidden_dim': 128, 'with_sequential_layer': False, 'learning_rate': 4.8949721500587447e-05, 'optimizer_weigth_decay': 1.0859798578698863e-05, 'pos_weight': 32.81159420289855}
# params = { 'batch_size': 16, 'pooling_strategy': 'last_four', 'dropout': 0.05, 'hidden_dim': 128, 'with_sequential_layer': False, 'learning_rate': 1.0691418898885528e-05, 'optimizer_weigth_decay': 1.0929364050970057e-05, 'pos_weight': 32.81159420289855, 'fnn_hidden_dim': 256, 'fnn_n_hidden_layers': 4, 'fnn_dropout': 0.12, 'pnn_hidden_dim': 8, 'pnn_dropout': 0.24 }
# Create the study; the study name embeds the dataset and worthy fractions so
# the log and pickle files of different setups do not collide.
study = optuna.create_study(
    study_name=f'bi-lstm_NO_FEAT_mF1_wAtt_sTPE_pNone_df{dataset_frac}_wf{worthy_frac}',
    # Seeded so the sequence of suggested hyperparameters is repeatable.
    sampler=optuna.samplers.TPESampler(seed=rs),
#     sampler=optuna.samplers.GridSampler(feature_combination_search_space),
#     pruner=optuna.pruners.MedianPruner(),
    direction='maximize'
)
# objective() reads this module-level path for its early-stopping logger.
logf_path = p.join(LOG_DIR_PATH, f'{study.study_name}.log')
study_path = p.join(optim_path, f'{study.study_name}.pkl')
# Make sure the output directory exists BEFORE the trials run, so a missing
# folder cannot cost the results of the whole search at save time.
os.makedirs(optim_path, exist_ok=True)

study.optimize(objective, n_trials=100)

# Persist the whole study object (pickled via torch.save).
torch.save(study, study_path)
# torch.save(params, f'{os.path.join(optim_path, study.study_name)}_params.pkl')
# torch.save(feature_params, f'{os.path.join(optim_path, study.study_name)}_featureParams.pkl')

In [None]:
# Reload the persisted study and show its best trial: first the winning
# hyperparameters, then the full trial record.
# NOTE: torch.load unpickles the file; only load study files you created yourself.
loaded_study = torch.load(study_path)

print(loaded_study.best_trial.params, loaded_study.best_trial, sep='\n')