In [1]:
import sys
import logging
import os.path as p
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import print_n_log

from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data.dataloader import DataLoader
from definitions import *
from model_helper_functions import *
from dataset_helper_functions import *
from bi_lstm import BiLSTM
from bert_embedding_model import BertEmbeddingModel
from debates_dataset import DebatesDataset
from early_stopping import EarlyStopping
from optuna.trial import TrialState
from torchvision import transforms
# my transforms
from transforms import *

In [2]:
data = {}
optim_path = os.path.join(EXP_DIR_PATH, 'bi-lstm', 'optimization')
training_path = os.path.join(EXP_DIR_PATH, 'bi-lstm', 'training')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_uw_ratio = 0
slf_dim = 0


In [3]:
studies = [m for m in os.listdir(optim_path) if m.split('_')[-1] != 'params.pkl']
models_directories = []
for m in studies:
    sp = m.split('_')[1:]
    xx = '_'.join([s for s in sp if s not in {'pNone', 'df0.2', 'wf0.2.pkl'}])
    if len(xx):
        models_directories.append(xx)

for c in models_directories:
    try:
        os.mkdir(os.path.join(training_path, c))
    except Exception as e:
        print(e.args)

(17, 'File exists')
(17, 'File exists')
(17, 'File exists')
(17, 'File exists')
(17, 'File exists')


Function for loading data.

In [4]:
def load_data():
    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')

    data_paths = {
        'dev': [
            p.join(dev_path, 'dev.tsv'),
        ],
        'test': [
            p.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv'),
        ],
        'train': [
            p.join(POLIT_DATA_DIR_PATH, 'train', 'train_combined.tsv'),
        ],
        'val': [
            p.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv'),
        ],
    }

    for dtype, dpaths in data_paths.items():
        try:
            data[dtype] = pd.read_csv(dpaths[0], sep='\t', index_col=False)

        except Exception as e:
            print(e.args)
            exit()

Datasets and DataLoaders, takes trial as input to be able to suggest values for variables.

In [5]:
def get_loaders(batch_size, transforms_params=None, stopwords_type=None):
    global train_uw_ratio, slf_dim

    transform_pipeline = None

    if transforms_params:
        transforms_map = {
            'sum': Sum,
            'onehot': OneHot,
            'none': NoTransform
        }
        cw_map = {
            'count_words': CountWords,
            'none': NoTransform
        }

        from_sel = transforms_params['from_selection']
  
        pos_feat = transforms_map[transforms_params['pos_feature_type']]
        pos_feat = pos_feat(
            'pos', from_selection=from_sel, stopwords=stopwords_type
        )

        tag_feat = transforms_map[transforms_params['tag_feature_type']]
        tag_feat = tag_feat(
            'tag', from_selection=from_sel, stopwords=stopwords_type
        )

        cw_feat = cw_map[transforms_params['word_count_feature_type']]
        cw_feat = cw_feat()
        
        transform_pipeline = transforms.Compose([
            HandleStopwords(stopwords=stopwords_type),
            pos_feat,
            tag_feat,
            cw_feat,
            ToBinary(6),
            ToTensor()
        ])

    train = data['train']
    worthy_train = train[train['label'] == 1]
    train_uw_ratio = (len(train) - len(worthy_train)) / len(worthy_train)
    
    train_dd = DebatesDataset(data=data['train'], transform=transform_pipeline)
    val_dd = DebatesDataset(data=data['val'], transform=transform_pipeline)
    test_dd = DebatesDataset(data=data['test'], transform=transform_pipeline)
    
    if transforms_params:
        slf_dim = train_dd[0][-1].size()[0]

    train_loader = DataLoader(train_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dd, batch_size=batch_size, shuffle=True, drop_last=True)

    return train_loader, val_loader, test_loader

Model setup + training loop

In [6]:
def train_model(params, model_checkpoint_path):
    global logf_path
    # this is here so that it can be accessed here and in get_loaders()
    stopwords_type = params['stopwords_type'] if 'stopwords_type' in params else None

    train_loader, val_loader, test_loader = get_loaders(
        params['batch_size'],
        transforms_params=None,
        stopwords_type=stopwords_type
    )

    # best for given trial
    pooling_strategy = params['pooling_strategy']
    dropout = params['dropout']
    hidden_dim = params['hidden_dim']
    w_seq = params['with_sequential_layer']
    lr = params['learning_rate']
    opt_weight_decay = params['optimizer_weigth_decay']
    pos_weight = train_uw_ratio if params['pos_weight'] > 1.0 else 1.0
   
    # TODO: test these as well
    # remove_stopwords = stopwords_type != 'wstop'
    # dep_feat = trial.suggest_categorical('word_level_dep_features', [True, False])
    # triplet_feat = trial.suggest_categorical('word_level_triplet_features', [True, False])
#     word_level_feat = trial.suggest_categorical('word_level_feature_type', ['dep', 'triplet'])
    embedding_model = BertEmbeddingModel(
        device=device,
        pooling_strat=pooling_strategy,
        scale=False,
        dep_features=True,#word_level_feat == 'dep',
        triplet_features=True,#word_level_feat == 'triplet',
        remove_stopwords=True,#stopwords_type == 'wostop'
    )

    model = BiLSTM(
        dropout=dropout,
        hidden_dim=hidden_dim,
        embedding_dim=embedding_model.dim,
        sent_level_feature_dim=slf_dim,
        device=device,
        w_seq=w_seq,
    ).to(device)    

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=opt_weight_decay)

    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))

#     early_stopping = EarlyStopping(
#         patience=10,
#         path=model_checkpoint_path,
#         verbose=False,
#         trace_func=print_n_log.run('early_stopping', logf_path, 'DEBUG')
#     )

    n_epochs = 30
    threshold = 0.5
    val_losses, train_losses = [], []

    for epoch in range(n_epochs):
        epoch_val_losses, epoch_train_losses = [], []

        model.train()
        for ids, sentences, labels, features in train_loader:
            labels = labels.float().to(device)
            features = features.to(device)
            
            embeddings, lengths = embedding_model(sentences)
            output = model(embeddings, lengths.cpu(), sent_level_features=features)
            loss = criterion(output, labels)
            
            loss.backward()
            epoch_train_losses.append(loss.item())

            optimizer.step()
            optimizer.zero_grad()

        model.eval()
        y_pred, y_true = [], []
        with torch.no_grad():
            for val_ids, val_sentences, val_labels, val_features in val_loader:
                val_labels = val_labels.float().to(device)
                val_features = val_features.to(device)
                
                val_embeddings, val_lengths = embedding_model(val_sentences)
                pred = model(val_embeddings, val_lengths.cpu(), sent_level_features=val_features)
                val_loss = criterion(pred, val_labels)
                epoch_val_losses.append(val_loss.item())
                
                pred = torch.sigmoid(pred)
                
                pred = (pred > threshold).int()
                y_pred.extend(pred.tolist())
                y_true.extend(val_labels.tolist())
        
        val_losses.append(np.average(epoch_val_losses))
        train_losses.append(np.average(epoch_train_losses))
        avg_val_loss = np.average(val_losses)
        print(
            'epoch ==> ', epoch,
            ' | avg train loss ==> ', np.average(train_losses),
            ' | avg val loss ==> ', avg_val_loss
        )
        print(classification_report(y_true, y_pred, digits=6))
        # cr = classification_report(y_true, y_pred, digits=6, output_dict=True, zero_division=0)
        
#         early_stopping(
#             val_loss=avg_val_loss,
#             model=model,
#             optimizer=optimizer,
#             train_losses=train_losses,
#             val_losses=val_losses,
#             # acomp_metrics={'recall_p': cr['1.0']['recall']}
#         )
        
#         if early_stopping.early_stop:
#             break

    # recall_p = early_stopping.acomp_metrics['recall_p'] if early_stopping.acomp_metrics else 0.0
    # "Done."
#     return model, 

In [7]:
def evaluate_model(params, load_path):
    stopwords_type = params['stopwords_type'] if 'stopwords_type' in params else None
    train_loader, val_loader, test_loader = get_loaders(
        params['batch_size'],
        transforms_params=None,
        stopwords_type=stopwords_type
    )

    # best for given trial
    pooling_strategy = params['pooling_strategy']
    dropout = params['dropout']
    hidden_dim = params['hidden_dim']
    w_seq = params['with_sequential_layer']
    lr = params['learning_rate']
    opt_weight_decay = params['optimizer_weigth_decay']
    pos_weight = train_uw_ratio if params['pos_weight'] > 1.0 else 1.0
    
    embedding_model = BertEmbeddingModel(
        device=device,
        pooling_strat=pooling_strategy,
        scale=False,
        dep_features=False,#word_level_feat == 'dep',
        triplet_features=False,#word_level_feat == 'triplet',
        remove_stopwords=False,#stopwords_type == 'wostop'
    )
        
    model = BiLSTM(
        dropout=dropout,
        hidden_dim=hidden_dim,
        embedding_dim=embedding_model.dim,
        sent_level_feature_dim=slf_dim,
        device=device,
        w_seq=w_seq,
    ).to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=opt_weight_decay)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))
    
    load_checkpoint(load_path, model, optimizer, device)
    
    threshold = 0.5
    y_pred = []
    y_true = []
    temp = []

    model.eval()
    with torch.no_grad():
        for test_ids, test_sentences, test_labels, test_features in test_loader:           
            test_labels = test_labels.float().to(device)

            embeddings, lengths = embedding_model(test_sentences)
            output = torch.sigmoid(model(embeddings, lengths.cpu()))

            temp.extend(output.tolist())
            output = (output > threshold).int()
            y_pred.extend(output.tolist())
            y_true.extend(test_labels.tolist())

    print('Ranking:')
    temp.sort(reverse=True)
    print(temp[:10])

    print('Classification Report:')
    print(classification_report(y_true, y_pred, digits=4)) #

In [8]:
load_data()

In [9]:
# study_path = os.path.join(optim_path, 'bi-lstm_wAtt_sTPE_pNone_df0.2_wf0.2.pkl')
# params_path = os.path.join(optim_path, 'bi-lstm_featOptim_wAtt_sTPE_pNone_df0.2_wf0.2_params.pkl')
# model_checkpoint_path = os.path.join(training_path, 'wAtt_sTPE')


# for now ignore features 
studies = [s for s in studies if '_params' not in s and 'featOptim' not in s]
is_training = True
for study_name in studies[:1]:
    study = torch.load(os.path.join(optim_path, study_name))

    checkpoint_dir = [d for d in models_directories if d in study_name][0]

    params = study.best_params
    logf_path = p.join(LOG_DIR_PATH, f'training_{study.study_name}.log')
    
    if 'with_sequential_layer' not in params:
        params['with_sequential_layer'] = False
    
    if isinstance(params['pos_weight'], bool):
        params['pos_weight'] = 2.0 if params['pos_weight'] else 1.0
    
    checkpoint_path = os.path.join(training_path, checkpoint_dir)
    if is_training:
        train_model(params, checkpoint_path)
    else:
        evaluate_model(params, checkpoint_path)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ValueError: expected sequence of length 65 at dim 1 (got 47)

In [None]:
# TODO: this is kept here just in case
# optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

# needed for GridSampler
# search_space = {
#     'batch_size': [16, 32, 64],
#     'pooling_strategy': ['last_four', 'last_four_sum', 'second_last'],
# #     'should_scale_emb': [False, True],
#     'dropout': [i/100 for i in range(0, 51, 5)],
#     'hidden_dim': [128, 256, 512],
#     'optimizer_weigth_decay': [i/10000 for i in range(11)],
#     'learning_rate': round_to_first_non_zero([i/100000 for i in range_inc(0, 100000, 1, 10)]),
#     'pos_weight': [1.0, train_uw_ratio]
# }
# feature_search_space = {
#     'stopwords_type': ['wstop', 'wostop'],
#     'from_selection': [True, False],
#     'pos_feature_type': ['sum', 'onehot', 'none'],
#     'tag_feature_type': ['sum', 'onehot', 'none'],
#     'word_count_feature_type': ['count_words', 'none'],
# #     'word_level_feature_type': ['dep', 'triplet']
# }
# # print(search_space)
# params = {
#     'batch_size': 32,
#     'pooling_strategy': 'second_last',
#     'dropout': 0.39,
#     'hidden_dim': 256,
#     'w_seq': True,
#     'lr': 0.004118121,
#     'opt_weight_decay': 0.024460049,
#     'pos_weight': train_uw_ratio,
# }