In [1]:
import sys
import logging
import os.path as p
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import print_n_log

from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data.dataloader import DataLoader
from definitions import *
from model_helper_functions import *
from dataset_helper_functions import *
from bi_lstm import BiLSTM
from bert_embedding_model import BertEmbeddingModel
from debates_dataset import DebatesDataset
from early_stopping import EarlyStopping
from optuna.trial import TrialState
from torchvision import transforms
# my transforms
from transforms import *

In [2]:
data = {}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_uw_ratio = 0
dataset_frac = 0.2
worthy_frac = 0.2
rs = 22

Function for loading data.

In [3]:
def load_data():
    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')

    data_paths = {
        'dev': [
            p.join(dev_path, 'dev.tsv'),
        ],
        'test': [
            p.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv'),
        ],
        'train': [
            p.join(POLIT_DATA_DIR_PATH, 'train', 'train_combined.tsv'),
        ],
        'val': [
            p.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv'),
        ],
    }

    for dtype, dpaths in data_paths.items():
        try:
            data[dtype] = pd.read_csv(dpaths[0], sep='\t', index_col=False)

        except Exception as e:
            print(e.args)
            exit()

Datasets and DataLoaders, takes trial as input to be able to suggest values for variables.

In [4]:
def get_loaders(params):
    global train_uw_ratio
    # dev_df, test_df, train_df, val_df = data.values()
    subsets = {}
    for k, df in data.items():

        n_subset = int(len(df)*dataset_frac)

        worthy_df = df.loc[df['label'] == 1]
        n_worthy = min(int(n_subset*worthy_frac), len(worthy_df))
        worthy_df = worthy_df.sample(n=n_worthy, random_state=rs)

        unworthy_df = df.loc[df['label'] == 0].sample(
            n=n_subset-n_worthy,
            random_state=rs
        )
        if k == 'train':
            train_uw_ratio = len(unworthy_df) / len(worthy_df)
            print('train uw ratio: ', train_uw_ratio)
        # sample(frac=1.0) -> shuffle
        subsets[k] = worthy_df.append(unworthy_df).sample(frac=1.0, random_state=rs, ignore_index=True)
    


    # transform_pipeline = transforms.Compose([
    #     Sum('pos', stopwords='wostop'),
    #     Sum('tag', stopwords='wostop'),
    #     ToBinary(6),
    #     ToTensor()
    # ])
    transform_pipeline = None

    train_dd = DebatesDataset(data=subsets['train'], transform=transform_pipeline)
    val_dd = DebatesDataset(data=subsets['val'], transform=transform_pipeline)
    test_dd = DebatesDataset(data=subsets['test'], transform=transform_pipeline)

#     batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    batch_size = params['batch_size']
    train_loader = DataLoader(train_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dd, batch_size=batch_size, shuffle=True, drop_last=True)

    return train_loader, val_loader, test_loader

In [5]:
def get_default_params():
    return {
        'batch_size': 32,
        'pooling_strategy': 'second_last',
        'dropout': 0.2,
        'hidden_dim': 128,
        'learning_rate': 0.001,
        'optimizer_weigth_decay': 0,
        'pos_weight': None
    }

Model setup + training loop

In [6]:
def objective(params):
    params = params if params else get_default_params()

    train_loader, val_loader, test_loader = get_loaders(params)
    # hyperparams opt
#     pooling_strategy = trial.suggest_categorical('pooling_strategy', ['last_four', 'last_four_sum', 'second_last'])
#     should_scale_emb = trial.suggest_categorical('should_scale_emb', [True, False])
    pooling_strategy = params['pooling_strategy']
    should_scale_emb = False
    embedding_model = BertEmbeddingModel(
        device=device,
        pooling_strat=pooling_strategy,
        scale=should_scale_emb
    )
    
#     dropout = trial.suggest_float('dropout', 0.0, 0.5, step=0.01)
#     lstm_dropout = trial.suggest_float('lstm_dropout', 0.0, 0.3, step=0.05)
#     hidden_dim = trial.suggest_categorical('hidden_dim', [64, 128, 256])
    dropout = params['dropout']
    hidden_dim = params['hidden_dim']
    model = BiLSTM(
        dropout=dropout,
        hidden_dim=hidden_dim,
        embedding_dim=embedding_model.dim,
        sent_level_feature_dim=0
    ).to(device)    

#     lr = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
#     opt_weight_decay = trial.suggest_float('optimizer_weigth_decay', 1e-5, 1e-1, log=True)
    lr = params['learning_rate']
    opt_weight_decay = params['optimizer_weigth_decay']
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=opt_weight_decay)

#     pos_weight = trial.suggest_categorical('pos_weight', [1.0, train_uw_ratio])
    pos_weight = torch.tensor([train_uw_ratio]).to(device) if params['pos_weight'] else None
    criterion = nn.BCEWithLogitsLoss(pos_weight=None) #$torch.tensor([pos_weight]).to(device)
    early_stopping = EarlyStopping(
        patience=5,
        path=None,
        verbose=True,
#         trace_func=print_n_log.run('early_stopping', p.join(LOG_DIR_PATH, 'test.log'), 'DEBUG')
    )
    n_epochs = 20
    threshold = 0.5
    # training
    for epoch in range(n_epochs):
        losses, val_losses = [], []
        model.train()
        for ids, sentences, labels, features in train_loader:
            labels = labels.float().to(device)
            
            embeddings, lengths = embedding_model(sentences)
            output = model(embeddings, lengths)
            loss = criterion(output, labels)
            
            loss.backward()
            losses.append(loss.item())
#             print('train loss: ', losses[-1]/len(train_loader))

            optimizer.step()
            optimizer.zero_grad()

        model.eval()
        y_pred, y_true = [], []
        with torch.no_grad():
            for val_ids, val_sentences, val_labels, val_features in val_loader:
                val_labels = val_labels.float().to(device)
                
                val_embeddings, val_lengths = embedding_model(val_sentences)
                pred = model(val_embeddings, val_lengths)
                val_loss = criterion(pred, val_labels)
                val_losses.append(val_loss.item())
                
#                 print('val loss: ', val_losses[-1]/len(val_loader))
                pred = torch.sigmoid(pred)
                
                pred = (pred > threshold).int()
                y_pred.extend(pred.tolist())
                y_true.extend(val_labels.tolist())
        
        val_loss = np.average(val_losses)
        print('epoch: ', epoch)
        print('avg train loss: ', np.average(losses))
        print('avg val loss: ', val_loss)
        print(classification_report(y_true, y_pred, digits=6))
        cr = classification_report(y_true, y_pred, digits=6, output_dict=True, zero_division=0)
#         recall_p = cr['1.0']['recall']
        
        val_loss = np.average(val_losses)
        early_stopping(val_loss, model, acomp_metrics={'recall_p': cr['1.0']['recall']})
        
        if early_stopping.early_stop:
            print("Early stopping")
            break

#         trial.report(recall_p, epoch)

#         # Handle pruning based on the intermediate value.
#         if trial.should_prune():
#             raise optuna.exceptions.TrialPruned()
    
    "Done."
    print(early_stopping.acomp_metrics)
    return model, embedding_model

In [7]:
load_data()
params={
    'batch_size': 32,
    'pooling_strategy': 'second_last',
    'dropout': 0.09,
    'hidden_dim': 256,
    'learning_rate': 0.0017237971142770967,
    'optimizer_weigth_decay': 0.00030000000000000003,
    'pos_weight': 5.902366863905326
}
model, embedding_model = objective(params)


train uw ratio:  5.902366863905326


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch:  0
avg train loss:  0.3592433946000205
avg val loss:  0.3127411330739657
              precision    recall  f1-score   support

         0.0   0.885027  0.992504  0.935689       667
         1.0   0.750000  0.148515  0.247934       101

    accuracy                       0.881510       768
   macro avg   0.817513  0.570509  0.591811       768
weighted avg   0.867269  0.881510  0.845242       768

Validation loss decreased (inf --> 0.312741).  Saving model ...
epoch:  1
avg train loss:  0.29765696440719897
avg val loss:  0.3362641651183367
              precision    recall  f1-score   support

         0.0   0.916042  0.917417  0.916729       666
         1.0   0.455446  0.450980  0.453202       102

    accuracy                       0.855469       768
   macro avg   0.685744  0.684199  0.684966       768
weighted avg   0.854869  0.855469  0.855167       768

EarlyStopping counter: 1 out of 5
epoch:  2
avg train loss:  0.25671849249758655
avg val loss:  0.3101869026819865
      

In [8]:
# subsets = {}
# for k, df in data.items():
#     print(k, len(df))
#     n_subset = int(len(df)*dataset_frac)
#     print(n_subset)
#     worthy_df = df.loc[df['label'] == 1]
#     n_worthy = min(int(n_subset*worthy_frac), len(worthy_df))
#     print(n_worthy)
#     worthy_df = worthy_df.sample(n=n_worthy, random_state=rs)
#     print(n_subset-n_worthy)
#     unworthy_df = df.loc[df['label'] == 0].sample(
#         n=n_subset-n_worthy,
#         random_state=rs
#     )
#     if k == 'train':
#         train_uw_ratio = len(unworthy_df) / len(worthy_df)
#     # sample(frac=1.0) -> shuffle
#     subsets[k] = worthy_df.append(unworthy_df).sample(frac=1.0, random_state=rs, ignore_index=True)
# # print([(k, len(df)) for k, df in subsets.items()])

In [9]:
# optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
# study = optuna.create_study(
#     study_name=f'bi-lstm_df{dataset_frac}_wf{worthy_frac}',
#     sampler=optuna.samplers.RandomSampler(rs),
#     pruner=optuna.pruners.MedianPruner(),
#     direction='maximize'
# )
# study.optimize(objective, n_trials=100)

# torch.save(study, os.path.join(EXP_DIR_PATH, 'bi-lstm', 'optimization', f'{study.study_name}.pkl'))

In [12]:
train_loader, val_loader, test_loader = get_loaders(params)
threshold = 0.5
y_pred = []
y_true = []
temp = []

model.eval()
with torch.no_grad():
    for test_ids, test_sentences, test_labels, test_features in test_loader:           
        test_labels = test_labels.float().to(device)

        embeddings, lengths = embedding_model(test_sentences)
        output = torch.sigmoid(model(embeddings, lengths))

        temp.extend(output.tolist())
        output = (output > threshold).int()
        y_pred.extend(output.tolist())
        y_true.extend(test_labels.tolist())

print('Ranking:')
temp.sort(reverse=True)
print(temp[:10])

# print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
# print(f'precision: {tp/(tp+fp)}')
# print(f'recall: {tp/(tp+fn)}')

print('Classification Report:')
print(classification_report(y_true, y_pred, digits=4)) # labels=[1,0], 

# cm = confusion_matrix(y_true, y_pred)
# ax = plt.subplot()
# sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt="d")

# ax.set_title('Confusion Matrix')

# ax.set_xlabel('Predicted Labels')
# ax.set_ylabel('True Labels')

# ax.xaxis.set_ticklabels(['worthy', 'unworthy'])
# ax.yaxis.set_ticklabels(['worthy', 'unworthy'])

train uw ratio:  5.902366863905326
Ranking:
[0.9996585845947266, 0.9994238615036011, 0.9988390803337097, 0.9974378347396851, 0.9971519708633423, 0.9971467852592468, 0.9958751797676086, 0.9957247972488403, 0.9949931502342224, 0.994152843952179]
Classification Report:
              precision    recall  f1-score   support

         0.0     0.9286    0.9278    0.9282      1150
         1.0     0.3664    0.3692    0.3678       130

    accuracy                         0.8711      1280
   macro avg     0.6475    0.6485    0.6480      1280
weighted avg     0.8715    0.8711    0.8713      1280



NameError: name 'plt' is not defined