In [1]:
import sys
import logging
import os.path as p
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import print_n_log

from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data.dataloader import DataLoader
from definitions import *
from model_helper_functions import *
from dataset_helper_functions import *
from sent_nn import SentNN

from sentence_transformers import SentenceTransformer

from debates_dataset import DebatesDataset
from early_stopping import EarlyStopping
from optuna.trial import TrialState
from torchvision import transforms
# my transforms
from transforms import *
from scorer.task5 import evaluate_v2

In [2]:
data = {}
optim_path = os.path.join(EXP_DIR_PATH, 'sent-nn', 'optimization')
# bi-lstm
# - no_feat
# - sent_feat
# - word_feat
training_path = os.path.join(EXP_DIR_PATH, 'sent-nn', 'training')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_uw_ratio = 0
slf_dim = 0


In [3]:
# studies = [m for m in os.listdir(optim_path) if m.split('_')[-1] != 'params.pkl']
# models_directories = []
# for m in studies:
#     sp = m.split('_')[1:]
#     xx = '_'.join([s for s in sp if s not in {'pNone', 'df0.2', 'wf0.2.pkl'}])
#     if len(xx):
#         models_directories.append(xx)

# for c in models_directories:
#     try:
#         os.mkdir(os.path.join(training_path, c))
#     except Exception as e:
#         print(e.args)

Function for loading data.

In [4]:
def load_data():
    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')

    data_paths = {
        'dev': [
            p.join(dev_path, 'dev.tsv'),
        ],
        'test': [
            p.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv'),
        ],
        'train': [
            p.join(POLIT_DATA_DIR_PATH, 'train', 'train_combined.tsv'),
        ],
        'val': [
            p.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv'),
        ],
    }

    for dtype, dpaths in data_paths.items():
        try:
            data[dtype] = pd.read_csv(dpaths[0], sep='\t', index_col=False)

        except Exception as e:
            print(e.args)
            exit()
    
    if training_on_weak[0]:
        if training_on_weak[1] == 'balanced_original':
            data['train'], _ = weak_data_merge(merge_type=training_on_weak[1])
        else:
            data['train'], data['val'] = weak_data_merge(merge_type=training_on_weak[1])

Datasets and DataLoaders, takes trial as input to be able to suggest values for variables.

In [5]:
def get_loaders(batch_size, transforms_params=None, stopwords_type=None):
    global train_uw_ratio, slf_dim

    transform_pipeline = None

    if transforms_params is not None:
        transforms_map = {
            'sum': Sum,
            'onehot': OneHot,
            'none': NoTransform
        }
        cw_map = {
            'count_words': CountWords,
            'none': NoTransform
        }

        from_sel = transforms_params['from_selection']
  
        pos_feat = transforms_map[transforms_params['pos_feature_type']]
        pos_feat = pos_feat(
            'pos', from_selection=from_sel, stopwords=stopwords_type
        )

        tag_feat = transforms_map[transforms_params['tag_feature_type']]
        tag_feat = tag_feat(
            'tag', from_selection=from_sel, stopwords=stopwords_type
        )
        
        dep_feat = transforms_map[transforms_params['dep_feature_type']]
        dep_feat = dep_feat(
            'dep', from_selection=from_sel, stopwords=stopwords_type
        )

        cw_feat = cw_map[transforms_params['word_count_feature_type']]
        cw_feat = cw_feat()
        
        transform_pipeline = transforms.Compose([
            HandleStopwords(stopwords=stopwords_type),
            pos_feat,
            tag_feat,
            dep_feat,
            cw_feat,
            ToBinary(6),
            ToTensor()
        ])
    
    print(transform_pipeline)
    train = data['train']
    worthy_train = train[train['label'] == 1]
    train_uw_ratio = (len(train) - len(worthy_train)) / len(worthy_train)
    print(train_uw_ratio)
    
    train_dd = DebatesDataset(data=data['train'], transform=transform_pipeline)
    val_dd = DebatesDataset(data=data['val'], transform=transform_pipeline)
    test_dd = DebatesDataset(data=data['test'], transform=transform_pipeline)
    
    if transform_pipeline:
        slf_dim = train_dd[0][-1].size()[0]

    train_loader = DataLoader(train_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dd, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dd, batch_size=batch_size, shuffle=True, drop_last=True)

    return train_loader, val_loader, test_loader

Model setup + training loop

In [6]:
def train_model(params, features_params=None, model_checkpoint_path=None):
    global logf_path
    # this is here so that it can be accessed here and in get_loaders()
    stopwords_type = None
    if features_params:
        stopwords_type = features_params['stopwords_type'] if 'stopwords_type' in features_params else None

    train_loader, val_loader, test_loader = get_loaders(
        params['batch_size'],
        transforms_params=features_params,
        stopwords_type=stopwords_type
    )

    # best for given trial
#     pooling_strategy = params['pooling_strategy']
    dropout = params['dropout']
#     hidden_dim = params['hidden_dim']
#     n_hidden_layers = params['n_hidden_layers']
    lr = params['learning_rate']
    opt_weight_decay = params['optimizer_weigth_decay']
    pos_weight = train_uw_ratio if params['pos_weight'] > 1.0 else 1.0
    
#     fnn_hidden_dim = params['fnn_hidden_dim']
#     fnn_n_layers = params['fnn_n_hidden_layers']
#     fnn_dropout = params['fnn_dropout']
    
#     pnn_hidden_dim = params['pnn_hidden_dim']
#     pnn_dropout = params['pnn_dropout']
       
    emb_model_name = params['embedding_model_name']
    emb_size_map = {
        'all-mpnet-base-v2': 768,
        'all-MiniLM-L6-v2': 384,
        'multi-qa-mpnet-base-dot-v1': 768
    }
    # emb_model_name = 'all-MiniLM-L6-v2'
    embedding_model = SentenceTransformer(emb_model_name, device=device, cache_folder=SBERT_MODEL_PATH)
    
    model = SentNN(
        embeddings_dim=emb_size_map[emb_model_name],
        sentence_level_feature_dim=slf_dim,
        dropout=dropout,
    ).to(device)    

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=opt_weight_decay)

    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))
#     criterion = nn.SmoothL1Loss(reduction='sum')

    early_stopping = EarlyStopping(
        patience=6,
        path=model_checkpoint_path,
        verbose=False,
        trace_func=print_n_log.run('early_stopping', logf_path, 'DEBUG')
    )

    n_epochs = 30
    threshold = 0.5
    val_losses, train_losses, val_clf_reports, train_clf_reports = [], [], [], []

    for epoch in range(n_epochs):
        epoch_val_losses, epoch_train_losses = [], []

        model.train()
        y_pred, y_true = [], []
        for ids, sentences, labels, features in train_loader:
            labels = labels.float().to(device)
            features = features.to(device)
            
            embeddings = embedding_model.encode(sentences, convert_to_tensor=True, show_progress_bar=False).float()
            output = model(embeddings, sent_level_features=features)
            loss = criterion(output, labels)
        
            loss.backward()
            epoch_train_losses.append(loss.item())

            pred = torch.sigmoid(output)
            pred = (pred > threshold).int()
            y_pred.extend(pred.tolist())
            y_true.extend(labels.tolist())
            
            optimizer.step()
            optimizer.zero_grad()
            
        cr = classification_report(y_true, y_pred, digits=6, output_dict=True, zero_division=0)
        train_clf_reports.append(cr)
        
        
        model.eval()
        y_pred, y_true = [], []
        with torch.no_grad():
            for val_ids, val_sentences, val_labels, val_features in val_loader:
                val_labels = val_labels.float().to(device)
                val_features = val_features.to(device)
                
                val_embeddings = embedding_model.encode(val_sentences, convert_to_tensor=True, show_progress_bar=False).float()
                pred = model(val_embeddings, sent_level_features=val_features)
                val_loss = criterion(pred, val_labels)
                epoch_val_losses.append(val_loss.item())
                
                pred = torch.sigmoid(pred)
                
                pred = (pred > threshold).int()
                y_pred.extend(pred.tolist())
                y_true.extend(val_labels.tolist())
        
        val_losses.append(np.average(epoch_val_losses))
        train_losses.append(np.average(epoch_train_losses))
        avg_val_loss = np.average(val_losses)
        print(
            'epoch ==> ', epoch,
            ' | avg train loss ==> ', np.average(train_losses),
            ' | avg val loss ==> ', avg_val_loss
        )
        print(classification_report(y_true, y_pred, digits=6, zero_division=0))
        cr = classification_report(y_true, y_pred, digits=6, output_dict=True, zero_division=0)
        val_clf_reports.append(cr)
        
        early_stopping(
            val_loss=avg_val_loss,
            model=model,
            optimizer=optimizer,
            train_losses=train_losses,
            val_losses=val_losses,
            train_clf_reports=train_clf_reports,
            val_clf_reports=val_clf_reports,
            acomp_metrics=('f1_p', cr['1.0']['f1-score'])
        )
        
        if early_stopping.early_stop:
            print('early stopping...')
            break

    # recall_p = early_stopping.acomp_metrics['recall_p'] if early_stopping.acomp_metrics else 0.0
    "Done."
#     return model, 

In [7]:
def split_id(item: str):
    item = str(int(item))
    return int(item[:8]), int(item[8:])

def evaluate_model(params, features_params=None, load_path=None, bam=''):
    stopwords_type = None
    if features_params:
        stopwords_type = features_params['stopwords_type'] if 'stopwords_type' in features_params else None
        
    train_loader, val_loader, test_loader = get_loaders(
        params['batch_size'],
        transforms_params=features_params,
        stopwords_type=stopwords_type
    )

    # best for given trial
#     pooling_strategy = params['pooling_strategy']
    dropout = params['dropout']
#     hidden_dim = params['hidden_dim']
#     n_hidden_layers = params['n_hidden_layers']
    lr = params['learning_rate']
    opt_weight_decay = params['optimizer_weigth_decay']
    pos_weight = train_uw_ratio if params['pos_weight'] > 1.0 else 1.0
    print('pos: ', pos_weight)
    
#     fnn_hidden_dim = params['fnn_hidden_dim']
#     fnn_n_layers = params['fnn_n_hidden_layers']
#     fnn_dropout = params['fnn_dropout']
    
#     pnn_hidden_dim = params['pnn_hidden_dim']
#     pnn_dropout = params['pnn_dropout']
    
    emb_model_name = params['embedding_model_name']

    
    emb_size_map = {
        'all-mpnet-base-v2': 768,
        'all-MiniLM-L6-v2': 384,
        'multi-qa-mpnet-base-dot-v1': 768
    }
    # emb_model_name = 'all-MiniLM-L6-v2'
    embedding_model = SentenceTransformer(emb_model_name, device=device, cache_folder=SBERT_MODEL_PATH)
    
    model = SentNN(
        embeddings_dim=emb_size_map[emb_model_name],
        sentence_level_feature_dim=slf_dim,
        dropout=dropout
    ).to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=opt_weight_decay)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight]).to(device))
#     criterion = nn.SmoothL1Loss(reduction='sum')
    
    load_checkpoint(load_path, model, optimizer, device, bam=bam)
    
    threshold = 0.5
    y_pred = []
    y_true = []
    scores = []
    ids = []

    model.eval()
    with torch.no_grad():
        for test_ids, test_sentences, test_labels, test_features in test_loader:           
            test_labels = test_labels.float().to(device)
            test_features = test_features.to(device)

            embeddings = embedding_model.encode(test_sentences, convert_to_tensor=True, show_progress_bar=False).float()
            output = torch.sigmoid(model(embeddings, sent_level_features=test_features))
            
            ids.extend(test_ids.tolist())
            scores.extend(output.tolist())
            output = (output > threshold).int()
            y_pred.extend(output.tolist())
            y_true.extend(test_labels.tolist())
    
#     df_scorer = pd.DataFrame(data=list(zip(ids, scores)), columns=['id', 'score'])
#     df_date_index = df_scorer.apply(lambda row: split_id(row.id), axis=1, result_type='expand')
#     df_scorer = pd.concat([df_date_index, df_scorer], axis=1) \
#         .rename(columns={0: 'date', 1: 'index'}) \
#         .sort_values(by=['date', 'index'], axis=0) \
#         .reset_index(drop=True)
    predictions = list(zip(ids, scores))
    predictions = sorted(
        predictions, 
        key=lambda x: x[0]
    )
    _, _, avg_precision, rr, num_relevant = evaluate_v2(predictions)
    print('Avg. precision: ', avg_precision)
    print('Classification Report:')
    print(classification_report(y_true, y_pred, digits=4)) #

In [8]:
# load_data()

In [13]:
# study_path = os.path.join(optim_path, 'bi-lstm_wAtt_sTPE_pNone_df0.2_wf0.2.pkl')
# params_path = os.path.join(optim_path, 'bi-lstm_featOptim_wAtt_sTPE_pNone_df0.2_wf0.2_params.pkl')
# model_checkpoint_path = os.path.join(training_path, 'wAtt_sTPE')
# sent_nn_mF1_sTPE_pNone_df0.2_wf0.03
# sent_nn_paramOptAfter_sentFeat_mF1_sTPE_pNone_df0.2_wf0.03
model_optim_path = os.path.join(optim_path, 'sent_nn_NO_FEAT_V2_mF1_sTPE_pNone_df0.2_wf0.03')
# feature_params_path = f'{model_optim_path}_featureParams.pkl'
# params_path = f'{model_optim_path}_params.pkl'
study_path = f'{model_optim_path}.pkl'


# for now ignore features 
# studies = [s for s in studies if '_params' not in s and 'featOptim' not in s]
is_training = True
training_on_weak = (True, 'balanced_original')
load_data()
# bam = 'best_f1_p_'
# bam = ''
# for study_name in studies[:1]:
# TODO: check whether all of these exist
study = torch.load(os.path.join(optim_path, study_path))
params = study.best_params
print(params)
features_params = None

# features_params =  {'stopwords_type': 'wstop', 'from_selection': False, 'tag_feature_type': 'sum', 'word_count_feature_type': 'count_words'}
# features_params = {}
print(features_params)
# todo: tu treba este w_feat!! 15:32 21.04.
# del features_params['tag_feature_type']
# del features_params['word_count_feature_type']
# print(features_params)

checkpoint_dir = 'no_feat_weak_balanced_original'
# checkpoint_dir = [d for d in models_directories if d in study_name][0]
study_log_name = 'sent-nn_noFeat_weakBalancedOriginal'
# params = study.best_params
logf_path = p.join(LOG_DIR_PATH, f'training_{study_log_name}.log')


if features_params:
    for ft in ['pos', 'tag', 'dep', 'word_count', 'word_level']:
        feature_type = f'{ft}_feature_type'
        if feature_type not in features_params:
            features_params[feature_type] = 'none'

if 'with_sequential_layer' not in params:
    params['with_sequential_layer'] = False

if isinstance(params['pos_weight'], bool):
    params['pos_weight'] = 2.0 if params['pos_weight'] else 1.0

checkpoint_path = os.path.join(training_path, checkpoint_dir)
if is_training:
    train_model(params, features_params, checkpoint_path)
else:
    evaluate_model(params, features_params, checkpoint_path, bam=bam)

INFO : Load pretrained SentenceTransformer: all-mpnet-base-v2


{'batch_size': 64, 'dropout': 0.02, 'learning_rate': 0.00687658226159879, 'optimizer_weigth_decay': 0.00015098895105608427, 'pos_weight': 32.81159420289855, 'embedding_model_name': 'all-mpnet-base-v2'}
None
None
1.0


KeyboardInterrupt: 

### results
- ***no_feat:***
    - **last checkpoint:**
    ```
        Avg. precision:  0.13700088844162206
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9927    0.8118    0.8932      6328
                 1.0     0.0760    0.7206    0.1375       136

            accuracy                         0.8099      6464
           macro avg     0.5343    0.7662    0.5154      6464
        weighted avg     0.9734    0.8099    0.8773      6464
    ```
    - **best f1 checkpoint:**
    ```
        Avg. precision:  0.15217500236398074
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9930    0.8271    0.9025      6329
                 1.0     0.0822    0.7259    0.1477       135

            accuracy                         0.8250      6464
           macro avg     0.5376    0.7765    0.5251      6464
        weighted avg     0.9740    0.8250    0.8867      6464
    ```
    - **last checkpoint - weak_simple:**
    ```
        Avg. precision:  0.12957222811302888
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9939    0.6971    0.8194      6328
                 1.0     0.0538    0.8015    0.1008       136

            accuracy                         0.6993      6464
           macro avg     0.5239    0.7493    0.4601      6464
        weighted avg     0.9741    0.6993    0.8043      6464
    ```
    - **best f1 checkpoint - weak_simple:**
    ```
        Avg. precision:  0.12082746852589468
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9939    0.7159    0.8323      6329
                 1.0     0.0562    0.7926    0.1049       135

            accuracy                         0.7175      6464
           macro avg     0.5250    0.7543    0.4686      6464
        weighted avg     0.9743    0.7175    0.8171      6464
    ```
    - **last checkpoint - weak_balanced_result:**
    ```
        Avg. precision:  0.11848342811893009
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9944    0.7034    0.8240      6328
                 1.0     0.0558    0.8162    0.1045       136

            accuracy                         0.7058      6464
           macro avg     0.5251    0.7598    0.4642      6464
        weighted avg     0.9747    0.7058    0.8088      6464
    ```
    - **best f1 checkpoint - weak_balanced_result:**
    ```
        Avg. precision:  0.11507593855680112
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9938    0.7073    0.8264      6328
                 1.0     0.0551    0.7941    0.1031       136

            accuracy                         0.7092      6464
           macro avg     0.5244    0.7507    0.4647      6464
        weighted avg     0.9740    0.7092    0.8112      6464
    ```
    - **last checkpoint - weak_balanced_original:**:
    ```
        Avg. precision:  0.07622417503347415
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9913    0.7396    0.8471      6328
                 1.0     0.0545    0.6985    0.1011       136

            accuracy                         0.7387      6464
           macro avg     0.5229    0.7190    0.4741      6464
        weighted avg     0.9716    0.7387    0.8314      6464
    ```
    - **best f1 checkpoint - weak_balanced_original:**\
    ```
        --
    ```
    
- ***w_feat:***
    - **last checkpoint:**
    ```
        Avg. precision:  0.12910844453372206
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9911    0.8110    0.8921      6328
                 1.0     0.0700    0.6618    0.1266       136

            accuracy                         0.8079      6464
           macro avg     0.5306    0.7364    0.5093      6464
        weighted avg     0.9717    0.8079    0.8760      6464
    ```
    
    - **best f1 checkpoint:**
    ```
        Avg. precision:  0.12333415169590956
        Classification Report:
                      precision    recall  f1-score   support

                 0.0     0.9912    0.8151    0.8946      6328
                 1.0     0.0714    0.6618    0.1289       136

            accuracy                         0.8119      6464
           macro avg     0.5313    0.7384    0.5117      6464
        weighted avg     0.9718    0.8119    0.8784      6464
    ```



In [10]:
# TODO: this is kept here just in case
# optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

# needed for GridSampler
# search_space = {
#     'batch_size': [16, 32, 64],
#     'pooling_strategy': ['last_four', 'last_four_sum', 'second_last'],
# #     'should_scale_emb': [False, True],
#     'dropout': [i/100 for i in range(0, 51, 5)],
#     'hidden_dim': [128, 256, 512],
#     'optimizer_weigth_decay': [i/10000 for i in range(11)],
#     'learning_rate': round_to_first_non_zero([i/100000 for i in range_inc(0, 100000, 1, 10)]),
#     'pos_weight': [1.0, train_uw_ratio]
# }
# feature_search_space = {
#     'stopwords_type': ['wstop', 'wostop'],
#     'from_selection': [True, False],
#     'pos_feature_type': ['sum', 'onehot', 'none'],
#     'tag_feature_type': ['sum', 'onehot', 'none'],
#     'word_count_feature_type': ['count_words', 'none'],
# #     'word_level_feature_type': ['dep', 'triplet']
# }
# # print(search_space)
# params = {
#     'batch_size': 32,
#     'pooling_strategy': 'second_last',
#     'dropout': 0.39,
#     'hidden_dim': 256,
#     'w_seq': True,
#     'lr': 0.004118121,
#     'opt_weight_decay': 0.024460049,
#     'pos_weight': train_uw_ratio,
# }