### Transformer Seq2Seq (Neural Signals to Bi-grams)

This notebook contains code for running a 2-word seq2seq Transformer where the neural signals are sent through the encoder while the corresponding bi-grams are sent through the decoder.

Set the seed for reproducibility. For more info read https://pytorch.org/docs/stable/notes/randomness.html and https://discuss.pytorch.org/t/random-seed-initialization/7854/18

In [1]:
import json
import math
import os
import random
import sys
import time
import warnings
from collections import Counter
from datetime import datetime
from pprint import pprint

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data
from transformers import AdamW

from arg_parser import arg_parser
from build_matrices import (build_design_matrices_classification,
                            build_design_matrices_seq2seq)
from config import build_config
from dl_utils import Brain2enDataset, MyCollator
from models import PITOM, ConvNet10, MeNTAL, MeNTALmini
from train_eval import plot_training, train, valid
from eval_utils import evaluate_roc, evaluate_topk
from vocab_builder import get_sp_vocab, get_std_vocab, get_vocab

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

In [3]:
results_folder = '20200531-ipynb'

In [5]:
args = arg_parser(['--subjects', '625',
                   '--max-electrodes', '55',
                   '--vocab-min-freq', '10',
                   '--vocab-max-freq', '250',
                  '--epochs', '50'])
CONFIG = build_config(args, results_folder)
args.gpus = min(args.gpus, torch.cuda.device_count())

Subject: 625
Training Data:: Number of Conversations is: 63
Validation Data:: Number of Conversations is: 13


In [6]:
# Model objectives
MODEL_OBJ = {
    "ConvNet10": "classifier",
    "PITOM": "classifier",
    "MeNTALmini": "classifier",
    "MeNTAL": "seq2seq"
}

# GPUs
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
args.gpus = min(args.gpus, torch.cuda.device_count())

# Fix random seed
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

args.model = args.model.split("_")[0]
classify = False if (args.model in MODEL_OBJ
                     and MODEL_OBJ[args.model] == "seq2seq") else True

In [7]:
CONFIG = build_config(args, results_folder)

Subject: 625
Training Data:: Number of Conversations is: 63
Validation Data:: Number of Conversations is: 13


In [8]:
word2freq, word_list, n_classes, vocab, i2w = get_std_vocab(
    CONFIG, comprehension=False, classify=classify)

# Conversations: 63
Vocabulary size (min_freq=10): 338
Saving word counter


### Some insights about the bigrams in the training set

In [9]:
print("Loading training data")
x_train, y_train = build_design_matrices_seq2seq(
    'train', CONFIG, vocab, delimiter=" ", aug_shift_ms=[-1000, -500], max_num_bins=60)

# print("Loading validation data")
# x_valid, y_valid = build_design_matrices_seq2seq(
#     'valid', CONFIG, vocab, delimiter=" ", aug_shift_ms=[], max_num_bins=60, remove_unks=False)

print("Loading validation data")
x_valid, y_valid = build_design_matrices_seq2seq(
    'valid', CONFIG, vocab, delimiter=" ", aug_shift_ms=[], max_num_bins=60, remove_unks=True)

Loading training data
Maximum Sequence Length (Preset): 60
Number of train samples is: 4883
Number of train labels is: 4883
Maximum Sequence Length: 60
Loading validation data
Maximum Sequence Length (Preset): 60
Number of valid samples is: 985
Number of valid labels is: 985
Maximum Sequence Length: 60


In [32]:
def replace_words(data):
    df_y_train = pd.DataFrame(data)
    df_y_train[1].replace(i2w, inplace=True)
    df_y_train[2].replace(i2w, inplace=True)

    return df_y_train


def bigram_freq_excel(data, word2freq, i2w, filename, ref_data=None):
    valid_df = replace_words(data)
    valid_df = valid_df.groupby([1, 2]).size().reset_index(name='Count')
    valid_df['BF1'] = valid_df[1].replace(dict(valid_df[1].value_counts()))
    valid_df['BF2'] = valid_df[2].replace(dict(valid_df[2].value_counts()))
    valid_df['VF1'] = valid_df[1].replace(word2freq)
    valid_df['VF2'] = valid_df[2].replace(word2freq)

    if ref_data is not None:
        valid_df = valid_df.merge(ref_data, on=[1, 2], suffixes=('_valid', '_train'), how='left') 
        
    valid_df.to_excel(os.path.join(CONFIG["SAVE_DIR"], filename), index=False)
        
    print(len(valid_df[1].unique()))
    print(len(valid_df[2].unique()))

#     print(set(word2freq.keys()) - set(valid_df[1].unique()))
#     print(set(word2freq.keys()) - set(valid_df[2].unique()))
    
    return valid_df


train_df = bigram_freq_excel(y_train, word2freq, i2w, "625_bi-gram-freq-train.xlsx")
valid_df = bigram_freq_excel(y_valid, word2freq, i2w, "625_bi-gram-freq-valid.xlsx", ref_data=train_df)

264
269
138
139


In [33]:
valid_df[1]

Unnamed: 0,1,2,Count_valid,BF1_valid,BF2_valid,VF1_valid,VF2_valid,Count_train,BF1_train,BF2_train,VF1_train,VF2_train
0,actually,feel,1,2,1,50,96,,,,,
1,actually,kinda,1,2,2,50,60,,,,,
2,again,or,1,1,5,34,74,,,,,
3,ago,or,1,1,5,23,74,,,,,
4,all,or,1,1,5,116,74,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
242,you'll,come,1,1,3,12,34,,,,,
243,you're,gonna,1,3,3,83,96,1.0,8.0,3.0,83.0,96.0
244,you're,not,1,3,7,83,180,3.0,8.0,10.0,83.0,180.0
245,you're,saying,2,3,1,83,26,1.0,8.0,4.0,83.0,26.0


#### Converting train and validation data to Loader objects

In [11]:
train_ds = Brain2enDataset(x_train, y_train)
print("Number of training signals: ", len(train_ds))
valid_ds = Brain2enDataset(x_valid, y_valid)
print("Number of validation signals: ", len(valid_ds))

Skipped 0 examples
Number of training signals:  1343
Skipped 0 examples
Number of validation signals:  282


In [12]:
my_collator = MyCollator(CONFIG, vocab)
train_dl = data.DataLoader(train_ds,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=CONFIG["num_cpus"],
                           collate_fn=my_collator)
valid_dl = data.DataLoader(valid_ds,
                           batch_size=args.batch_size,
                           num_workers=CONFIG["num_cpus"],
                           collate_fn=my_collator)

#### Creating a Model

In [13]:
DEFAULT_MODELS = {
    "ConvNet10": (len(vocab), ),
    "PITOM": (len(vocab), sum(args.max_electrodes)),
    "MeNTALmini":
    (sum(args.max_electrodes), len(vocab), args.tf_dmodel, args.tf_nhead,
     args.tf_nlayer, args.tf_dff, args.tf_dropout),
    "MeNTAL": (sum(args.max_electrodes), len(vocab), args.tf_dmodel,
               args.tf_nhead, args.tf_nlayer, args.tf_dff, args.tf_dropout)
}

# Create model
if args.init_model is None:
    if args.model in DEFAULT_MODELS:
        print("Building default model: %s" % args.model, end="")
        model_class = globals()[args.model]
        model = model_class(*(DEFAULT_MODELS[args.model]))
    else:
        print("Building custom model: %s" % args.model, end="")
        sys.exit(1)
else:
    model_name = "%s%s.pt" % (SAVE_DIR, args.model)
    if os.path.isfile(model_name):
        model = torch.load(model_name)
        model = model.module if hasattr(model, 'module') else model
        print("Loaded initial model: %s " % args.model)
    else:
        print("No models found in: ", SAVE_DIR)
        sys.exit(1)
print(" with %d trainable parameters" %
      sum([p.numel() for p in model.parameters() if p.requires_grad]))
sys.stdout.flush()

Building default model: MeNTAL with 397714 trainable parameters


In [14]:
criterion = nn.CrossEntropyLoss()
step_size = int(math.ceil(len(train_ds) / args.batch_size))
optimizer = AdamW(model.parameters(),
                  lr=args.lr,
                  weight_decay=args.weight_decay)
scheduler = None

In [15]:
print("Training on %d GPU(s) with batch_size %d for %d epochs" %
      (args.gpus, args.batch_size, args.epochs))
print("=" * CONFIG["print_pad"])
sys.stdout.flush()

Training on 0 GPU(s) with batch_size 48 for 5 epochs


In [16]:
best_val_loss = float("inf")
best_model = model
history = {
    'train_loss': [],
    'train_acc': [],
    'valid_loss': [],
    'valid_acc': []
}
""" train_loss_compute = SimpleLossCompute(criterion,
                                       opt=optimizer,
                                       scheduler=scheduler)
valid_loss_compute = SimpleLossCompute(criterion, opt=None, scheduler=None)
"""
epoch = 0
model_name = "%s%s.pt" % (CONFIG["SAVE_DIR"], args.model)

In [17]:
print("\nTraining on %d GPU(s) with batch_size %d for %d epochs" %
      (args.gpus, args.batch_size, args.epochs))
sys.stdout.flush()

best_val_loss = float("inf")
best_model = model
history = {
    'train_loss': [],
    'train_acc': [],
    'valid_loss': [],
    'valid_acc': []
}
""" train_loss_compute = SimpleLossCompute(criterion,
                                       opt=optimizer,
                                       scheduler=scheduler)
valid_loss_compute = SimpleLossCompute(criterion, opt=None, scheduler=None)
"""
epoch = 0
model_name = "%s%s.pt" % (CONFIG["SAVE_DIR"], args.model)
""" totalfreq = float(sum(train_ds.train_freq.values()))
print(
    sorted(((i2w[l], f / totalfreq)
            for l, f in train_ds.train_freq.most_common()),
           key=lambda x: -x[1]))
"""
# Run training and validation for args.epochs epochs
lr = args.lr
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()
    print(f'Epoch: {epoch:02}')
    print('\tTrain: ', end='')
    train_loss, train_acc = train(
        train_dl,
        model,
        criterion,
        list(range(args.gpus)),
        DEVICE,
        optimizer,
        scheduler=scheduler,
        seq2seq=not classify,
        pad_idx=vocab[CONFIG["pad_token"]] if not classify else -1)
    for param_group in optimizer.param_groups:
        if 'lr' in param_group:
            print(' | lr {:1.2E}'.format(param_group['lr']))
            break
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    print('\tValid: ', end='')
    with torch.no_grad():
        valid_loss, valid_acc = valid(
            valid_dl,
            model,
            criterion,
            DEVICE,
            temperature=args.temp,
            seq2seq=not classify,
            pad_idx=vocab[CONFIG["pad_token"]] if not classify else -1)
    history['valid_loss'].append(valid_loss)
    history['valid_acc'].append(valid_acc)

    # Store best model so far
    if valid_loss < best_val_loss:
        best_model, best_val_loss = model, valid_loss
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model_to_save = best_model.module\
                if hasattr(best_model, 'module') else best_model
            torch.save(model_to_save, model_name)
        sys.stdout.flush()

#         # Additional Info when using cuda
#         if DEVICE.type == 'cuda':
#             print('Memory Usage:')
#             for i in range(args.gpus):
#                 max_alloc = round(
#                     torch.cuda.max_memory_allocated(i) / 1024**3, 1)
#                 cached = round(torch.cuda.memory_cached(i) / 1024**3, 1)
#                 print(f'GPU: {i} Allocated: {max_alloc}G Cached: {cached}G')

#         # if epoch > 10 and valid_loss > max(history['valid_loss'][-3:]):
#         #     lr /= 2.
#         #     for param_group in optimizer.param_groups:
#         #         param_group['lr'] = lr


Training on 0 GPU(s) with batch_size 48 for 5 epochs
Epoch: 01
	Train: loss 5.111 | perplexity 165.88 | ms/batch 490.21 | lr 1.00E-04
	Valid: loss 4.674 | perplexity 107.13
Epoch: 02
	Train: loss 4.589 | perplexity 98.38 | ms/batch 502.43 | lr 1.00E-04
	Valid: loss 4.525 | perplexity 92.26
Epoch: 03
	Train: loss 4.465 | perplexity 86.90 | ms/batch 368.25 | lr 1.00E-04
	Valid: loss 4.456 | perplexity 86.17
Epoch: 04
	Train: loss 4.392 | perplexity 80.78 | ms/batch 498.86 | lr 1.00E-04
	Valid: loss 4.406 | perplexity 81.95
Epoch: 05
	Train: loss 4.330 | perplexity 75.94 | ms/batch 165.94 | lr 1.00E-04
	Valid: loss 4.362 | perplexity 78.44


#### Post-processing

In [18]:
device = DEVICE
print("Evaluating predictions on test set")
# Load best model
best_model = torch.load(model_name)
if args.gpus:
    best_model.to(device)

softmax = nn.Softmax(dim=1)

Evaluating predictions on test set


In [19]:
def translate_neural_signal(model, data_iterator, data_set_len, vocab_len):

    valid_bi_preds = torch.zeros(data_set_len, 3, vocab_len)
    all_trg_y = torch.zeros(data_set_len, 3, dtype=torch.int32)

    # Calculate all predictions on test set
    with torch.no_grad():
        model.eval()

        for enum, batch in enumerate(data_iterator):

            src = batch[0].to(device) 
            trg_y = batch[2].long().to(device)
            trg_pos_mask= batch[3].to(device).squeeze() 
            trg_pad_mask = batch[4].to(device)

            all_trg_y[enum*args.batch_size:(enum+1)*args.batch_size, :] = trg_y

            memory = model.encode(src)
            y = torch.zeros(src.size(0), 1, len(vocab)).long().to(device)
            y[:, :, vocab[CONFIG["begin_token"]]] = 1

            bi_out = torch.zeros(len(batch[0]), trg_y.shape[1], len(vocab))
            for i in range(trg_y.size(1)):
                out = model.decode(memory, y,
                                   trg_pos_mask[:y.size(1), :y.size(1)],
                                   trg_pad_mask[:, :y.size(1)])[:, -1, :]
                out = softmax(out / args.temp)
                bi_out[:, i, :] = out
                temp = torch.zeros(src.size(0), vocab_len).long().to(device)
                temp = temp.scatter_(1,
                                     torch.argmax(out, dim=1).unsqueeze(-1), 1)
                y = torch.cat([y, temp.unsqueeze(1)], dim=1)
            valid_bi_preds[enum*args.batch_size:(enum+1)*args.batch_size, :, :] = bi_out

        topk_preds = torch.topk(valid_bi_preds, 10).indices
        topk_preds = topk_preds.view(data_set_len, -1)
        all_preds = valid_bi_preds.view(data_set_len, -1)

    return all_trg_y, topk_preds, all_preds

In [20]:
train_size = len(train_ds)
valid_size = len(valid_ds)
vocab_len = len(vocab)

valid_all_trg_y, valid_topk_preds, valid_all_preds = translate_neural_signal(best_model, valid_dl, valid_size, vocab_len)
train_all_trg_y, train_topk_preds, train_all_preds = translate_neural_signal(best_model, train_dl, train_size, vocab_len)

In [129]:
def calc_rank(x, string):
    word = x[string]
    string = string + '_0*'
    word_preds = x.filter(regex=string)
    
    try:
        rank = np.where(word == word_preds)[0][0]
    except IndexError:
        rank = pd.NA
    
    return rank


def fill_topk_cols(x, string):
    rank = x['_'.join([string, 'rank'])]
    
    if pd.isna(rank):
        abc = [0, 0, 0]
    elif rank == 0:
        abc = [1, 0, 0]
    elif rank < 5:
        abc = [0, 1, 0]
    elif rank < 10:
        abc =  [0, 0, 1]
    else:
        abc = [0, 0, 0]

    return abc
       

def create_excel_preds(targets, top_predictions, i2w):
    df = pd.DataFrame(targets.numpy(), columns = ['word1', 'word2', 'word3'])
    df = df.drop(columns = ['word3'])
    pred_col_names = ['_'.join([word, str(i).zfill(2)]) for word in ['word1', 'word2']
                      for i in range(1, 11)]
    df[pred_col_names] = pd.DataFrame(top_predictions.numpy()[:, :20])
    top_col_names = ['_'.join([word, 't' + str(i)]) for word in ['word1', 'word2']
                    for i in [1, 5, 10]]
    
    df['word1_rank'] = df.apply(calc_rank, axis=1, args=('word1',))
    df['word2_rank'] = df.apply(calc_rank, axis=1, args=('word2',))
    
    df[['word1_top1', 'word1_top5', 'word1_top10']] = pd.DataFrame(df.apply(fill_topk_cols, axis=1, args=('word1', )).tolist())
    df[['word2_top1', 'word2_top5', 'word2_top10']] = pd.DataFrame(df.apply(fill_topk_cols, axis=1, args=('word2', )).tolist())

    df[pred_col_names] = df[pred_col_names].replace(i2w)
    df['word1'] = df['word1'].replace(i2w)
    df['word2'] = df['word2'].replace(i2w)
    
    return df

In [130]:
valid_preds_df = create_excel_preds(valid_all_trg_y, valid_topk_preds, i2w)
train_preds_df = create_excel_preds(train_all_trg_y, train_topk_preds, i2w)

In [131]:
valid_preds_df.to_excel(os.path.join(CONFIG["SAVE_DIR"], 'valid_set.xlsx'), index=False)
train_preds_df.to_excel(os.path.join(CONFIG["SAVE_DIR"], 'train_set.xlsx'), index=False)

In [111]:
valid_preds_df['word1']

0       35
1      197
2      322
3      173
4      204
      ... 
277    294
278    197
279    181
280    171
281    218
Name: word1, Length: 282, dtype: int32

In [109]:
valid_all_preds[:, 0:338]

tensor([[0.0010, 0.2417, 0.0017,  ..., 0.0020, 0.0026, 0.0009],
        [0.0011, 0.2523, 0.0016,  ..., 0.0018, 0.0025, 0.0011],
        [0.0011, 0.2459, 0.0017,  ..., 0.0021, 0.0024, 0.0011],
        ...,
        [0.0011, 0.2586, 0.0016,  ..., 0.0018, 0.0024, 0.0010],
        [0.0011, 0.2667, 0.0015,  ..., 0.0019, 0.0024, 0.0010],
        [0.0012, 0.2788, 0.0014,  ..., 0.0018, 0.0022, 0.0012]])

In [1]:
from sklearn.metrics import roc_auc_score


def get_roc_auc_ovo_scores(data, all_preds, string, i2w):
    y_test = data['word1']
    vocab_len = len(i2w)
    
    if string == 'word1':
        softmax_range = list(range(0, vocab_len))
    elif string == 'word2':
        softmax_range = list(range(len(i2w), 2 * len(i2w)))
        
    y_prob = all_preds[:, softmax_range]
    
    labels = list(i2w.keys())
    
    macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
                                  average="macro", labels=labels)
    
    weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
                                     average="weighted", labels=labels)
    print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
      "(weighted by prevalence)"
      .format(macro_roc_auc_ovo, weighted_roc_auc_ovo))



def get_roc_auc_ovr_scores(data, all_preds, string, i2w):
    y_test = data['word1']
    vocab_len = len(i2w)
    
    if string == 'word1':
        softmax_range = list(range(0, vocab_len))
    elif string == 'word2':
        softmax_range = list(range(len(i2w), 2 * len(i2w)))
        
    y_prob = all_preds[:, softmax_range]
    
    labels = list(i2w.keys())
    
    macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
                                  average="micro", labels=labels)
    weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
                                     average="weighted", labels=labels)
    print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
      "(weighted by prevalence)"
      .format(macro_roc_auc_ovr, weighted_roc_auc_ovr))

In [120]:
get_roc_auc_ovo_scores(train_preds_df, train_all_preds, 'word1', i2w)
get_roc_auc_ovo_scores(train_preds_df, train_all_preds, 'word2', i2w)
get_roc_auc_ovo_scores(valid_preds_df, valid_all_preds, 'word1', i2w)
get_roc_auc_ovo_scores(valid_preds_df, valid_all_preds, 'word2', i2w)

One-vs-One ROC AUC scores:
0.552442 (macro),
0.545307 (weighted by prevalence)
One-vs-One ROC AUC scores:
0.552245 (macro),
0.544587 (weighted by prevalence)
One-vs-One ROC AUC scores:
0.538355 (macro),
0.529472 (weighted by prevalence)
One-vs-One ROC AUC scores:
0.541047 (macro),
0.530183 (weighted by prevalence)


In [121]:
get_roc_auc_ovr_scores(train_preds_df, train_all_preds, 'word1', i2w)
get_roc_auc_ovr_scores(train_preds_df, train_all_preds, 'word2', i2w)
get_roc_auc_ovr_scores(valid_preds_df, valid_all_preds, 'word1', i2w)
get_roc_auc_ovr_scores(valid_preds_df, valid_all_preds, 'word2', i2w)

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [79]:
from sklearn.metrics import confusion_matrix, roc_auc_score

# https://stackoverflow.com/questions/35572000/how-can-i-plot-a-confusion-matrix
def get_confusion_matrix(data, string, i2w):
    y_true = data[string]
    y_pred = data['_'.join([string, '01'])]
    labels = list(i2w.keys())
    data_cm = confusion_matrix(valid_preds_df['word2'],
                                          valid_preds_df['word2_01'],
                                          labels=labels)
    df_cm = pd.DataFrame(data_cm, labels, labels)
    return data_cm

In [80]:
valid_w1_cm = plot_confusion_matrix(valid_preds_df, 'word1', i2w)
valid_w2_cm = plot_confusion_matrix(valid_preds_df, 'word2', i2w)
train_w1_cm = plot_confusion_matrix(train_preds_df, 'word1', i2w)
train_w2_cm = plot_confusion_matrix(train_preds_df, 'word2', i2w)

In [None]:
# Evaluate top-k
print("Evaluating top-k")
sys.stdout.flush()
res = evaluate_topk(all_preds,
                    all_labs,
                    i2w,
                    train_freq,
                    CONFIG["SAVE_DIR"],
                    suffix='-val',
                    min_train=args.vocab_min_freq,
                    tokens_to_remove=markers)

In [None]:
# Evaluate ROC-AUC
print("Evaluating ROC-AUC")
sys.stdout.flush()
res.update(
    evaluate_roc(all_preds,
                 categorical,
                 i2w,
                 train_freq,
                 CONFIG["SAVE_DIR"],
                 do_plot=not args.no_plot,
                 min_train=args.vocab_min_freq,
                 tokens_to_remove=markers))
pprint(res.items())
print("Saving results")
with open(CONFIG["SAVE_DIR"] + "results.json", "w") as fp:
    json.dump(res, fp, indent=4)