### Initial Setup

In [None]:
!pip install pylangacq
!pip install transformers
!pip install sentencepiece
!pip install nlpaug
!pip install datasets

In [None]:
import torch
import random
import numpy as np
import os
import re
import pylangacq
import tqdm
import gc
import copy
import csv
from torch.nn import Softmax

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
import nlpaug.augmenter.word as naw
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from datasets import load_metric

In [None]:
# Set your working directory to wherever you have your data saved

In [None]:
from setup import torch_setup, set_seed
from text_preprocessing import data_to_str, test_data_to_str
from text_augmentation import mixup

### Creating a Dataset

In [None]:
class Dataset_BERT(Dataset):
    """
    Map-style dataset that pairs each row of `ids` with the label at the
    same position in `labels`.
    """

    def __init__(self, ids, labels):
        # Both containers are stored as given (no copy, no conversion).
        self.x_train, self.y_train = ids, labels

    def __len__(self):
        # The dataset length is defined by the label container.
        return len(self.y_train)

    def __getitem__(self, item):
        # Returns an (input, label) pair for the requested position.
        features = self.x_train[item]
        target = self.y_train[item]
        return features, target

### Eval Functions

In [None]:
def model_performance(output, target):
    """
    Count the confusion-matrix cells for a batch of binary predictions.

    `output` holds the predicted classes and `target` the true classes;
    only the labels 0 and 1 are counted, with 1 treated as the positive class.

    Returns the counts as the tuple (tp, fp, fn, tn) - raw counts, not rates.
    """
    # Rows are the true class, columns the predicted class, both ordered [0, 1].
    matrix = confusion_matrix(target, output, labels=[0, 1])
    (tn, fp), (fn, tp) = matrix

    return tp, fp, fn, tn

In [None]:
def eval(data_iter, model):
    """
    Evaluate `model` on every batch of `data_iter` without computing gradients.

    NOTE: this name shadows the Python builtin `eval`; it is kept because the
    training functions in this notebook call it under this name.

    Parameters
    ----------
    data_iter : iterable of (ids, labels) batches
        Both tensors are moved to the notebook-level `DEVICE` before the
        forward pass.
    model : callable as `model(ids, labels=labels)`
        Element 0 of its output is used as the loss and element 1 as the
        per-class scores (argmax over axis 1 gives the predicted class).

    Returns
    -------
    (loss, acc, f1)
        loss : batch losses weighted by batch size, divided by the number of
               examples (assumes the model reports a per-batch mean loss -
               TODO confirm).
        acc  : (tp + tn) / number of examples.
        f1   : tp / (tp + 0.5 * (fp + fn)) for class 1; 0.0 when that
               denominator is zero (no positives predicted or present).

    Raises
    ------
    ValueError
        If `data_iter` yields no examples.
    """
    model.eval()
    epoch_loss = 0
    epoch_correct = 0
    epoch_incorrect = 0
    tps = 0
    no_observations = 0

    with torch.no_grad():
        for ids, labels in data_iter:
            ids, labels = ids.to(DEVICE), labels.to(DEVICE)
            out = model(ids, labels=labels)

            loss, scores = out[0], out[1]
            n_batch = labels.shape[0]
            no_observations += n_batch

            tp, fp, fn, tn = model_performance(
                np.argmax(scores.detach().cpu().numpy(), axis=1),
                labels.cpu().numpy())

            # Weight by batch size so a smaller final batch is not over-counted.
            epoch_loss += loss.item() * n_batch
            tps += tp
            epoch_correct += (tp + tn)
            epoch_incorrect += (fn + fp)

    if no_observations == 0:
        raise ValueError("eval() received an empty data iterator")

    acc = epoch_correct / no_observations
    # Guard the all-true-negative case, which would otherwise give 0/0.
    f1_denominator = tps + 0.5 * epoch_incorrect
    f1 = tps / f1_denominator if f1_denominator > 0 else 0.0
    loss = epoch_loss / no_observations

    return loss, acc, f1

### Training Function

In [None]:
def train_model(model, tr_dataloader, vl_dataloader, epochs, optimizer, scheduler=None, val='val'):
    """
    Train `model` for `epochs` epochs, evaluating on `vl_dataloader` after
    each one.

    Parameters
    ----------
    model : callable as `model(ids, labels=labels)`
        Element 0 of the output is the loss, element 1 the per-class scores.
    tr_dataloader, vl_dataloader : iterables of (ids, labels) batches
        Training batches are moved to the notebook-level `DEVICE`.
    epochs : int
    optimizer : optimizer stepped once per training batch.
    scheduler : optional learning-rate scheduler, stepped once per batch.
    val : str
        Name of the evaluation split. 'val' and 'test' select the wording of
        the per-epoch log line (any other value prints nothing) and the value
        is used as the key of the evaluation history in the returned dict.

    Returns
    -------
    dict
        {'train': [train accuracy per epoch],
         val:     [[eval accuracy per epoch], [eval F1 per epoch]]}

    Raises
    ------
    ValueError
        If `tr_dataloader` yields no examples.
    """
    train_acc = []
    val_acc = []
    val_f1 = []
    for ep in range(epochs):
        model.train()
        epoch_loss = 0
        epoch_correct = 0
        no_observations = 0

        for ids, labels in tr_dataloader:
            ids, labels = ids.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            out = model(ids, labels=labels)

            loss = out[0]
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            preds = out[1].detach().cpu().numpy()
            tp, fp, fn, tn = model_performance(np.argmax(preds, axis=1), labels.cpu().numpy())
            epoch_loss += loss.item()*labels.shape[0]
            no_observations += labels.shape[0]
            epoch_correct += (tp+tn)

        if no_observations == 0:
            raise ValueError("train_model() received an empty training dataloader")

        # Release cached memory once per epoch rather than once per batch:
        # a full garbage collection on every step slows training considerably.
        gc.collect()
        torch.cuda.empty_cache()

        # eval
        valid_loss, valid_acc, valid_f1 = eval(vl_dataloader, model)
        epoch_loss, epoch_acc = epoch_loss / no_observations, epoch_correct / no_observations
        train_acc.append(epoch_acc)
        val_acc.append(valid_acc)
        val_f1.append(valid_f1)

        if val == 'val':
            print(f'| Epoch: {ep+1} | Train Loss: {epoch_loss:.4f} | Train Accuracy: {epoch_acc:.4f} | \
            Val. Loss: {valid_loss:.4f} | Val. Accuracy: {valid_acc:.4f} |')
        elif val == 'test':
            print(f'| Epoch: {ep+1} | Train Loss: {epoch_loss:.4f} | Train Accuracy: {epoch_acc:.4f} | \
            Test Loss: {valid_loss:.4f} | Test Accuracy: {valid_acc:.4f} |')

    performance = dict()
    performance['train'] = train_acc
    performance[val] = [val_acc, val_f1]

    return performance

### CV Setup

In [None]:
### Train loop

def train_bert_cv(data, train_labels, bert_type='base', augment=True, max_len=256, 
                  warmup=0.12, lr=2e-05, num_epochs=8, batch_size=4, seed=1, 
                  aug_data_size=1, return_details=False):
    """
    10-fold cross-validation of a BERT sequence classifier (2 labels).

    A fresh pretrained model is fine-tuned for every fold with `train_model`.

    Parameters
    ----------
    data : sequence of str
        Texts to split into folds.
    train_labels : sequence of int
        Label for each entry of `data`, in the same order.
    bert_type : {'base', 'large'}
        Selects 'bert-base-uncased' or 'bert-large-uncased'.
    augment : bool
        When true, augmented texts from the notebook-level `aug_dataset` are
        added to the training part of each fold (never to the validation
        part). `aug_dataset` is expected to hold `aug_data_size` consecutive
        copies of the dataset, each in the same order as `data`; entries that
        are not strings are skipped.
    max_len : int
        Length every text is padded/truncated to.
    warmup : float
        Fraction of the total optimisation steps used for linear warm-up.
    lr, num_epochs, batch_size, seed : optimisation settings.
    aug_data_size : int
        Number of augmented copies in `aug_dataset`.
    return_details : bool
        When true, the held-out test set (notebook-level `test_data` /
        `test_labels`) is also evaluated after every fold and a dict with all
        collected metrics is returned instead of the plain list.

    Returns
    -------
    list
        Final-epoch validation accuracy of every fold (default), or, with
        `return_details=True`, a dict with keys 'val_acc', 'val_f1',
        'test_acc', 'test_f1' (one entry per fold) and 'history' (fold index
        -> dict returned by `train_model`).

    Raises
    ------
    ValueError
        If `bert_type` is not 'base' or 'large'.
    """
    pretrained_names = {'base': 'bert-base-uncased', 'large': 'bert-large-uncased'}
    if bert_type not in pretrained_names:
        raise ValueError(f"bert_type must be 'base' or 'large', got {bert_type!r}")
    pretrained_name = pretrained_names[bert_type]

    labels = copy.deepcopy(train_labels)
    n_samples = len(labels)

    # define some constants
    N_SPLITS = 10
    DEVICE = torch_setup()
    special_tokens_dict = {'additional_special_tokens': ['...']}

    # Set random seed & set device
    set_seed(seed)

    fold_histories = dict()
    results, results_f1, test_results, test_results_f1 = [], [], [], []

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

    tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    tokenizer.add_special_tokens(special_tokens_dict)

    if augment:
        assert aug_data_size == len(aug_dataset)//n_samples

    # The test set is identical for every fold, so it is tokenised only once.
    test_dataloader = None
    if return_details:
        test_embedding = tokenizer(test_data, max_length=max_len,
                                   padding='max_length', truncation=True,
                                   return_tensors='pt')
        test_dataset = Dataset_BERT(test_embedding["input_ids"],
                                    torch.tensor(test_labels))
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
        gc.collect()
        torch.cuda.empty_cache()

        fold_train_data = [data[j] for j in train_idx]
        fold_val_data = [data[j] for j in val_idx]
        fold_train_labels = [labels[j] for j in train_idx]
        fold_val_labels = [labels[j] for j in val_idx]

        if augment:
            for copy_no in range(aug_data_size):
                # Start of the copy_no-th augmented copy inside aug_dataset.
                offset = copy_no * n_samples
                for j in train_idx:
                    candidate = aug_dataset[j + offset]
                    # Filter on the element actually used so texts and labels
                    # always stay aligned.
                    if isinstance(candidate, str):
                        fold_train_data.append(candidate)
                        fold_train_labels.append(labels[j])

        fold_train_data, fold_train_labels = shuffle(fold_train_data, fold_train_labels,
                                                     random_state=seed)
        fold_val_data, fold_val_labels = shuffle(fold_val_data, fold_val_labels,
                                                 random_state=seed)

        train_embedding = tokenizer(fold_train_data, max_length=max_len,
                                    padding='max_length', truncation=True,
                                    return_tensors='pt')
        val_embedding = tokenizer(fold_val_data, max_length=max_len,
                                  padding='max_length', truncation=True,
                                  return_tensors='pt')

        # NOTE(review): only input_ids are kept, so no attention mask reaches
        # the model and padded positions are presumably attended to - confirm
        # whether that is intended.
        train_dataset = Dataset_BERT(train_embedding["input_ids"],
                                     torch.tensor(fold_train_labels))
        val_dataset = Dataset_BERT(val_embedding["input_ids"],
                                   torch.tensor(fold_val_labels))

        gc.collect()
        torch.cuda.empty_cache()

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

        bert_model = BertForSequenceClassification.from_pretrained(pretrained_name,
                                                                   num_labels=2)
        # Keep the embedding matrix in step with the tokenizer in case
        # add_special_tokens enlarged the vocabulary (no-op otherwise).
        bert_model.resize_token_embeddings(len(tokenizer))
        bert_model.to(DEVICE)

        optimizer = AdamW(bert_model.parameters(), lr=lr)

        total_steps = len(train_dataloader) * num_epochs
        wu_steps = int(total_steps * warmup)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=wu_steps,
                                                    num_training_steps=total_steps)

        torch.cuda.empty_cache()
        performances = train_model(bert_model, train_dataloader, val_dataloader,
                                   num_epochs, optimizer, scheduler)

        fold_histories[fold] = performances
        results.append(performances["val"][0][-1])
        results_f1.append(performances["val"][1][-1])

        if return_details:
            _, test_acc, test_f1 = eval(test_dataloader, bert_model)
            test_results.append(test_acc)
            test_results_f1.append(test_f1)

    if return_details:
        return {
            'val_acc': results,
            'val_f1': results_f1,
            'test_acc': test_results,
            'test_f1': test_results_f1,
            'history': fold_histories,
        }

    return results

In [None]:
### Train loop

def train_bert_test(train_data, train_labels, test_data, test_labels, num_epochs=8, 
                    batch_size=4, bert_type='base', max_len=256, lr=2e-05, 
                    warmup=0.12, augment=True, save_models=False, seed=1, 
                    aug_data_size=1):
    """
    Fine-tune a BERT sequence classifier (2 labels) on the full training set
    and report performance on the test set after every epoch.

    Parameters
    ----------
    train_data, train_labels : training texts and their labels (not modified).
    test_data, test_labels : held-out texts and labels, evaluated by
        `train_model` after each epoch.
    num_epochs, batch_size, lr : optimisation settings.
    bert_type : {'base', 'large'}
        Selects 'bert-base-uncased' or 'bert-large-uncased'.
    max_len : int
        Length every text is padded/truncated to.
    warmup : float
        Fraction of the total optimisation steps used for linear warm-up.
    augment : bool
        When true, augmented texts from the notebook-level `aug_dataset` are
        appended to the training data. `aug_dataset` is expected to hold
        `aug_data_size` consecutive copies of the training set in the same
        order as `train_data`; entries that are not strings are skipped.
    save_models : bool
        When true, the whole model object is written with `torch.save` to
        'bert_250821_<seed>.h5' in the working directory.
    seed : int
        Passed to `set_seed` and used for shuffling.
    aug_data_size : int
        Number of augmented copies in `aug_dataset`.

    Returns
    -------
    The fine-tuned model, located on the notebook-level `DEVICE` and left in
    eval mode.

    Raises
    ------
    ValueError
        If `bert_type` is not 'base' or 'large'.
    """
    pretrained_names = {'base': 'bert-base-uncased', 'large': 'bert-large-uncased'}
    if bert_type not in pretrained_names:
        raise ValueError(f"bert_type must be 'base' or 'large', got {bert_type!r}")
    pretrained_name = pretrained_names[bert_type]

    # Work on copies so the caller's lists are not extended by augmentation.
    tr_labels = copy.deepcopy(train_labels)
    tr_data = copy.deepcopy(train_data)
    n_samples = len(train_labels)

    special_tokens_dict = {'additional_special_tokens': ['...']}

    # Set random seed & set device
    set_seed(seed)

    tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    tokenizer.add_special_tokens(special_tokens_dict)

    if augment:
        assert aug_data_size == len(aug_dataset)//n_samples
        for copy_no in range(aug_data_size):
            # Start of the copy_no-th augmented copy inside aug_dataset.
            offset = copy_no * n_samples
            for j in range(n_samples):
                candidate = aug_dataset[j + offset]
                # Filter on the element actually used so texts and labels
                # always stay aligned.
                if isinstance(candidate, str):
                    tr_data.append(candidate)
                    tr_labels.append(train_labels[j])

    tr_data, tr_labels = shuffle(tr_data, tr_labels, random_state=seed)

    train_embedding = tokenizer(tr_data, max_length=max_len,
                                padding='max_length', truncation=True,
                                return_tensors='pt')

    test_embedding = tokenizer(test_data, max_length=max_len,
                               padding='max_length', truncation=True,
                               return_tensors='pt')

    # NOTE(review): only input_ids are kept, so no attention mask reaches the
    # model and padded positions are presumably attended to - confirm whether
    # that is intended.
    train_dataset = Dataset_BERT(train_embedding["input_ids"],
                                 torch.tensor(tr_labels))

    test_dataset = Dataset_BERT(test_embedding["input_ids"],
                                torch.tensor(test_labels))

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    bert_model = BertForSequenceClassification.from_pretrained(pretrained_name,
                                                               num_labels=2)
    # Keep the embedding matrix in step with the tokenizer in case
    # add_special_tokens enlarged the vocabulary (no-op otherwise).
    bert_model.resize_token_embeddings(len(tokenizer))
    bert_model.to(DEVICE)

    optimizer = AdamW(bert_model.parameters(), lr=lr)

    total_steps = len(train_dataloader) * num_epochs
    wu_steps = int(total_steps * warmup)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=wu_steps,
                                                num_training_steps=total_steps)

    # The evaluation split here is the test set, so label the log accordingly.
    # train_model already evaluates it after every epoch, so no separate final
    # evaluation pass is needed.
    train_model(bert_model, train_dataloader, test_dataloader, num_epochs,
                optimizer, scheduler, val='test')

    # Make sure the returned model is in inference mode even if no epoch ran.
    bert_model.eval()

    if save_models:
        # NOTE: this pickles the entire model object (not just a state_dict).
        torch.save(bert_model, f'bert_250821_{seed}.h5')

    return bert_model

### Eval

In [None]:
def eval_text(dict_by_id, model, labels, csv_filename, result_dict=None, 
              save=False, tokenizer=None, max_len=256):
    """
    Compute class probabilities for each text in `dict_by_id`.

    Parameters
    ----------
    dict_by_id : dict
        Maps a sample id to its text.
    model : torch.nn.Module callable as `model(ids, labels=label)`
        Element 1 of its output is taken as the per-class scores. Inputs are
        moved to the device of the model's first parameter.
    labels : sequence
        `labels[i]` is passed to the model for the i-th key of `dict_by_id`
        (iteration order); it does not influence the stored probabilities.
    csv_filename : str
        Destination file, only used when `save` is true.
    result_dict : dict, optional
        Existing dict to add the results to; a new one is created if omitted.
    save : bool
        When true, `result_dict` is written as a one-row CSV whose header is
        the sample ids.
    tokenizer : callable, optional
        Tokenizer to use. Defaults to a fresh 'bert-base-uncased' tokenizer;
        pass the tokenizer used for training to keep tokenisation identical
        (e.g. when special tokens were added).
    max_len : int
        Length every text is padded/truncated to.

    Returns
    -------
    dict
        `result_dict`, mapping each sample id to a numpy array holding the
        softmax of the model's scores for that text.
    """
    if result_dict is None:
        result_dict = dict()
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    model.eval()
    # Follow the model instead of a notebook-level constant so inputs and
    # weights are always on the same device.
    device = next(model.parameters()).device

    with torch.no_grad():
        for i, idx in enumerate(dict_by_id.keys()):
            embedding = tokenizer(dict_by_id[idx], max_length=max_len,
                                  padding='max_length', truncation=True,
                                  return_tensors='pt')
            label = torch.tensor(labels[i]).to(device)
            # return_tensors='pt' already yields a leading batch dimension.
            ids = embedding['input_ids'].to(device)
            out = model(ids, labels=label)
            scores = out[1]

            # Explicit dim: normalise across the classes of the single example.
            result_dict[idx] = torch.softmax(
                scores.squeeze().detach().cpu(), dim=-1).numpy()

    if save:
        # newline='' is what the csv module expects; it prevents blank rows
        # on platforms that translate line endings.
        with open(csv_filename, 'w', newline='') as f:
            w = csv.DictWriter(f, result_dict.keys())
            w.writeheader()
            w.writerow(result_dict)

    return result_dict

### Main

In [None]:
DEVICE = torch_setup()

# Folder containing the text files that list the sample ids of each class
PATH = 'path'

# Ids of the AD subjects, one per line of the file
with open(f'{PATH}/audio_filenames_dementia.txt', "r") as clf:
    lines = clf.readlines()
ids_ad = [line.replace('\n', '') for line in lines]

# Ids of the control subjects, one per line of the file
with open(f'{PATH}/audio_filenames_control.txt', "r") as clf:
    lines = clf.readlines()
ids_hc = [line.replace('\n', '') for line in lines]

# Placeholder locations - point these at the actual data before running
path_ad = 'path_to_ad_data'
path_hc = 'path_to_hc_data'
test_path = 'path_to_test_data'
test_id_path = 'path_to_a_txt_file_with_test_sample_labels'

# Texts, labels and augmented texts per class (augmentation switched off here)
data_ad, labels_ad, aug_dataset_ad = data_to_str(
    ids_ad, path_ad, AD_flag=1, augment=False)
data_hc, labels_hc, aug_dataset_hc = data_to_str(
    ids_hc, path_hc, AD_flag=0, augment=False)

# Combined collections: AD entries first, control entries second
dataset = [*data_ad, *data_hc]
aug_dataset = [*aug_dataset_ad, *aug_dataset_hc]
aug_labels = [*labels_ad, *labels_hc]
labels = [*labels_ad, *labels_hc]

test_data, test_labels, dict_by_id = test_data_to_str(
    test_id_path, test_path, return_dict=True)

Torch version: 1.10.0+cu111, CUDA: 11.1


In [None]:
# Train one model per seed on the un-augmented training set. Augmentation is
# switched off explicitly (the data above was loaded with augment=False)
# instead of relying on aug_data_size=0 passing the size check inside
# train_bert_test. Only the model of the last seed stays bound to bert_model.
for i in range(1, 6):
    print("SEED ", i)
    bert_model = train_bert_test(dataset, labels, test_data, test_labels, 
                                 augment=False, aug_data_size=0, seed=i)
    