**Imports and setup**

In [None]:
import numpy as np
import pandas as pd

import transformers
from transformers import AutoTokenizer, AutoModel
from huggingface_hub import notebook_login
 
import torch
from torch.utils.data import Dataset, DataLoader, Subset

from sklearn import metrics
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder

In [None]:
# Pick the compute device: use the GPU when PyTorch can see one, otherwise the CPU

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Log in to the Hugging Face Hub with an access token; per the original note
# this is needed to use MentalBERT (the checkpoint loaded in later cells).
notebook_login()

In [None]:
# Load the annotated sentences. Later cells read the 'Sentence' and 'Subject'
# columns by name and several other columns by position (1, 25 and 26-31),
# so the column order of this CSV matters.
train = pd.read_csv('all-gendered.csv')

In [None]:
# Pack the symptom-target columns (positions 26-31) into one list per row,
# then keep just the sentence text and that combined target list.
train['list'] = train.iloc[:, 26:32].values.tolist()
new_train = train.loc[:, ['Sentence', 'list']].copy()

In [None]:
# Hugging Face Hub identifier handed to AutoTokenizer.from_pretrained and
# AutoModel.from_pretrained in the cells below.
model_checkpoint = "mental/mental-bert-base-uncased"

**Dataset and DataLoader**

In [None]:
# Hyperparameters as used by the BDI-Sen authors
MAX_LEN = 128            # every sentence is padded/truncated to this many tokens
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 2e-05
SEED = 10                # one named seed instead of a magic number in the call below
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Seed NumPy as well as PyTorch so that any NumPy-based randomness added later
# is reproducible too; the torch seed value is unchanged.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Dataset that tokenizes one sentence at a time and pairs it with its target vector

class CustomDataset(Dataset):
    """Map-style dataset yielding a tokenized sentence and its multi-label target.

    Args:
        dataframe: Frame with a ``Sentence`` column (text) and a ``list`` column
            holding one list of numeric labels per row.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: Length each sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        # Bracket access avoids any clash between a column name and a
        # DataFrame attribute (the target column is literally called 'list').
        self.text = dataframe['Sentence']
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (sentences) in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Samplers and Subset hand out positions in range(len(self)), so look
        # rows up positionally. Label-based `series[index]` breaks (KeyError or
        # the wrong row) as soon as the frame's index is not 0..n-1, e.g. after
        # filtering rows.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace into single spaces.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Wrap the sentence/target frame in the tokenizing dataset used by every fold

training_set = CustomDataset(
    dataframe=new_train,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

**Model**

In [None]:
# Custom classifier: pretrained encoder followed by a dropout layer and a linear output layer

class MBERTClass(torch.nn.Module):
    """Pretrained encoder with a dropout + linear multi-label head.

    Args:
        num_labels: Number of output logits (defaults to the 6 used originally).
        dropout: Dropout probability applied to the pooled encoder output
            (defaults to the original 0.2).

    The sub-module names ``l1``/``l2``/``l3`` are kept so previously saved
    state_dicts still load.
    """

    def __init__(self, num_labels=6, dropout=0.2):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_checkpoint)
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config rather than hard-coding
        # 768, so a checkpoint with a different hidden width still works.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits for a batch of token id tensors."""
        # With return_dict=False the encoder returns a tuple; the second item
        # is the pooled output that feeds the classification head.
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        dropped = self.l2(pooled)
        return self.l3(dropped)

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and float targets.

    Args:
        outputs: Tensor of logits (no sigmoid applied).
        targets: Float tensor of the same shape as ``outputs``.

    Returns:
        Scalar tensor: the loss averaged over every element.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(
        outputs, targets, reduction='mean'
    )

In [None]:
def trainer(epoch, loader):
    """Run one training pass over ``loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``device`` and
    ``loss_fn``; updates the model's weights in place and returns nothing.

    Args:
        epoch: Epoch number, used only in the progress message.
        loader: Iterable of batches shaped like ``CustomDataset`` items
            (keys ``ids``, ``mask``, ``token_type_ids``, ``targets``).
    """
    model.train()
    for step, batch in enumerate(loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        labels = batch['targets'].to(device, dtype=torch.float)
        logits = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(logits, labels)

        # Progress line on the first batch and then every 5000th batch.
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [None]:
# DataLoader settings. Training batches are reshuffled every epoch; validation
# batches keep a fixed order so evaluation is repeatable and does not draw from
# the global RNG (metrics over the full split do not depend on batch order).
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
              'shuffle': False,
              'num_workers': 0
              }

**Cross-validation**

In [None]:
# Nine folds: per the original note this allows for one depressed female in each validation split
k = 9
splits = StratifiedGroupKFold(
    n_splits=k,
    shuffle=True,
    random_state=0,
)
per_fold_result = dict()

In [None]:
# Stratification key: pair the values of columns 1 and 25 for every row
# (per the original note these are depression/control status and gender)
train['labelgen'] = train.iloc[:, [1, 25]].values.tolist()

In [None]:
# Helper that turns multi-column labels into a single class id per row
def make_one_label(y):
    """Encode each row of ``y`` as one integer class id.

    Every row is converted to its string representation and the distinct
    strings are numbered by ``LabelEncoder``.

    Args:
        y: Iterable of rows (e.g. a 2-D array of label combinations).

    Returns:
        Array of integer ids, one per row of ``y``.
    """
    row_keys = [str(row) for row in y]
    encoder = LabelEncoder()
    return encoder.fit_transform(row_keys)

In [None]:
# Build the single stratification label per row from the paired 'labelgen' values
# (per the original note: gender x depression, resulting in 4 labels)
targets = np.array(train['labelgen'].tolist())
y_labelgen = make_one_label(targets)

In [None]:
# Cast subject ids to strings; this column is passed as the `groups` argument
# of splits.split(...) in the cross-validation cell below.
train['Subject'] = train['Subject'].astype(str)

In [None]:
# Cross-validation: train a fresh model on each fold and score it on the held-out groups

# Per-fold results: model weights, validation indices and validation metrics.
history = {'models' : [], 'val' : [], 'valid_loss': [], 'valid_acc':[], 'valid_f1_micro':[], 'valid_f1_macro':[], 'valid_f1_weighted': []}

for i, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(training_set)), y_labelgen, train['Subject'])):

    print("Fold no.{}:".format(i + 1))

    train_data = Subset(training_set, train_idx)
    val_data = Subset(training_set, val_idx)

    train_loader = DataLoader(train_data, **train_params)
    valid_loader = DataLoader(val_data, **val_params)

    # New weights and optimizer state per fold; `trainer` reads these
    # notebook-level names.
    model = MBERTClass()
    model.to(device)

    optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

    for epoch in range(EPOCHS):
        trainer(epoch, train_loader)

    # Store a CPU copy of the weights. Keeping the live state_dict would pin
    # every fold's parameters on the training device for the rest of the run
    # and put device-bound tensors into the saved history.
    history['models'].append(
        {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
    )
    history['val'].append(val_idx)

    model.eval()
    fin_targets=[]
    fin_outputs=[]
    loss_sum = 0.0   # sum of per-batch mean losses, weighted by batch size
    n_seen = 0       # number of validation samples processed
    with torch.no_grad():
        for data in valid_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            batch_size = targets.size(0)
            loss_sum += loss_fn(outputs, targets).item() * batch_size
            n_seen += batch_size
            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())

    # Mean loss over the whole validation split as a plain float (previously
    # only the last batch's loss tensor was kept). NaN if the split is empty.
    loss = loss_sum / n_seen if n_seen else float('nan')

    # Threshold sigmoid probabilities at 0.5 to get binary predictions.
    outputs = (np.array(fin_outputs) >= 0.5).astype(int)
    targets = [[int(num) for num in sublist] for sublist in fin_targets]
    accuracy = metrics.accuracy_score(targets, outputs)
    print('Valid loss: ', loss, 'Valid accuracy: ', accuracy)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro', zero_division = 0.0)
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro', zero_division = 0.0)
    f1_score_weighted = metrics.f1_score(targets, outputs, average='weighted', zero_division = 0.0)
    history['valid_loss'].append(loss)
    history['valid_acc'].append(accuracy)
    history['valid_f1_micro'].append(f1_score_micro)
    history['valid_f1_macro'].append(f1_score_macro)
    history['valid_f1_weighted'].append(f1_score_weighted)

In [None]:
# Save the history dict (per-fold model weights, validation indices and
# performance measures) to disk.

# The notebook already relies on f-strings, so it only runs on Python 3, where
# `cPickle` does not exist; the plain `pickle` module is all that is needed.
import pickle

# NOTE: unpickling executes arbitrary code, so only load this file from a trusted source.
with open('9-CV-Category-history.p', 'wb') as fp:
    pickle.dump(history, fp, protocol=pickle.HIGHEST_PROTOCOL)