# Development of brain-fine-tuning code

## TODO



In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import BertModel
from datasets import load_dataset

# Pre-processing for Harry Potter dataset

## ONLY FOR SUBJECT 1

In [33]:
import numpy as np
from scipy.io import loadmat

# Load the subject-1 recording; the resulting dict is indexed below by the keys
# 'words', 'time' and 'data'.
harry_potter = loadmat('/home/ubuntu/NLP-brain-biased-robustness/data/harry_potter_brain/subject_1.mat')

# Each of the 5176 entries under 'words' is unpacked twice: once for the word
# itself and once for the time attached to it.
word_entries = harry_potter['words'][0]
words = [word_entries[i][0][0][0][0] for i in range(5176)]
word_times = [word_entries[i][1][0][0] for i in range(5176)]

# One time value per fMRI row (1351 rows, first column of 'time').
tr_times = [harry_potter['time'][i, 0] for i in range(1351)]

# Half-open [start, stop) row ranges that are dropped from the recording.
# NOTE(review): presumably these are the run boundaries / rest periods -- confirm.
excluded_row_ranges = [(0, 15), (335, 355), (687, 707), (966, 986), (1346, 1351)]
dont_include_indices = [
    row
    for start, stop in excluded_row_ranges
    for row in range(start, stop)
]

# Remove the excluded rows from both the fMRI matrix and the matching time stamps
# so the two stay aligned row for row.
X_fmri = harry_potter['data']
useful_X_fmri = np.delete(X_fmri, dont_include_indices, axis=0)

tr_times_arr = np.asarray(tr_times)
useful_tr_times = np.delete(tr_times_arr, dont_include_indices)

# For every kept fMRI row, gather the words whose time stamp lies in the closed
# window [row_time - 10, row_time]; windows of neighbouring rows can overlap.
sentences = [
    [word for word, word_time in zip(words, word_times)
     if useful_tr_time - 10 <= word_time <= useful_tr_time]
    for useful_tr_time in useful_tr_times
]

# Flatten each word list into a single string. Every word is followed by one
# space, so non-empty sentences end with a trailing space.
actual_sentences = [
    ''.join(word + ' ' for word in sentence)
    for sentence in sentences
]
        

# Wrap the filtered fMRI matrix as a torch tensor (one row per kept time point).
fmri = torch.as_tensor(useful_X_fmri)
# First five rows, kept as a small sample for inspection.
truth_fmri = fmri[:5,:]
# Bare expression: only displayed when it is the last statement of a cell.
truth_fmri.shape


from torch.utils.data import DataLoader

# Pair every sentence string with the fMRI row recorded for the same time point.
dataset = [(actual_sentences[row], fmri[row, :]) for row in range(1271)]

#TRAIN TEST SPLIT HAS OVERLAP IN WORDS AND IN BRAIN STATE
# Training uses the first 70% of rows and validation the last 20%; the rows in
# between (70%-80%) are placed in neither split.
n_rows = len(dataset)
train_stop = int(.7*n_rows)
val_start = int(.8*n_rows)
train_dataset = dataset[:train_stop]
val_dataset = dataset[val_start:]

# Both loaders share the same batching options (batches of 8, shuffled).
loader_options = dict(batch_size=8, shuffle=True)
train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(val_dataset, **loader_options)

# Pre-processing for parcellated NSD (23-dimensional)

In [6]:
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import h5py
from pycocotools.coco import COCO
import time
import csv
import torch

# Root folder holding the NSD files used in this cell (note the trailing slash:
# file names are appended to it directly).
data_path = '/home/ubuntu/NLP-brain-biased-robustness/NSD/'

# COCO caption indexes: coco3 wraps the train2017 captions, coco4 the val2017 ones.
annotations_dir = f"{data_path}annotations/"
coco3 = COCO(f"{annotations_dir}captions_train2017.json")
coco4 = COCO(f"{annotations_dir}captions_val2017.json")

def load_csv(csv_file):
    """Read a CSV file and return its data rows, skipping the header line.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A list with one entry per data row; each entry is the list of string
        fields of that row. The first line (header) is discarded. An empty
        file yields an empty list.
    """
    # `with` guarantees the handle is closed even if parsing raises, and
    # newline='' lets the csv module handle line endings / embedded newlines itself.
    with open(csv_file, newline='') as file:
        csvreader = csv.reader(file)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(csvreader, None)
        return list(csvreader)

# Stimulus table (header row already removed by load_csv) and the experiment
# design file, both read from data_path.
nsd_to_coco = load_csv(f"{data_path}nsd_stim_info_merged.csv")
exp_design = scipy.io.loadmat(f"{data_path}nsd_expdesign.mat")

data_size = 22500 #trials[subject-1] #can use more than 22500 trials if seems promising

# Both index arrays are shifted down by one ("fix indexing").
# NOTE(review): presumably converts the .mat file's 1-based indices to 0-based -- confirm.
ordering = exp_design['masterordering'].flatten() - 1
subjectim = exp_design['subjectim'] - 1

# Only the first data_size entries of the ordering are used.
ordering_data = ordering[:data_size]

def index_to_captions(my_index, subject):
    """Return the list of COCO caption strings for one trial of one subject.

    Args:
        my_index: Position into the module-level `ordering_data` array.
        subject: 1-based subject number; row `subject - 1` of `subjectim` is read.

    Returns:
        The 'caption' field of every annotation found for the image.
    """
    # trial position -> per-subject image slot -> id used to index nsd_to_coco
    nsd_id = subjectim[subject - 1, ordering_data[my_index]]
    coco_id = int(nsd_to_coco[nsd_id][1])

    # Ids below 2950 are looked up in coco4 (val2017 captions), all others in
    # coco3 (train2017 captions).
    # NOTE(review): the 2950 cut-off is taken as-is from the data layout -- confirm.
    coco_api = coco4 if int(nsd_id) < 2950 else coco3

    annotations = coco_api.loadAnns(coco_api.getAnnIds(coco_id))
    return [annotation['caption'] for annotation in annotations]

# Buffer indexed as [trial, feature, subject]; every slice is overwritten below.
NSD_fmri_parcellated = np.empty((22500, 23, 8))
for subject in range(8):
    # Files are named X1.mat ... X8.mat; each stores its matrix under the key 'X'.
    mat_file = f"{data_path}X{subject + 1}.mat"
    NSD_fmri_parcellated[:, :, subject] = scipy.io.loadmat(mat_file)['X']
    

# One (caption, brain vector) example per caption of every trial of every subject.
dataset = []
for subject_idx in range(8):
    for trial_idx in range(22500):
        # All captions of a trial share the same brain-vector view.
        brain_vec = NSD_fmri_parcellated[trial_idx, :, subject_idx]
        # index_to_captions expects a 1-based subject number.
        captions = index_to_captions(trial_idx, subject_idx + 1)
        dataset.extend((caption, brain_vec) for caption in captions)

#dataset is a list of ('sentence',23-dim numpy brain vector)

loading annotations into memory...
Done (t=1.05s)
creating index...
index created!
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!


# Model - BrainBiasedBERT

In [23]:
class BrainBiasedBERT(nn.Module):
    """BERT encoder with a linear head that maps text to an fMRI-sized vector.

    Args:
        num_voxels: Size of the predicted vector (output features of the
            linear head).
    """

    def __init__(self, num_voxels=37913):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        self.bert = BertModel.from_pretrained('bert-base-cased')
        # Read the encoder width from its config instead of hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_voxels)
        # Kept for backward compatibility with code that reads `model.device`;
        # forward() no longer relies on it.
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    def forward(self, x):
        """Predict one vector per input text.

        Args:
            x: A string or list of strings to encode.

        Returns:
            Tensor of shape (batch, num_voxels) computed from the hidden state
            of the first token of each sequence.
        """
        # Follow the module's *current* device so the model still works after
        # .to()/.cpu()/.cuda() calls made post-construction.
        device = self.linear.weight.device
        # truncation=True keeps over-long inputs within the encoder's maximum length
        # instead of failing inside the position embeddings.
        embeddings = self.tokenizer(x, return_tensors='pt', padding=True, truncation=True)
        embeddings = embeddings.to(device)
        representations = self.bert(**embeddings).last_hidden_state
        cls_representation = representations[:, 0, :]
        pred_fmri = self.linear(cls_representation)
        return pred_fmri

In [36]:
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import wandb

# Start the run and attach the hyperparameters to it. Passing `config=` to
# wandb.init stores them on the run itself; assigning a dict to `wandb.config`
# only rebinds the module attribute.
wandb.init(
    project="preliminary results just in case",
    entity="nlp-brain-biased-robustness",
    config={
        "learning_rate": 5e-5,
        "epochs": 15,
        "batch_size": 8,
    },
)

def evaluate(model, dataloader, loss_function=None):
    """Compute the mean per-batch loss of `model` over `dataloader`.

    Args:
        model: Module called with a list of strings (the first element of each
            batch) and returning a prediction tensor.
        dataloader: Iterable of (texts, targets) batches.
        loss_function: Callable `(preds, targets) -> scalar tensor`. Defaults to
            mean-squared error, the loss used for training in this notebook.

    Returns:
        A 0-dim CPU tensor holding the mean of the per-batch losses (NaN when
        the dataloader yields no batches).

    Side effects:
        Leaves the model in eval mode.
    """
    # Build the loss locally: there is no module-level `loss_function` to rely on.
    if loss_function is None:
        loss_function = torch.nn.MSELoss()

    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in dataloader:
            preds = model(list(batch[0]))
            # Put targets wherever the predictions live.
            labels = batch[1].to(preds.device)
            # .item() stores plain floats, so no device tensors are kept alive.
            batch_losses.append(loss_function(preds, labels.float()).item())

    return torch.mean(torch.as_tensor(batch_losses))

    
def train(model, dataloader, num_epochs=15, val_dataloader=None):
    """Fine-tune `model` with an MSE objective and simple early stopping.

    Args:
        model: Module called with a list of strings and returning predictions.
        dataloader: Training batches of (texts, targets).
        num_epochs: Maximum number of passes over `dataloader`.
        val_dataloader: Batches used for the end-of-epoch validation loss.
            Defaults to the notebook-level `test_dataloader`.

    Side effects:
        Moves the model to CUDA when available, logs to the active wandb run,
        and writes the state dict to 'fine_tuned_model' when training stops
        early (i.e. as soon as the validation loss is higher than in the
        previous epoch).
    """
    if val_dataloader is None:
        val_dataloader = test_dataloader  # notebook-level validation loader

    last_val_loss = float("inf")
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_function = torch.nn.MSELoss()
    num_training_steps = num_epochs * len(dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    # Register the model with wandb once, not on every optimisation step.
    wandb.watch(model)

    progress_bar = tqdm(range(num_training_steps))

    for epoch in range(num_epochs):
        model.train()
        # Last batch loss of the epoch; stays NaN if the loader yields nothing.
        train_loss = float("nan")
        for batch in dataloader:
            preds = model(list(batch[0]))
            labels = batch[1].to(device)
            loss = loss_function(preds, labels.float())
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Log a plain float rather than a tensor still attached to the graph.
            train_loss = loss.item()
            wandb.log({"loss": train_loss})
            progress_bar.update(1)

        val_loss = evaluate(model, val_dataloader)
        wandb.log({"training loss": train_loss, "val loss": float(val_loss)})
        if val_loss > last_val_loss:
            print('Stopped early')
            torch.save(model.state_dict(), 'fine_tuned_model')
            break
        last_val_loss = val_loss
        

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss,▅▄▅▇▆█▅▆▆█▄▆▆▆▆▅▆▅▅▆▆▃▄▅▅▄▄▅▃▅▅▇▄▁▄▃▄▂▄▄

0,1
loss,309724.34375


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Exception in thread Thread-11:
Traceback (most recent call last):
  File "/home/ubuntu/environments/my_env/lib/python3.8/site-packages/wandb/apis/normalize.py", line 22, in wrapper
    return func(*args, **kwargs)
  File "/home/ubuntu/environments/my_env/lib/python3.8/site-packages/wandb/sdk/internal/internal_api.py", line 1434, in upload_urls
    raise CommError(f"Run does not exist {entity}/{project}/{run_id}.")
wandb.errors.CommError: Run does not exist nlp-brain-biased-robustness/preliminary results just in case/1fp5cl53.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/ubuntu/environments/my_env/lib/python3.8/site-packages/wandb/filesync/upload_job.py", line 56, in run
    success = self.push()
  File "/home/ubuntu/environments/my_env/lib/python3.8/site-packages/wandb/filesync/upload_job.py", line 107, in push
    _, upload_he

In [37]:
# Build the model with its default output size and fine-tune it on the
# Harry Potter training batches.
model = BrainBiasedBERT()
train(model, train_dataloader)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1680 [00:00<?, ?it/s]

wandb: ERROR Summary data exceeds maximum size of 10.4MB. Dropping it.


In [38]:
# Write the current weights of `model` to the file 'fine_tuned_model'
# (the same file name train() uses when it stops early).
torch.save(model.state_dict(), 'fine_tuned_model')