### Semantic features as word embeddings

In [1]:
import numpy as np
import os
import torch
import json
from tqdm import tqdm
import scipy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor

from tqdm import tqdm
from numpy import dot
from numpy.linalg import norm
from sklearn.model_selection import LeavePOut
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def cosim(a, b):
    return 1 - spatial.distance.cosine(a, b)

### Semantic features

In [3]:
min_len_line = 5
N_SEMANTIC_FEATURES = 25
semantic_features = {}

def dump_mitchell_web_semantic_features(raw_file = os.path.join("data","mitchell_semantic_raw.txt")):
    with open(raw_file, "r") as datafile:
        lines = datafile.readlines()
        word = None

        for line in lines:

            # Skip empty
            if len(line) >= min_len_line:

                # New feature
                if "Features for" in line:

                    # Discard invalid ones (once fully parsed)
                    if word and len(semantic_features[word]['features']) < N_SEMANTIC_FEATURES: del semantic_features[word] 
                        
                    word = line.split("<a name=\"")[1].split("\"")[0]
                    semantic_features[word] = { "features": [], "values": []}

                elif word:
                    feature_name = line.split("(")[0]
                    val = float(line.split("(")[1].split(")")[0])
                    semantic_features[word]["features"].append(feature_name)
                    semantic_features[word]["values"].append(val)

    # Save to file
    #with open(os.path.join('data', 'mitchell_semantic_features.json'), 'w') as fp:
    #    json.dump(semantic_features, fp)

    return semantic_features


def load_sorted_semantic_features(file = os.path.join("data","mitchell_semantic_features.json")):
    with open(file) as f:
        semantic_features = json.load(f)
        for word in semantic_features.keys():
            # Sort all features
            sorted_features = sorted(enumerate(semantic_features[word]["features"]), key=lambda x:x[1])
            sorted_indices = [i[0] for i in sorted_features]
            sorted_values = [semantic_features[word]["values"][i] for i in sorted_indices]

            # Re-store them
            semantic_features[word]["features"] = [x[1] for x in sorted_features]
            semantic_features[word]["values"] = sorted_values
            break

    return semantic_features
            

### fMRI data

In [4]:
def get_mitchell_original_data(subject = 1, random_voxels = None):
    mdata = scipy.io.loadmat(os.path.join("data", "mitchell", f"data-science-P{subject}.mat"))
    subject_data = {}

    # 6 x 60 trials
    for i in range(mdata["data"][:].shape[0]):
        cond, cond_number, word, word_number, epoch = [x[0] for x in mdata["info"][0][i]]

        # Set trial data
        if epoch[0] not in subject_data: subject_data[epoch[0]] = {}

        if random_voxels:
            random_voxels_idx = np.random.choice(mdata["data"][i][0][0].shape[0], random_voxels)
            subject_data[epoch[0]][word] = mdata["data"][i][0][0][random_voxels_idx]
        else: subject_data[epoch[0]][word] = mdata["data"][i][0][0]

    return subject_data

### Voxel selection methods

In [5]:
def r2_best_voxels(scores, K, threshold = 0.2):
    scores = np.array(scores)
    r2_selected_voxels = np.where(scores > threshold)[0]
    return np.array(
        sorted( # sort by score, pick first K indices
            list(zip(scores[r2_selected_voxels], r2_selected_voxels)), 
            key = lambda x: x[0]
        )
    )[:K, 1].astype(np.int32).tolist()

In [6]:
def mitchell_stable_voxels(voxel_matrices, train_split_indices, K = 500):

    # Get scores of the voxels
    scores = []
    for vx in voxel_matrices:
        u, s, vh = np.linalg.svd(vx[:, train_split_indices], full_matrices=True) # SVD, take first eigenvalue
        scores.append((s**2)[0])
        
    # indices of the most stable voxels
    return np.argpartition(scores, -K)[-K:]

In [7]:
def get_voxels_matrices(data):
    voxels = data[1]["bell"].shape[0]

    repetitions = []
    for vx in (range(voxels)):
        repetitions.append(np.array(
            [[data[epoch][word][vx] for word in data[epoch].keys()] for epoch in data.keys()]
        ))
    return np.array(repetitions)

In [8]:
%%time
completeFmriData = get_mitchell_original_data(subject=1)
voxels = get_voxels_matrices(completeFmriData)
voxels = mitchell_stable_voxels(voxels, list(range(58)))

CPU times: total: 2.06 s
Wall time: 2.15 s


### Training loop

In [9]:
def leave_2_out_accuracy(y_pred, y_test):

    p1, p2 = y_pred
    i1, i2 = y_test

    pair1_score = cosim(p1, i1) + cosim(p2, i2)
    pair2_score = cosim(p1, i2) + cosim(p2, i1)

    return int(pair1_score > pair2_score)

In [10]:
epoch = 1
subjects = 9
leave2out = LeavePOut(2)
K = 500
VOXELWISE_ACC_THRESHOLD = 0.2

semantic_features = load_sorted_semantic_features()
N_words = len(semantic_features.keys())

In [None]:
accuracies = np.zeros((subjects, 3))

for subject in range(1, subjects+1):

    print(f"**** Subject {subject} ****")

    completeFmriData = get_mitchell_original_data(subject=subject)
    voxel_matrices = get_voxels_matrices(completeFmriData) # pre-compute voxel 6x58 matrices
    
    data = completeFmriData[epoch]
    n_voxels = data["bell"].shape[0]

    # Training
    X = []
    Y = []
    for word in semantic_features.keys():
        if word in data.keys():
            x = np.array(semantic_features[word]["values"])
            y = np.array(data[word])
            X.append(x)
            Y.append(y)

    X = np.array(X)
    Y = np.array(Y)

    # leave 2 out cross validation
    accuracies_r2 = []
    accuracies_most_stable = []
    accuracies_mitchell = []
    progress_bar = tqdm(range(leave2out.get_n_splits(X)))
    
    for i, (train_index, test_index) in enumerate(leave2out.split(X)):
        
        # Train-test split
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]
            
        # Early voxel selection
        # this is the most expensive operation! the overhead lies in fetching
        mitchell_voxels = mitchell_stable_voxels(voxel_matrices, train_index, K=K) # extract svds, score voxels, pick best 500
        
        # Predicting & scoring
        predictors = make_pipeline(StandardScaler(), MultiOutputRegressor(LinearRegression(), n_jobs=32))
        predictors.fit(X_train, y_train[:, mitchell_voxels])
        
        # Mitchell stable voxels
        y_pred = predictors.predict(X_test)
        accuracies_mitchell.append(
            leave_2_out_accuracy(y_pred, y_test[:, mitchell_voxels])
        )

        """
        # R2 best voxels
        scores = [r2_score(y_pred[:, i], y_test[:, vx]) for i,vx in enumerate(mitchell_voxels)]
        r2_best = r2_best_voxels(scores, K = K)
        print(r2_best)
        accuracies_r2.append(
            leave_2_out_accuracy(y_pred, y_test[:, r2_best])
        )
        """

        progress_bar.update(1)
            

    # Subject mean accuracies
    accuracies[subject-1] = np.array(
        [np.mean(accuracies_mitchell), 0, 0]
    )
    print(f"Accuracy: {np.mean(accuracies_mitchell):.2f}")
    
    with open('accuracies_semantic2fmri.npy', 'wb') as f:
        np.save(f, accuracies)


| method | subject | voxel selection | accuracy |
|---|---|---|---|
| multiple regressors | 1 | Most stable | 0.60 |
| multiple regressors, no normalization | 1 | Most stable | 0.60 |
| Ridge (not voxel-wise), no normalization | 1 | Most stable | 0.45 |

Note from professor: there is no need to determine the set of best predicted voxels across multiple folds. We just compute the accuracy for each fold and then average.

**Observation**

In this case fitting is way more expensive, as 21k voxels are considered.

In [None]:
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

def best_K_predict(X, indices, predictors):
    predictors = [predictors[idx] for idx in indices]
    y_hat = np.array([predictor.predict(X) for predictor in predictors]) # voxels, sample
    return y_hat.reshape(y_hat.shape[1], y_hat.shape[0]) # sample, voxels

# voxel_indices

y_hat = best_K_predict(X_train, mitchell_voxels, predictors)
y = y_train[:, mitchell_voxels]

RDM_hat = np.matmul(y_hat, np.matrix.transpose(y_hat))

RDM = np.matmul(y, np.matrix.transpose(y))

test_pearson = pearsonr(
    RDM_hat.flatten(),
    RDM.flatten()
)

print(f"Test RDMs R^2:\t{test_pearson}")

plt.subplot(121)
plt.title("Truth")
plt.imshow(RDM)
plt.colorbar()

plt.subplot(122)
plt.title("Prediction")
plt.imshow(RDM_hat)
plt.colorbar()

**Observation**

Here the the voxels from the last cross_val iteration have been selected. For these voxels, the object to object distance matrices have similar patterns.