### Semantic features as word embeddings

In [1]:
import numpy as np
import os
import torch
import json
from tqdm import tqdm
import scipy
import pandas as pd

### Semantic features

In [2]:
min_len_line = 5
N_SEMANTIC_FEATURES = 25
semantic_features = {}

def dump_mitchell_web_semantic_features(raw_file = os.path.join("data","mitchell_semantic_raw.txt")):
    with open(raw_file, "r") as datafile:
        lines = datafile.readlines()
        word = None

        for line in lines:

            # Skip empty
            if len(line) >= min_len_line:

                # New feature
                if "Features for" in line:

                    # Discard invalid ones (once fully parsed)
                    if word and len(semantic_features[word]['features']) < N_SEMANTIC_FEATURES: del semantic_features[word] 
                        
                    word = line.split("<a name=\"")[1].split("\"")[0]
                    semantic_features[word] = { "features": [], "values": []}

                elif word:
                    feature_name = line.split("(")[0]
                    val = float(line.split("(")[1].split(")")[0])
                    semantic_features[word]["features"].append(feature_name)
                    semantic_features[word]["values"].append(val)

    # Save to file
    with open(os.path.join('data', 'mitchell_semantic_features.json'), 'w') as fp:
        json.dump(semantic_features, fp)

    return semantic_features

In [3]:
semantic_features = dump_mitchell_web_semantic_features()

### fMRI data

In [4]:
def get_mitchell_original_data(subject = 1, random_voxels = None):
    mdata = scipy.io.loadmat(os.path.join("data", "mitchell", f"mitchell_subject_{subject}.mat"))
    subject_data = {}

    # 6 x 60 trials
    for i in range(mdata["data"][:].shape[0]):
        cond, cond_number, word, word_number, epoch = [x[0] for x in mdata["info"][0][i]]

        # Set trial data
        if epoch[0] not in subject_data: subject_data[epoch[0]] = {}

        if random_voxels:
            random_voxels_idx = np.random.choice(mdata["data"][i][0][0].shape[0], random_voxels)
            subject_data[epoch[0]][word] = mdata["data"][i][0][0][random_voxels_idx]
        else: subject_data[epoch[0]][word] = mdata["data"][i][0][0]

    return subject_data

**Taking the most stable voxels**

Supplement online material Mitchell et al., page 6.

In [5]:
def get_most_stable_voxels(fmriData, train_split_indices, K = 500):
    
    # Get total number of voxels
    voxels = fmriData[1]["bell"].shape[0]

    # Get scores of the voxels
    scores = []
    for vx in range(voxels):

        # Gathering epoch-wise brain activity
        repetitions = []
        for epoch in fmriData.keys():
            # store activations only for THAT vx voxel
            repetitions.append(np.array([fmriData[epoch][word][vx] for word in fmriData[epoch].keys()]))

        # (epochs, words) = (6, 58)
        repetitions = np.array(repetitions)

        # Compute voxel scores ONLY wrt. the training slice of words
        voxel_correlation_score = []
        for i in range(repetitions.shape[0]):
            for j in range(i+1, repetitions.shape[0]):
                # (6, 6) but without triangular down and diagonal = (36 - 6) / 2 = 15 values 
                voxel_correlation_score.append(np.correlate(repetitions[i, train_split_indices], repetitions[j, train_split_indices]))

        voxel_correlation_score = np.array(voxel_correlation_score)
        scores.append(np.mean(voxel_correlation_score))
    
    # indices of the most stable voxels
    return np.argpartition(scores, -K)[-K:]

### K-fold cross validation

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from tqdm import tqdm

In [7]:
k_folds = 30
fmriData = get_mitchell_original_data(subject=1)
n_samples = len(semantic_features.keys())

assert n_samples % k_folds == 0, "Number of folds must divide the samples in equal parts. Choose a valid multiplier."

samples_per_fold = (n_samples // k_folds)

n_samples, samples_per_fold 

(60, 2)

In [8]:
for i in range(k_folds):

    # Extracting filtered (most stable voxels) training set
    train_indices = np.array(list(range(samples_per_fold * i)) + list(range((samples_per_fold * (i+1)), n_samples)), dtype=np.int32)
    test_indices = np.array(list(range((samples_per_fold * i), samples_per_fold * (i + 1))), dtype=np.int32)

    K = 500
    voxels_indices = get_most_stable_voxels(fmriData, train_indices, K = K)

    # Filtering the dataset
    filteredfMRIData = {}
    for epoch in fmriData.keys():
        for word, activations in fmriData[epoch].items():
            if word not in filteredfMRIData.keys(): filteredfMRIData[word] = []
            filteredfMRIData[word].append(activations[voxels_indices])
            
    for word in filteredfMRIData.keys():
        filteredfMRIData[word] = np.mean(filteredfMRIData[word], axis=0)

    # Building train set
    X = []
    Y = []

    for word in semantic_features.keys():
        if word in filteredfMRIData.keys():
            X.append(semantic_features[word]["values"])
            Y.append(filteredfMRIData[word])

    X = np.array(X)
    Y = np.array(Y)

    # Train-test split
    X_train, X_test, y_train, y_test = X[train_indices], X[test_indices], Y[train_indices], Y[test_indices]

    # Predicting & scoring
    predictors = [LinearRegression() for i in range(K)]
    scores = []

    # One predictor per voxel
    j = 0
    for model in predictors:
        model.fit(X_train, y_train[:, j])
        scores.append(model.score(X_test, y_test[:, j]))
        j += 1

    scores = np.array(scores)
    topK = 50
    ind = np.argpartition(scores, -topK)[-topK:]
    
    print(f"---- {i+1} fold top {topK} voxels \n min: {np.min(scores[ind])}\n mean: {np.mean(scores[ind])}\n max: {np.max(scores[ind])}\n")



---- 1 fold top 50 voxels 
 min: -4.139328861760555
 mean: -1.106045412417308
 max: 0.9896598896696216

---- 2 fold top 50 voxels 
 min: -7.858748745557463
 mean: -2.9094566417104306
 max: 0.9222728071335513

---- 3 fold top 50 voxels 
 min: 0.5443939784747384
 mean: 0.775093892931009
 max: 0.9998554968393297

---- 4 fold top 50 voxels 
 min: -0.11174333043075246
 mean: 0.4589727537658901
 max: 0.997823818084851

---- 5 fold top 50 voxels 
 min: -1.8887425654771404
 mean: -0.582562914379422
 max: 0.9448297236763774

---- 6 fold top 50 voxels 
 min: -0.2444070584912783
 mean: 0.3613918116015352
 max: 0.9730611753124357

---- 7 fold top 50 voxels 
 min: -0.856418369434184
 mean: 0.04401504271058983
 max: 0.9503492289929538

---- 8 fold top 50 voxels 
 min: -0.23059610179998957
 mean: 0.3322499449886952
 max: 0.9836527907931503

---- 9 fold top 50 voxels 
 min: 0.007159088992719331
 mean: 0.3808042757519997
 max: 0.9995562314281493

---- 10 fold top 50 voxels 
 min: -0.0912459502536016
 m

### Matching test images

In [None]:
from numpy import dot
from numpy.linalg import norm

def cosim(a, b):
    return dot(a, b)/(norm(a)*norm(b))

In [None]:
p1 = np.array([predictor.predict([X_test[0]])[0] for predictor in predictors])
p2 = np.array([predictor.predict([X_test[1]])[0] for predictor in predictors])
i1, i2 = y_test # 0, 1

print(f"similarity (p1, i1): {cosim(p1, i1)}")
print(f"similarity (p1, i2): {cosim(p1, i2)}")
print(f"similarity (p2, i1): {cosim(p2, i1)}")
print(f"similarity (p2, i2): {cosim(p2, i2)}")

### Visualizing the 

In [None]:
K = 80
ind = np.argpartition(scores, -K)[-K:]

plt.title("Semantic features embeddings")
plt.xlabel("test accuracy")
plt.ylabel("# voxels")
plt.hist(scores[ind], bins=30)
plt.grid()

In [None]:
# Picking the best 100 predictors per voxel
subset_scores = [np.mean(scores[np.argpartition(scores, -subset_size)[-subset_size:]]) for subset_size in range(1, 100)]

plt.title("top-K voxels accuracy")
plt.plot(subset_scores)
plt.ylabel("test accuracy")
plt.xlabel("K")
plt.grid()
plt.ylim([0, 1])

In [None]:
from scipy.stats import pearsonr

K = 20
ind = np.argpartition(scores, -K)[-K:]

# According to the last subject
def best_K_predict(X, indices, predictors):
    predictors = [predictors[idx] for idx in indices]
    y_hat = np.array([predictor.predict(X) for predictor in predictors]) # voxels, sample
    return y_hat.reshape(y_hat.shape[1], y_hat.shape[0]) # sample, voxels

y_hat = best_K_predict(X_train, ind, predictors)
y = y_train[:, ind]

RDM_hat = np.matmul(y_hat, np.matrix.transpose(y_hat))
# RDM_hat = (RDM_hat - RDM_hat.min()) / (RDM_hat.max() - RDM_hat.min())

RDM = np.matmul(y, np.matrix.transpose(y))
# RDM = (RDM - RDM.min()) / (RDM.max() - RDM.min())

test_pearson = pearsonr(
    RDM_hat.flatten(),
    RDM.flatten()
)

plt.subplot(121)
plt.title("Truth")
plt.imshow(RDM)
plt.colorbar()

plt.subplot(122)
plt.title("Prediction")
plt.imshow(RDM_hat)
plt.colorbar()