### Semantic features as word embeddings

In [1]:
import numpy as np
import os
import torch
import json
from tqdm import tqdm
import scipy

### Semantic features

In [2]:
min_len_line = 5
N_SEMANTIC_FEATURES = 25
semantic_features = {}

def dump_mitchell_web_semantic_features(raw_file = os.path.join("data","mitchell_semantic_raw.txt")):
    with open(raw_file, "r") as datafile:
        lines = datafile.readlines()
        word = None

        for line in lines:

            # Skip empty
            if len(line) >= min_len_line:

                # New feature
                if "Features for" in line:

                    # Discard invalid ones (once fully parsed)
                    if word and len(semantic_features[word]['features']) < N_SEMANTIC_FEATURES: del semantic_features[word] 
                        
                    word = line.split("<a name=\"")[1].split("\"")[0]
                    semantic_features[word] = { "features": [], "values": []}

                elif word:
                    feature_name = line.split("(")[0]
                    val = float(line.split("(")[1].split(")")[0])
                    semantic_features[word]["features"].append(feature_name)
                    semantic_features[word]["values"].append(val)

    # Save to file
    #with open(os.path.join('data', 'mitchell_semantic_features.json'), 'w') as fp:
    #    json.dump(semantic_features, fp)

    return semantic_features


def load_sorted_semantic_features(file = os.path.join("data","mitchell_semantic_features.json")):
    with open(file) as f:
        semantic_features = json.load(f)
        for word in semantic_features.keys():
            # Sort all features
            sorted_features = sorted(enumerate(semantic_features[word]["features"]), key=lambda x:x[1])
            sorted_indices = [i[0] for i in sorted_features]
            sorted_values = [semantic_features[word]["values"][i] for i in sorted_indices]

            # Re-store them
            semantic_features[word]["features"] = [x[1] for x in sorted_features]
            semantic_features[word]["values"] = sorted_values
            break

    return semantic_features
            

In [3]:
semantic_features = load_sorted_semantic_features()

### fMRI data

In [4]:
def get_mitchell_original_data(subject = 1, random_voxels = None):
    mdata = scipy.io.loadmat(os.path.join("data", "mitchell", f"mitchell_subject_{subject}.mat"))
    subject_data = {}

    # 6 x 60 trials
    for i in range(mdata["data"][:].shape[0]):
        cond, cond_number, word, word_number, epoch = [x[0] for x in mdata["info"][0][i]]

        # Set trial data
        if epoch[0] not in subject_data: subject_data[epoch[0]] = {}

        if random_voxels:
            random_voxels_idx = np.random.choice(mdata["data"][i][0][0].shape[0], random_voxels)
            subject_data[epoch[0]][word] = mdata["data"][i][0][0][random_voxels_idx]
        else: subject_data[epoch[0]][word] = mdata["data"][i][0][0]

    return subject_data

In [5]:
epoch = 1
fmriData = get_mitchell_original_data(subject=1)[epoch]

### Predicting

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.cross_decomposition import PLSRegression

from tqdm import tqdm
from numpy import dot
from numpy.linalg import norm

def cosim(a, b):
    return dot(a, b)/(norm(a)*norm(b))

In [7]:
def voxels_compute_fold_accuracy(predictors, X_test, y_test, selected_voxels):

    true_positives = []

    for i in range(X_test.shape[0]):

        pi = np.array([predictor.predict([X_test[i]])[0] for x, predictor in enumerate(predictors) if x in selected_voxels]).flatten()
        best_sample_match = np.argmax([cosim(pi, y_test[j, selected_voxels]) for j in range(y_test.shape[0])])
    
        true_positives.append(int(best_sample_match == i)) # ground truth is aligned with the sample by index, it should match

    return np.mean(true_positives)


def regressor_compute_fold_accuracy(predictors, X_test, y_test):

    true_positives = []

    # For each sample prediction -> cosine similarity with all truth images
    for i in range(X_test.shape[0]):

        pi = predictors.predict([X_test[i]])
        best_sample_match = np.argmax([cosim(pi, y_test[j]) for j in range(y_test.shape[0])])

        true_positives.append(int(best_sample_match == i)) # ground truth is aligned with the sample by index, it should match

    return np.mean(true_positives)

In [8]:
k_folds = 30
n_samples = len(semantic_features.keys())

assert n_samples % k_folds == 0, "Number of folds must divide the samples in equal parts. Choose a valid multiplier."

samples_per_fold = (n_samples // k_folds)
n_voxels = fmriData["bell"].shape[0]
VOXELWISE_ACC_THRESHOLD = 0.2
n_samples, samples_per_fold, n_voxels

(60, 2, 21764)

In [9]:
embedding_len = len(semantic_features["bell"]["values"])
# we need to remove the correct one, but the index changes for each word. 
# that's why it performs badly! apply string sorting while processing, before converting to int values

In [10]:
accuracies = []
voxels_amounts = []
voxels_counts = np.zeros(n_voxels)

for i in tqdm(range(k_folds)):

    # Extracting filtered (most stable voxels) training set
    train_indices = np.array(list(range(samples_per_fold * i)) + list(range((samples_per_fold * (i+1)), n_samples)), dtype=np.int32)
    test_indices = np.array(list(range((samples_per_fold * i), samples_per_fold * (i + 1))), dtype=np.int32)

    # Building train set
    X = []
    Y = []

    for word in semantic_features.keys():
        if word in fmriData.keys():
            x = np.array(semantic_features[word]["values"])
            y = np.array(fmriData[word])
            X.append(x)
            Y.append(y)

    X = np.array(X)
    Y = np.array(Y)

    # Train-test split
    X_train, X_test, y_train, y_test = X[train_indices], X[test_indices], Y[train_indices], Y[test_indices]
    
    # Normalization based on train data
    normalizer = StandardScaler()
    normalizer.fit(X_train)

    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    # Predicting & scoring
    predictors = [LinearRegression() for i in range(n_voxels)]
    voxel_regressor = True
    scores = []

    j = 0

    # Fit each voxel predictor
    for model in predictors:
        model.fit(X_train, y_train[:, j])
        scores.append(model.score(X_test, y_test[:, j]))
        j += 1

    # Select predictors by R2 score and compute 2 words accuracy
    scores = np.array(scores)
    voxel_indices = np.where(scores > VOXELWISE_ACC_THRESHOLD)[0]
    
    voxels_amounts.append(voxel_indices.shape[0])
    fold_accuracy = voxels_compute_fold_accuracy(predictors, X_test, y_test, voxel_indices)
    print(f"Fold-{i} \t accuracy: {fold_accuracy} \t voxels: {len(voxel_indices)}")
    
    # With the chosen voxels, fit a multivariate regressor as a test
    predictors = Ridge()
    predictors.fit(X_train, y_train[:, voxel_indices])
    multivar_fold_accuracy = regressor_compute_fold_accuracy(predictors, X_test, y_test[:, voxel_indices])
    
    print(f"\t multivar. regressor: {multivar_fold_accuracy}")

    # Keep track of the chosen voxels
    voxels_counts[voxel_indices] += 1

    accuracies.append(fold_accuracy)

np.mean(accuracies), np.mean(voxels_amounts)

  3%|▎         | 1/30 [00:14<07:08, 14.78s/it]

Fold-0 	 accuracy: 1.0 	 voxels: 10
	 multivar. regressor: 1.0


  7%|▋         | 2/30 [00:29<06:44, 14.45s/it]

Fold-1 	 accuracy: 1.0 	 voxels: 765
	 multivar. regressor: 1.0


 10%|█         | 3/30 [00:43<06:29, 14.44s/it]

Fold-2 	 accuracy: 1.0 	 voxels: 2766
	 multivar. regressor: 1.0


 13%|█▎        | 4/30 [00:58<06:19, 14.60s/it]

Fold-3 	 accuracy: 1.0 	 voxels: 1606
	 multivar. regressor: 1.0


 17%|█▋        | 5/30 [01:12<06:00, 14.42s/it]

Fold-4 	 accuracy: 1.0 	 voxels: 1078
	 multivar. regressor: 1.0


 20%|██        | 6/30 [01:26<05:45, 14.42s/it]

Fold-5 	 accuracy: 1.0 	 voxels: 2490
	 multivar. regressor: 1.0


 23%|██▎       | 7/30 [01:41<05:30, 14.36s/it]

Fold-6 	 accuracy: 1.0 	 voxels: 2011
	 multivar. regressor: 1.0


 27%|██▋       | 8/30 [01:55<05:15, 14.36s/it]

Fold-7 	 accuracy: 1.0 	 voxels: 1641
	 multivar. regressor: 1.0


 30%|███       | 9/30 [02:10<05:03, 14.47s/it]

Fold-8 	 accuracy: 1.0 	 voxels: 3035
	 multivar. regressor: 1.0


 33%|███▎      | 10/30 [02:24<04:48, 14.42s/it]

Fold-9 	 accuracy: 1.0 	 voxels: 1538
	 multivar. regressor: 1.0


 37%|███▋      | 11/30 [02:38<04:33, 14.38s/it]

Fold-10 	 accuracy: 1.0 	 voxels: 2955
	 multivar. regressor: 1.0


 40%|████      | 12/30 [02:53<04:20, 14.50s/it]

Fold-11 	 accuracy: 1.0 	 voxels: 2201
	 multivar. regressor: 1.0


 43%|████▎     | 13/30 [03:08<04:06, 14.51s/it]

Fold-12 	 accuracy: 1.0 	 voxels: 1610
	 multivar. regressor: 1.0


 47%|████▋     | 14/30 [03:22<03:51, 14.48s/it]

Fold-13 	 accuracy: 1.0 	 voxels: 3391
	 multivar. regressor: 1.0


 50%|█████     | 15/30 [03:37<03:38, 14.54s/it]

Fold-14 	 accuracy: 1.0 	 voxels: 2070
	 multivar. regressor: 1.0


 53%|█████▎    | 16/30 [03:51<03:22, 14.43s/it]

Fold-15 	 accuracy: 1.0 	 voxels: 2085
	 multivar. regressor: 1.0


 57%|█████▋    | 17/30 [04:05<03:07, 14.40s/it]

Fold-16 	 accuracy: 1.0 	 voxels: 2376
	 multivar. regressor: 1.0


 60%|██████    | 18/30 [04:19<02:52, 14.37s/it]

Fold-17 	 accuracy: 1.0 	 voxels: 3352
	 multivar. regressor: 1.0


 63%|██████▎   | 19/30 [04:34<02:38, 14.44s/it]

Fold-18 	 accuracy: 1.0 	 voxels: 2032
	 multivar. regressor: 1.0


 67%|██████▋   | 20/30 [04:50<02:29, 14.90s/it]

Fold-19 	 accuracy: 1.0 	 voxels: 3230
	 multivar. regressor: 0.5


 70%|███████   | 21/30 [05:05<02:13, 14.80s/it]

Fold-20 	 accuracy: 1.0 	 voxels: 1977
	 multivar. regressor: 1.0


 73%|███████▎  | 22/30 [05:19<01:58, 14.83s/it]

Fold-21 	 accuracy: 1.0 	 voxels: 2926
	 multivar. regressor: 0.5


 77%|███████▋  | 23/30 [05:34<01:43, 14.84s/it]

Fold-22 	 accuracy: 1.0 	 voxels: 1893
	 multivar. regressor: 0.5


 80%|████████  | 24/30 [05:49<01:28, 14.74s/it]

Fold-23 	 accuracy: 1.0 	 voxels: 2501
	 multivar. regressor: 1.0


 83%|████████▎ | 25/30 [06:04<01:13, 14.73s/it]

Fold-24 	 accuracy: 1.0 	 voxels: 2499
	 multivar. regressor: 0.5


 87%|████████▋ | 26/30 [06:18<00:58, 14.64s/it]

Fold-25 	 accuracy: 1.0 	 voxels: 2319
	 multivar. regressor: 1.0


 90%|█████████ | 27/30 [06:32<00:43, 14.59s/it]

Fold-26 	 accuracy: 1.0 	 voxels: 2670
	 multivar. regressor: 0.5


 93%|█████████▎| 28/30 [06:47<00:29, 14.54s/it]

Fold-27 	 accuracy: 0.5 	 voxels: 2663
	 multivar. regressor: 0.5


 97%|█████████▋| 29/30 [07:01<00:14, 14.57s/it]

Fold-28 	 accuracy: 1.0 	 voxels: 2057
	 multivar. regressor: 1.0


100%|██████████| 30/30 [07:16<00:00, 14.56s/it]

Fold-29 	 accuracy: 1.0 	 voxels: 3493
	 multivar. regressor: 1.0





(0.9833333333333333, 2241.3333333333335)

In [104]:
# Determine a subset of voxels that is common across folds and from which we can perform sound multivariate regression
VOXELCOUNT_THRESHOLD = 0.25 # voxels that best perform in 20% of folds
chosen_voxels = np.where((voxels_counts / k_folds) > VOXELCOUNT_THRESHOLD)[0]
chosen_voxels

array([   63,    68,   235,   269,   293,   363,   566,   572,   697,
         705,   792,   843,   845,   846,   875,  1108,  1159,  1183,
        1256,  1264,  1292,  1460,  1552,  1574,  1662,  1674,  1885,
        1895,  1914,  1920,  2007,  2021,  2035,  2053,  2101,  2178,
        2226,  2678,  2741,  2949,  3006,  3046,  3049,  3100,  3284,
        3357,  3371,  3392,  3935,  3966,  3989,  4023,  4069,  4078,
        4155,  4259,  4277,  4326,  4371,  4489,  4507,  4508,  4512,
        4513,  4736,  4737,  4738,  4773,  4774,  4776,  4889,  4965,
        5082,  5112,  5402,  5605,  5803,  5816,  5825,  6000,  6029,
        6212,  6214,  6237,  6299,  6309,  6343,  6475,  6513,  6515,
        6516,  6518,  6527,  6540,  6650,  6660,  6665,  6666,  6886,
        6891,  6894,  6965,  7070,  7113,  7161,  7162,  7197,  7343,
        7389,  7462,  7469,  7740,  7801,  7869,  7883,  7892,  7898,
        7994,  8017,  8034,  8170,  8227,  8234,  8237,  8463,  8541,
        8553,  8887,

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

normalizer = StandardScaler()
normalizer.fit(X_train)

X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

In [106]:
# Selected voxels
predictors = LinearRegression()
predictors.fit(X_train, y_train[:, chosen_voxels])
multivar_fold_accuracy = regressor_compute_fold_accuracy(predictors, X_test, y_test[:, chosen_voxels])
multivar_fold_accuracy

0.6666666666666666

In [107]:
# ALL voxels
predictors = LinearRegression()
predictors.fit(X_train, y_train)
multivar_fold_accuracy = regressor_compute_fold_accuracy(predictors, X_test, y_test)
multivar_fold_accuracy

0.3333333333333333

### Testing feature reduction

In [None]:
"""
for omitted_feature in range(embedding_len): 
    for i in tqdm(range(k_folds)):

        # Extracting filtered (most stable voxels) training set
        train_indices = np.array(list(range(samples_per_fold * i)) + list(range((samples_per_fold * (i+1)), n_samples)), dtype=np.int32)
        test_indices = np.array(list(range((samples_per_fold * i), samples_per_fold * (i + 1))), dtype=np.int32)

        # Building train set
        X = []
        Y = []

        for word in semantic_features.keys():
            if word in fmriData.keys():
                x = np.array([f for k, f in enumerate(semantic_features[word]["values"]) if k != omitted_feature])
                y = np.array(fmriData[word])

                X.append(x)
                Y.append(y)

        X = np.array(X)
        Y = np.array(Y)

        # Train-test split
        X_train, X_test, y_train, y_test = X[train_indices], X[test_indices], Y[train_indices], Y[test_indices]
        
        # Normalization based on train data
        normalizer = StandardScaler()
        normalizer.fit(X_train)

        X_train = normalizer.transform(X_train)
        X_test = normalizer.transform(X_test)

        # Predicting & scoring
        predictors = [LinearRegression() for i in range(n_voxels)]
        scores = []

        # One predictor per voxel
        j = 0
        for model in predictors:
            model.fit(X_train, y_train[:, j])
            scores.append(model.score(X_test, y_test[:, j]))
            j += 1
        
        # Select voxels by R2 score and compute 2 words accuracy
        scores = np.array(scores)
        voxel_indices = np.where(scores > THRESHOLD)[0]
        
        voxels_amounts.append(voxel_indices.shape[0])
        fold_accuracy = voxels_compute_fold_accuracy(predictors, X_test, y_test, voxel_indices)
        
        print(f"Fold-{i} \t accuracy: {fold_accuracy} \t voxels: {len(voxel_indices)}")
        accuracies.append(fold_accuracy)

    print(f"Feature omitted: {omitted_feature}")
    print(f"Accuracy: {np.mean(accuracies)} \t Voxels: {np.mean(voxels_amounts)}\n")
"""


**Observation**

In this case fitting is way more expensive, as 21k voxels are considered.

In [None]:
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

def best_K_predict(X, indices, predictors):
    predictors = [predictors[idx] for idx in indices]
    y_hat = np.array([predictor.predict(X) for predictor in predictors]) # voxels, sample
    return y_hat.reshape(y_hat.shape[1], y_hat.shape[0]) # sample, voxels

# voxel_indices

y_hat = best_K_predict(X_train, voxel_indices, predictors)
y = y_train[:, voxel_indices]

RDM_hat = np.matmul(y_hat, np.matrix.transpose(y_hat))

RDM = np.matmul(y, np.matrix.transpose(y))

test_pearson = pearsonr(
    RDM_hat.flatten(),
    RDM.flatten()
)

print(f"Test RDMs R^2:\t{test_pearson}")

plt.subplot(121)
plt.title("Truth")
plt.imshow(RDM)
plt.colorbar()

plt.subplot(122)
plt.title("Prediction")
plt.imshow(RDM_hat)
plt.colorbar()

**Observation**

Here the the voxels from the last cross_val iteration have been selected. For these voxels, the object to object distance matrices have similar patterns.