In [1]:
"""
For the subspace analysis, we have three methods: PCA, cvPCA, and cross-covariance analysis.

Each requires the following:
# A recipe to get train and test data
# A recipe to get train place fields
# A way to fit the components
# And a way to measure the variance in the test data

So using a similar structure to the ABC RegressionModel, I can probably define a core structure as follows:

class SubspaceAnalysis:
    def fit(self, session, spks_type): ... # Returns the fit components and extras (data, placefields, etc)
    def score(self, session, fits, spks_type): ... # Returns the variance in the test data
    def reconstruction_score(self, session, fits, spks_type): ... # Returns the frobenius norm of the difference between the test data and the reconstructed data for each expanding subspace
    def get_scores(): ... # A similar cache method for getting scores without dealing with refitting which is slow

Note that I'll also need hyperparameters for measuring the placefields. Unfortunately, this isn't just a user choice,
because I should probably pick hyperparameters that maximize the variance the placefields explain in temporal data.

Fortunately, with some clever design, I can build a structure that
makes it easy to add or extend any one of the subspace methods.
"""
# The triple-quoted design notes that open this cell are a bare string expression, not
# attached to any function or class; this print is the only statement that produces output.
print('bad docstring!')

bad docstring!


In [7]:
%reload_ext autoreload
%autoreload 2
%matplotlib qt

import random
import numpy as np
import torch
from matplotlib import pyplot as plt
from syd import make_viewer
from tqdm import tqdm

from vrAnalysis.database import get_database
from vrAnalysis.helpers import Timer, get_placefield_location, cross_validate_trials, sort_by_preferred_environment
from vrAnalysis.sessions import B2Session
from vrAnalysis.processors import SpkmapProcessor
from vrAnalysis.processors.support import median_zscore
from vrAnalysis.processors.placefields import get_placefield, get_frame_behavior, get_placefield_prediction
from dimilibi import Population
from dimilibi import ReducedRankRegression, RidgeRegression
from dimilibi import measure_r2, mse
from dimilibi import PCA, SVCA
from dimensionality_manuscript.registry import PopulationRegistry, get_subspace

# Shared notebook state used by the analysis cells below.
sessiondb = get_database("vrSessions")  # handle on the "vrSessions" database
registry = PopulationRegistry()  # provides populations and time splits

# Prefer the GPU for tensor math when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [16]:
# Hyperparameter search for the "pca_subspace" model on a single imaging session.
spks_type = "oasis"
submodel = get_subspace("pca_subspace", registry)
session = sessiondb.iter_sessions(imaging=True)[0]  # first session returned by the query

# 30 Optuna trials. NOTE(review): `_optimize_optuna` is a private method; "train" and
# "not_train" are presumably the split names used for fitting and scoring -- confirm.
optuna_output = submodel._optimize_optuna(session, spks_type, "train", "not_train", n_trials=30)
bprms, bscore, results = optuna_output

# Best parameters and best score first, then the table of all trials.
print(bprms, bscore)
print(results)

Optuna search:   0%|          | 0/30 [00:00<?, ?it/s]

{'num_bins': 10, 'smooth_width': 16.267071394406756} 0.13626109063625336
    num_bins  smooth_width           score
0         53           NaN  tensor(0.3250)
1         11      1.840899  tensor(0.1806)
2         10     15.958574  tensor(0.1553)
3         15      2.294868  tensor(0.2256)
4         33           NaN  tensor(0.2929)
5         13     10.952663  tensor(0.1699)
6         28           NaN  tensor(0.2842)
7         38      7.475993  tensor(0.2944)
8         14           NaN  tensor(0.1963)
9         93           NaN  tensor(0.2726)
10        20     34.128905  tensor(0.2356)
11        10     16.267071  tensor(0.1363)
12        10     27.892316  tensor(0.1486)
13        22     48.678687  tensor(0.2390)
14        18     19.714910  tensor(0.2149)
15        10     24.051990  tensor(0.1658)
16        61      4.555236  tensor(0.2944)
17        24     10.834234  tensor(0.2575)
18        16      5.028560  tensor(0.2346)
19        12     32.377885  tensor(0.1963)
20        43     13.3597

In [2]:
# PCA variant of the subspace analysis.
# Components are fit on the train split twice: once on the full activity and once on
# the place fields estimated from that same split. Train and test activity are then
# projected onto each set of components, and the variance along every component is
# compared (train vs test for the full-activity PCs, full vs place-field PCs on test).

session = random.choice(
    sessiondb.iter_sessions(imaging=True, session_params=dict(spks_type="oasis"))
)
print(session)

center = False  # optional mean subtraction along dim=1; not expected to matter much
train_split, test_split = registry.time_split["half0"], registry.time_split["half1"]
population, frame_behavior = registry.get_population(session, spks_type="oasis")
num_neurons = len(population.idx_neurons)

# Data of the selected neurons under each time split (presumably neurons x samples).
train_data = population.apply_split(population.data[population.idx_neurons], train_split, prefiltered=False)
test_data = population.apply_split(population.data[population.idx_neurons], test_split, prefiltered=False)

# Behavior filtered with the times of the train split.
frame_behavior_train = frame_behavior.filter(
    population.get_split_times(train_split, within_idx_samples=False)
)

if center:
    train_data, test_data = (
        split_data - split_data.mean(dim=1, keepdim=True) for split_data in (train_data, test_data)
    )

# Place fields from the train split only, over 100 evenly spaced bins of the first environment length.
num_bins = 100
dist_edges = np.linspace(0, session.env_length[0], num_bins + 1)
placefield_kwargs = dict(dist_edges=dist_edges, average=True, smooth_width=5.0)
placefield = get_placefield(train_data.T.numpy(), frame_behavior_train, **placefield_kwargs)
# Reshape so that neurons are on the first axis, matching the layout of train_data.
placefield_extended = torch.tensor(placefield.placefield).reshape(-1, num_neurons).T

# Never request more components than the smallest dimension involved (capped at 400).
shape_limits = (*train_data.shape, *test_data.shape, *placefield_extended.shape)
num_components = min(400, *shape_limits)
pca_all = PCA(num_components=num_components).fit(train_data)
pca_pos = PCA(num_components=num_components).fit(placefield_extended)

# Move everything to the compute device only after fitting.
train_data, test_data = train_data.to(device), test_data.to(device)
components_all = pca_all.get_components().to(device)
components_pos = pca_pos.get_components().to(device)

# Variance of the projected activity along each component.
train_variance_all = (train_data.T @ components_all).var(dim=0)
test_variance_all = (test_data.T @ components_all).var(dim=0)
train_variance_pos = (train_data.T @ components_pos).var(dim=0)
test_variance_pos = (test_data.T @ components_pos).var(dim=0)

# # Measure reconstruction error for expanding subspaces
# reconstruction_error_all = torch.zeros(num_components)
# reconstruction_error_pos = torch.zeros(num_components)
# for i in tqdm(range(num_components)):
#     c_comp_all = components_all[:, :i+1]
#     c_comp_pos = components_pos[:, :i+1]
#     recon_all = c_comp_all @ c_comp_all.T @ test_data
#     recon_pos = c_comp_pos @ c_comp_pos.T @ test_data
#     error_all = torch.norm(test_data - recon_all)
#     error_pos = torch.norm(test_data - recon_pos)
#     reconstruction_error_all[i] = error_all
#     reconstruction_error_pos[i] = error_pos

plt.close('all')
fig, ax = plt.subplots(3, 2, figsize=(10, 8), layout="constrained")

# One figure column per comparison: (baseline curve, compared curve, colors, top title).
panel_specs = (
    (train_variance_all.cpu(), test_variance_all.cpu(), 'k', 'b', "Comparing Train vs Test Full\nVariance"),
    (test_variance_all.cpu(), test_variance_pos.cpu(), 'b', 'g', "Comparing Test Full vs Test Placefields\nVariance"),
)
for panel_col, (baseline_var, compared_var, baseline_color, compared_color, panel_title) in enumerate(panel_specs):
    baseline_cumulative = baseline_var.cumsum(dim=0)
    compared_cumulative = compared_var.cumsum(dim=0)

    # Row 0: variance per component
    ax[0, panel_col].plot(baseline_var, c=baseline_color)
    ax[0, panel_col].plot(compared_var, c=compared_color)
    ax[0, panel_col].set_title(panel_title)

    # Row 1: cumulative variance over expanding subspaces
    ax[1, panel_col].plot(baseline_cumulative, c=baseline_color)
    ax[1, panel_col].plot(compared_cumulative, c=compared_color)
    ax[1, panel_col].set_title("Cumulative variance")

    # Row 2: ratio of compared to baseline cumulative variance
    ax[2, panel_col].plot(compared_cumulative / baseline_cumulative, c=compared_color)
    ax[2, panel_col].set_title("Subspace Ratio")
plt.show()

B2Session(mouse_name='ATL022', date='2023-05-03', session_id='701', spks_type='oasis')


In [None]:
# This code block runs the cv PCA version of the subspace analysis.
# Components are fit with SVCA across two train splits ("train0" and "train1"), once on
# the full activity and once on the place fields estimated from each split. Variance is
# then measured by projecting the train0 and test ("not_train") activity onto them.
session = random.choice(sessiondb.iter_sessions(imaging=True, session_params=dict(spks_type="oasis")))
print(session)

train0_split = registry.time_split["train0"]
train1_split = registry.time_split["train1"]
test_split = registry.time_split["not_train"]
population, frame_behavior = registry.get_population(session, spks_type="oasis")

# Data of the selected neurons under each time split (presumably neurons x samples).
num_neurons = len(population.idx_neurons)
train0_data = population.apply_split(population.data[population.idx_neurons], train0_split, prefiltered=False)
train1_data = population.apply_split(population.data[population.idx_neurons], train1_split, prefiltered=False)
test_data = population.apply_split(population.data[population.idx_neurons], test_split, prefiltered=False)
frame_behavior_train0 = frame_behavior.filter(population.get_split_times(train0_split, within_idx_samples=False))
frame_behavior_train1 = frame_behavior.filter(population.get_split_times(train1_split, within_idx_samples=False))

# Equalize the number of samples (dim 1) of the two train splits by random subsampling
# without replacement; indices are sorted so the kept samples stay in their original order.
# The matching behavior is filtered with the same indices.
if train0_data.shape[1] != train1_data.shape[1]:
    num_samples = min(train0_data.shape[1], train1_data.shape[1])
    idx_train0 = np.sort(np.random.choice(train0_data.shape[1], num_samples, replace=False))
    idx_train1 = np.sort(np.random.choice(train1_data.shape[1], num_samples, replace=False))
    train0_data = train0_data[:, idx_train0]
    train1_data = train1_data[:, idx_train1]
    frame_behavior_train0 = frame_behavior_train0.filter(idx_train0)
    frame_behavior_train1 = frame_behavior_train1.filter(idx_train1)

# Place fields for each train split, over 100 evenly spaced bins of the first environment length.
num_bins = 100
dist_edges = np.linspace(0, session.env_length[0], num_bins+1)
placefield0 = get_placefield(
    train0_data.T.numpy(),
    frame_behavior_train0,
    dist_edges=dist_edges,
    average=True,
    smooth_width=5.0,
)
placefield1 = get_placefield(
    train1_data.T.numpy(),
    frame_behavior_train1,
    dist_edges=dist_edges,
    average=True,
    smooth_width=5.0,
)
# Reshape so that neurons are on the first axis, matching the layout of the activity data.
placefield0_extended = torch.tensor(placefield0.placefield).reshape(-1, num_neurons).T
placefield1_extended = torch.tensor(placefield1.placefield).reshape(-1, num_neurons).T

# Never request more components than the smallest dimension involved (capped at 400).
num_components = min(400, *train0_data.shape, *train1_data.shape, *test_data.shape, *placefield0_extended.shape, *placefield1_extended.shape)

centered = True
svca_all = SVCA(centered=centered, num_components=num_components)
svca_all = svca_all.fit(train0_data, train1_data)
svca_pos = SVCA(centered=centered, num_components=num_components)
svca_pos = svca_pos.fit(placefield0_extended, placefield1_extended)

components_all = svca_all.U
components_pos = svca_pos.U
# Train variance is measured on train0_data. This cell never defines `train_data`; using
# that name would silently pick up a stale tensor left over from another cell (different
# session, possibly on another device).
train_variance_all = torch.var(train0_data.T @ components_all, dim=0)
test_variance_all = torch.var(test_data.T @ components_all, dim=0)
train_variance_pos = torch.var(train0_data.T @ components_pos, dim=0)
test_variance_pos = torch.var(test_data.T @ components_pos, dim=0)

# Left column: train vs test variance in the full-activity subspace.
# Right column: full-activity vs place-field subspace, both measured on test data.
plt.close('all')
fig, ax = plt.subplots(3, 2, figsize=(10, 8), layout="constrained")
ax[0, 0].plot(train_variance_all, c='k')
ax[0, 0].plot(test_variance_all, c='b')
ax[0, 0].set_title("Comparing Train vs Test Full\nVariance")
ax[1, 0].plot(torch.cumsum(train_variance_all, dim=0), c='k')
ax[1, 0].plot(torch.cumsum(test_variance_all, dim=0), c='b')
ax[1, 0].set_title("Cumulative variance")
ax[2, 0].plot(torch.cumsum(test_variance_all, dim=0) / torch.cumsum(train_variance_all, dim=0), c='b')
ax[2, 0].set_title("Subspace Ratio")

ax[0, 1].plot(test_variance_all, c='b')
ax[0, 1].plot(test_variance_pos, c='g')
ax[0, 1].set_title("Comparing Test Full vs Test Placefields\nVariance")
ax[1, 1].plot(torch.cumsum(test_variance_all, dim=0), c='b')
ax[1, 1].plot(torch.cumsum(test_variance_pos, dim=0), c='g')
ax[1, 1].set_title("Cumulative variance")
ax[2, 1].plot(torch.cumsum(test_variance_pos, dim=0) / torch.cumsum(test_variance_all, dim=0), c='g')
ax[2, 1].set_title("Subspace Ratio")
plt.show()

B2Session(mouse_name='ATL020', date='2023-04-11', session_id='701', spks_type='oasis')
torch.Size([7860, 3448]) torch.Size([7860, 3448]) torch.Size([7860, 1725]) torch.Size([7860, 200]) torch.Size([7860, 200])


In [80]:
# Re-score the already-fitted SVCA models: project the first train half (train0_data)
# and the test data onto the U components and redraw the comparison figure.
components_all, components_pos = svca_all.U, svca_pos.U
train_variance_all = (train0_data.T @ components_all).var(dim=0)
test_variance_all = (test_data.T @ components_all).var(dim=0)
train_variance_pos = (train0_data.T @ components_pos).var(dim=0)
test_variance_pos = (test_data.T @ components_pos).var(dim=0)

plt.close('all')
fig, ax = plt.subplots(3, 2, figsize=(10, 8), layout="constrained")

# One figure column per comparison: (baseline curve, compared curve, colors, top title).
panel_specs = (
    (train_variance_all, test_variance_all, 'k', 'b', "Comparing Train vs Test Full\nVariance"),
    (test_variance_all, test_variance_pos, 'b', 'g', "Comparing Test Full vs Test Placefields\nVariance"),
)
for panel_col, (baseline_var, compared_var, baseline_color, compared_color, panel_title) in enumerate(panel_specs):
    baseline_cumulative = baseline_var.cumsum(dim=0)
    compared_cumulative = compared_var.cumsum(dim=0)

    # Row 0: variance per component
    ax[0, panel_col].plot(baseline_var, c=baseline_color)
    ax[0, panel_col].plot(compared_var, c=compared_color)
    ax[0, panel_col].set_title(panel_title)

    # Row 1: cumulative variance over expanding subspaces
    ax[1, panel_col].plot(baseline_cumulative, c=baseline_color)
    ax[1, panel_col].plot(compared_cumulative, c=compared_color)
    ax[1, panel_col].set_title("Cumulative variance")

    # Row 2: ratio of compared to baseline cumulative variance
    ax[2, panel_col].plot(compared_cumulative / baseline_cumulative, c=compared_color)
    ax[2, panel_col].set_title("Subspace Ratio")
plt.show()

In [69]:
# SVCA variant of the subspace analysis.
# The population is divided into a source and a target group of cells. SVCA is fit
# between the two groups on the train split, once on the activity and once on the
# place fields of each group, and both fits are scored on train and test activity.

session = random.choice(
    sessiondb.iter_sessions(imaging=True, session_params=dict(spks_type="oasis"))
)
print(session)

train_split, test_split = registry.time_split["half0"], registry.time_split["half1"]
population, frame_behavior = registry.get_population(session, spks_type="oasis")

# Sizes of the two cell groups, then (source, target) data for each time split.
num_source_neurons = len(population.cell_split_indices[0])
num_target_neurons = len(population.cell_split_indices[1])
train_source, train_target = population.get_split_data(train_split)
test_source, test_target = population.get_split_data(test_split)
frame_behavior_train = frame_behavior.filter(
    population.get_split_times(train_split, within_idx_samples=False)
)

# Place fields of each cell group from the train split, over 100 evenly spaced bins
# of the first environment length.
num_bins = 100
dist_edges = np.linspace(0, session.env_length[0], num_bins + 1)
placefield_kwargs = dict(dist_edges=dist_edges, average=True, smooth_width=5.0)
placefield_source = get_placefield(train_source.T.numpy(), frame_behavior_train, **placefield_kwargs)
placefield_target = get_placefield(train_target.T.numpy(), frame_behavior_train, **placefield_kwargs)
# Reshape so that neurons are on the first axis for each group.
placefield_source_extended = torch.tensor(placefield_source.placefield).reshape(-1, num_source_neurons).T
placefield_target_extended = torch.tensor(placefield_target.placefield).reshape(-1, num_target_neurons).T

# Number of components is bounded by the smallest dimension among all fitted matrices.
shape_limits = (
    *train_source.shape,
    *train_target.shape,
    *placefield_source_extended.shape,
    *placefield_target_extended.shape,
)
num_components = min(*shape_limits)

centered = True
svca_all = SVCA(centered=centered, num_components=num_components).fit(train_source, train_target)
svca_pos = SVCA(centered=centered, num_components=num_components).fit(
    placefield_source_extended, placefield_target_extended
)

# Only the first element returned by score() is kept and plotted below.
sv_all_train = svca_all.score(train_source, train_target)[0]
sv_all_test = svca_all.score(test_source, test_target)[0]
sv_pos_train = svca_pos.score(train_source, train_target)[0]
sv_pos_test = svca_pos.score(test_source, test_target)[0]

plt.close('all')
fig, ax = plt.subplots(3, 2, figsize=(10, 8), layout="constrained")

# One figure column per comparison: (baseline curve, compared curve, colors, top title).
panel_specs = (
    (sv_all_train, sv_all_test, 'k', 'b', "Comparing Train vs Test Full\nVariance"),
    (sv_all_test, sv_pos_test, 'b', 'g', "Comparing Test Full vs Test Placefields\nVariance"),
)
for panel_col, (baseline_sv, compared_sv, baseline_color, compared_color, panel_title) in enumerate(panel_specs):
    baseline_cumulative = baseline_sv.cumsum(dim=0)
    compared_cumulative = compared_sv.cumsum(dim=0)

    # Row 0: score per component
    ax[0, panel_col].plot(baseline_sv, c=baseline_color)
    ax[0, panel_col].plot(compared_sv, c=compared_color)
    ax[0, panel_col].set_title(panel_title)

    # Row 1: cumulative score over expanding subspaces
    ax[1, panel_col].plot(baseline_cumulative, c=baseline_color)
    ax[1, panel_col].plot(compared_cumulative, c=compared_color)
    ax[1, panel_col].set_title("Cumulative variance")

    # Row 2: ratio of compared to baseline cumulative score
    ax[2, panel_col].plot(compared_cumulative / baseline_cumulative, c=compared_color)
    ax[2, panel_col].set_title("Subspace Ratio")
plt.show()

B2Session(mouse_name='ATL058', date='2024-07-18', session_id='701', spks_type='oasis')


In [None]:
# This block runs a simulation for how the PCA version of the subspace analysis will work if
# the train and test data are generated from slightly different subspaces.

num_samples = 100000
num_neurons = 100
num_components = 100

# Ground-truth eigenspectrum: geometric decay, 1.2 ** -k for component k.
eigenvalues = 1.2 ** -torch.arange(num_components)

# Train and test bases are the Q factors of a weighted mix of one shared random matrix and
# an independent random matrix each; same_fraction is the weight on the shared part.
same_fraction = 0.5
_ev_shared = torch.randn(num_neurons, num_neurons)
eigenvectors_train = torch.linalg.qr(same_fraction * _ev_shared + (1-same_fraction) * torch.randn(num_neurons, num_neurons)).Q[:, :num_components]
eigenvectors_test = torch.linalg.qr(same_fraction * _ev_shared + (1-same_fraction) * torch.randn(num_neurons, num_neurons)).Q[:, :num_components]

# Generate random data (neurons x samples): standard-normal latents scaled by
# sqrt(eigenvalue) and mapped through the corresponding basis.
train_data = eigenvectors_train @ torch.diag(torch.sqrt(eigenvalues)) @ torch.randn(num_components, num_samples)
test_data = eigenvectors_test @ torch.diag(torch.sqrt(eigenvalues)) @ torch.randn(num_components, num_samples)

# Measure the eigenspectrum of the train data. eigh returns eigenvalues in ascending
# order, so flip them to match the descending ground-truth spectrum.
train_data_cov = train_data @ train_data.T / num_samples
eigenvalues_train = torch.flip(torch.linalg.eigh(train_data_cov).eigenvalues, dims=(0,))

# Plot eigenspectrum. Each plot in this cell gets its own figure so that drawing the
# second one does not require closing the first.
plt.close('all')
fig_spectrum, ax_spectrum = plt.subplots()
ax_spectrum.plot(eigenvalues, c='k', label="generating eigenvalues")
ax_spectrum.plot(eigenvalues_train, c='b', label="train covariance eigenvalues")
ax_spectrum.set_xlabel("Component")
ax_spectrum.set_ylabel("Eigenvalue")
ax_spectrum.set_title("Eigenspectrum")
ax_spectrum.legend()

# Now run PCA on train data and measure total variance in each expanding subspace
train_pca = PCA(num_components=num_components).fit(train_data)
components = train_pca.get_components()

# The variance captured by the first k components is the sum of the per-component
# variances, so project once and take a cumulative sum instead of re-projecting for
# every prefix of the component matrix.
exp_variance = torch.cumsum(torch.var(components.T @ train_data, dim=1), dim=0)
cv_variance = torch.cumsum(torch.var(components.T @ test_data, dim=1), dim=0)

# If the subspaces differ, the blue curve should take a while to catch up to the black curve.
# NOTE(review): both curves are shifted by +2 as in the original plot; the reason for the
# offset is not visible here -- confirm it is intended.
fig_subspace, ax_subspace = plt.subplots()
ax_subspace.plot(exp_variance+2, c='k', label="train data")
ax_subspace.plot(cv_variance+2, c='b', label="test data")
ax_subspace.set_xlabel("Number of components")
ax_subspace.set_ylabel("Cumulative variance (+2)")
ax_subspace.set_title("Variance in expanding train-PCA subspaces")
ax_subspace.legend()
plt.show()