## Ext. Data Figure 9: Model Simulations Recapitulate Characteristics of Real Data

This notebook recreates the figure panels included in Extended Data Figure 9 of [Bolkan, Stone et al 2021](https://www.biorxiv.org/content/10.1101/2021.07.23.453573v1). 

The general premise of this notebook/figure, in the context of the paper, is to show that fitting the model to simulated datasets produces results similar to those obtained from the real data, in terms of the transition probabilities and state characteristics. This serves as a sanity check to ensure that the model can indeed capture fundamental properties of the real data in its simulations. 

### Simulate data
#### Import the required code packages and modules

In [4]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import pickle
from glmhmm import glm, glm_hmm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Load the model objects from fitting the GLM-HMM to the real data

In [5]:
# Load the GLM-HMM objects that were previously fit to the real data, one per cohort.
# The context managers close each file handle as soon as unpickling finishes
# (a bare pickle.load(open(...)) leaves the handle open until garbage collection).
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
with open('fit models/GLMHMM_d2.pickle','rb') as model_file:
    GLMHMM_d2 = pickle.load(model_file) # indirect pathway cohort
with open('fit models/GLMHMM_d1.pickle','rb') as model_file:
    GLMHMM_d1 = pickle.load(model_file) # direct pathway cohort

#### Load the real data

In [6]:
# Real data for the indirect pathway cohort, unpacked in this order:
#   z-scored design matrix, vector of right and left choices for each trial,
#   vector of session start and stop indices, vector of mouse IDs for each trial,
#   vector of trial types (which side rewarded) for each trial
x_d2, y_d2, sessions_d2, mouseIDs_d2, trialTypes_d2 = map(np.load, (
    'data/indirect_x.npy',
    'data/indirect_y.npy',
    'data/indirect_sessions.npy',
    'data/indirect_mouseIDs.npy',
    'data/indirect_trialTypes.npy',
))

# Real data for the direct pathway cohort (same five arrays, same order)
x_d1, y_d1, sessions_d1, mouseIDs_d1, trialTypes_d1 = map(np.load, (
    'data/direct_x.npy',
    'data/direct_y.npy',
    'data/direct_sessions.npy',
    'data/direct_mouseIDs.npy',
    'data/direct_trialTypes.npy',
))

#### Simulate five datasets

In [7]:
num_sims = 5 # number of simulated datasets to generate per cohort

# Fix the random seed so that Restart & Run All regenerates the same simulated datasets.
# NOTE(review): this assumes glmhmm draws its samples from NumPy's global RNG -- confirm.
np.random.seed(0)

# Preallocate one slot per simulation; each simulated dataset has the same size as the real one.
simulated_x_d2 = np.zeros((num_sims,x_d2.shape[0],x_d2.shape[1]))
simulated_y_d2 = np.zeros((num_sims,y_d2.shape[0]))
simulated_x_d1 = np.zeros((num_sims,x_d1.shape[0],x_d1.shape[1]))
simulated_y_d1 = np.zeros((num_sims,y_d1.shape[0]))

# Simulate from each cohort's fitted weights (w) and transition matrix (A), using the real
# design matrix, sessions and trial types as inputs. The third return value is discarded.
# (See generate_data_from_fit for the meaning of obs_ix and replace.)
for i in range(num_sims):
    ## indirect pathway cohort --------------------------------------------------------
    simulated_x_d2[i], simulated_y_d2[i], _ = GLMHMM_d2.generate_data_from_fit(GLMHMM_d2.w,GLMHMM_d2.A,x_d2,
                                                                             obs_ix=[3,9,10],replace=True,
                                                                             sessions=sessions_d2,
                                                                             outcomes=trialTypes_d2)
    ## direct pathway cohort --------------------------------------------------------
    simulated_x_d1[i], simulated_y_d1[i], _ = GLMHMM_d1.generate_data_from_fit(GLMHMM_d1.w,GLMHMM_d1.A,x_d1,
                                                                             obs_ix=[3,9,10],replace=True,
                                                                             sessions=sessions_d1,
                                                                             outcomes=trialTypes_d1)

### Fit GLM-HMMs to the simulated data
#### Set the hyperparameters

In [9]:
# Model dimensions used when constructing the GLM-HMMs below
N_d2, D = x_d2.shape[:2] # data/time points for the indirect pathway cohort, number of GLM inputs (regressors)
N_d1 = len(x_d1)         # data/time points for the direct pathway cohort
K, C = 3, 2              # number of latent states, number of observation classes

This will take about 20 hours to run in this notebook (5 simulated datasets x 2 cohorts x 2 hours per fit). Best to parallelize this and run each cohort and fit separately if you don't want to wait so long!

In [None]:
inits = 20 # set the number of initializations
maxiter = 250 # maximum number of iterations of EM to allow for each fit
tol = 1e-3 # tolerance handed to fit() as its `tol` argument

# NOTE(review): find_best_fit is not imported in the cells shown above; make sure it is
# imported (presumably from the glmhmm package -- confirm the module) before running this cell.

# store the best model object for each simulated dataset
best_GLMHMMs_d2 = np.zeros((num_sims), dtype=object)
best_GLMHMMs_d1 = np.zeros((num_sims), dtype=object)

for j in range(num_sims):

    # the j-th simulated dataset for each cohort -- these, not the real data, are what we fit
    # NOTE(review): simulated_y_* rows are 1-D float arrays; confirm fit() accepts that layout.
    sim_x_d2, sim_y_d2 = simulated_x_d2[j], simulated_y_d2[j]
    sim_x_d1, sim_y_d1 = simulated_x_d1[j], simulated_y_d1[j]

    # store values for each initialization (one row of log-likelihoods per initialization;
    # the row width follows maxiter so the two cannot drift apart)
    lls_all_d2 = np.zeros((inits,maxiter))
    GLMHMMs_d2 = np.zeros((inits),dtype=object)
    lls_all_d1 = np.zeros((inits,maxiter))
    GLMHMMs_d1 = np.zeros((inits),dtype=object)

    # fit the model for each initialization
    for i in range(inits):
        ## indirect pathway cohort --------------------------------------------------------
        GLMHMMs_d2[i] = glm_hmm.GLMHMM(N_d2,D,C,K,observations="bernoulli",gaussianPrior=1)
        A_init,w_init,_ = GLMHMMs_d2[i].generate_params(weights=['GLM',-0.2,1.2,sim_x_d2,sim_y_d2,1])
        lls_all_d2[i,:],_,_,_ = GLMHMMs_d2[i].fit(sim_y_d2,sim_x_d2,A_init,w_init,maxiter=maxiter,tol=tol,sess=sessions_d2)

        ## direct pathway cohort ----------------------------------------------------------
        GLMHMMs_d1[i] = glm_hmm.GLMHMM(N_d1,D,C,K,observations="bernoulli",gaussianPrior=1)
        A_init,w_init,_ = GLMHMMs_d1[i].generate_params(weights=['GLM',-0.2,1.2,sim_x_d1,sim_y_d1,1])
        lls_all_d1[i,:],_,_,_ = GLMHMMs_d1[i].fit(sim_y_d1,sim_x_d1,A_init,w_init,maxiter=maxiter,tol=tol,sess=sessions_d1)

    # find the initialization that led to the best fit
    bestix_d2 = find_best_fit(lls_all_d2)
    best_GLMHMMs_d2[j] = GLMHMMs_d2[bestix_d2]
    bestix_d1 = find_best_fit(lls_all_d1)
    best_GLMHMMs_d1[j] = GLMHMMs_d1[bestix_d1] # was `real_GLMHMMs_d1`, a name that is never defined

# save results in case we want to load them again later
# (context managers make sure the files are flushed and closed)
with open('fit models/simulated_GLMHMMs_d2.pickle', 'wb') as out_file:
    pickle.dump(best_GLMHMMs_d2, out_file)
with open('fit models/simulated_GLMHMMs_d1.pickle', 'wb') as out_file:
    pickle.dump(best_GLMHMMs_d1, out_file)

### Ext. Data Figure 9A/B: Compare the Transition Probabilities