#### This script is part of the control analysis part of the project and it is used to check if we can consider all the trajectories in the dataset to come from the same distribution. Hence, for each trajectory we check that all the values of each dimension are part of the distributions that can be considered to be part of the same distribution. For this purpose, K-S test is employed. 

## Imports

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy.io
import itertools as it
import scipy.special as psi
plt.style.use('classic')
import seaborn as sns
import pandas as pd
import time 

from scipy.io import loadmat
from scipy import stats
from numpy.random import seed
from numpy.random import rand
from scipy.integrate import quad
from scipy.io import savemat
from tempfile import TemporaryFile
from scipy.io import loadmat
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.decomposition import KernelPCA
from mpl_toolkits import mplot3d
from mPE_fn import mPE
from scipy.spatial import distance
from scipy.stats import entropy

## Load and Clean Data

In [3]:
modes = ['normal', 'drug', 'vehicle']
root_dir = '/rds/general/user/lr4617/home/4th_Year_Project/CAPTURE_rat_multidimensional/raw_data/'
# load entire high-dimensional trajectories
for mode in modes:
    trajs = os.listdir(root_dir + mode + '/')
    for traj_n in trajs:
        # loading entire high-dimensional trajectory
        path = root_dir + mode + '/' + traj_n + '/'
        trajectories = os.listdir(path)
        # removing NaN columns
        nan_cols = []
        for i, time_bin in enumerate(trajectories):
            if time_bin != 'behavs':
                trajectory = loadmat(path + time_bin)
                trajectory = trajectory['trajectory'] 
                for i in range(trajectory.shape[1]):
                    if np.isnan(trajectory[:, i]).all():
                        nan_cols.append(i)

        # create entire trajectory
        nan_cols = np.asarray(nan_cols)
        if nan_cols.size > 0:
            if len(np.where(nan_cols==nan_cols[0])[0])*3 == len(nan_cols):
                all_trajectories = np.zeros(((trajectory.shape[0]*int(len(trajectories)/sampling_factor)), trajectory.shape[1]-len(nan_cols)))
        else:
            all_trajectories = np.zeros(((trajectory.shape[0]*int(len(trajectories)/sampling_factor)), trajectory.shape[1]-len(nan_cols)))

        for i, time_bin in enumerate(trajectories):
            if time_bin != 'behavs':
                trajectory = loadmat(path + time_bin)
                trajectory = trajectory['trajectory'] 
                if nan_cols.size > 0:
                    trajectory = np.delete(trajectory, nan_cols, 1)
                idx = a*trajectory.shape[0]
                if i % sampling_factor == 0 and sampled_trajectories.shape[0]-idx >= trajectory.shape[0]:
                    all_trajectories[idx:idx+trajectory.shape[0], 0:all_trajectories.shape[1]] = trajectory
                    a+=1
        
        # convert nan to number when not it is a sparse recurrence (not an entire COLUMN)
        all_trajectories = np.nan_to_num(all_trajectories)
        lengths.append(all_trajectories.shape[0])

        # append trajectory to all trajectories
        if n==0:
            rats = all_trajectories
        if n>0:
            rats = np.concatenate((rats, all_trajectories), axis=0)

        print(rats.shape)

traj_5
traj_3
traj_4
traj_1
(432000, 57)


## K-S Test same dimensions

In [None]:
def probability(*argv):
    '''
    input: 
        - 1D sequence of rv observations
    return: 
        - probability vector
    '''
    n_args = len(argv)
    if n_args == 1:
        sequence = argv[0]
        decimals = 1
        
    if n_args == 2:
        sequence = argv[0]
        decimals = argv[1]
    
    if len(sequence.shape) > 1 and (sequence.shape[0] < sequence.shape[1]):
        sequence = np.transpose(sequence)
    
    # round input sequence to avoid sparse probability vector
    sequence = np.round(sequence, decimals)
    unique = np.unique(sequence, axis=0)
    n_unique = len(unique)

    # fill probability vector
    prob_vector = np.zeros((n_unique, 1))
    for row in sequence:
        if len(sequence.shape) > 1:
            occurrences = len(np.where(np.all(np.isclose(sequence, row), axis=1))[0])
            idx = np.where(np.all(np.isclose(unique, row), axis=1))[0][0]
        else:
            occurrences = len(np.where(np.isclose(sequence, row))[0])
            idx = np.where(np.isclose(unique, row))[0][0]
            
        if prob_vector[idx] == 0:
            prob_vector[idx] = occurrences/(sequence.shape[0])
            
    return prob_vector

In [6]:
# probability vectors calculation
print("CALCULATING PROBABILITY VECTORS")
dims = np.arange(0, rats.shape[1], 3)
which = np.zeros((1, len(dims)))
n_rats = len(lengths)
for dim in dims:
    for i in range(n_rats):
        sequence = rats[0:lengths[i], dim]
        prob_vector = probability(sequence)
        print(np.sum(prob_vector))
        if i == 0:
            probs = prob_vector
        else:
            probs = np.concatenate((probs, prob_vector), axis=1)

    # fill ks matrix 
    print("CALCULATING K-S MATRIX")
    # "ks_matrix" is quadratic and symmetric (A=A')
    ks_matrix = np.zeros((n_rats, n_rats))
    significance_level = 0.01
    for ii in range(n_rats):
        for jj in range(n_rats):
            _, p_value = stats.ks_2samp(probs[:, ii], probs[:, jj])
            if p_value<significance_level:
                ks_matrix[ii, jj] = 0
            elif p_value>=significance_level:
                ks_matrix[ii, jj] = 1
    which[dim] = np.all(ks_matrix == ks_matrix[0])

print(trajectories)
print(ks_matrix)

CALCULATING PROBABILITY VECTORS
1.0
1.0
1.0
1.0
CALCULATING K-S MATRIX
['traj_5', 'traj_3', 'traj_4', 'traj_2', 'traj_1']
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
