This notebook tries to quantify how much information is contained in different response windows, such as (0,100), (0,500), and (400,500), of the Gaya neural data recorded on the Tang images.

The methodology is similar to the one in the eLife sparse coding paper, using cross validation across trials.

The code is adapted from https://github.com/leelabcnbc/sparse-coding-elife2018/blob/master/decoding/decoder_aux.py

In [27]:
import numpy as np
from itertools import product

In [28]:
from thesis_v2.data.prepared.gaya import get_neural_data as get_neural_data_gaya

In [29]:
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from sklearn.metrics import accuracy_score

In [30]:
def get_data(start_offset, end_offset, normalize):
    """Load raw Gaya responses on the 'tang' dataset for one response window.

    Parameters
    ----------
    start_offset, end_offset
        Window boundaries, forwarded unchanged to ``get_neural_data_gaya``.
    normalize
        If truthy, divide every neuron's responses by that neuron's mean
        taken over the first two axes.

    Returns
    -------
    numpy.ndarray
        Array of shape (2250, 8, 34). Downstream code in this notebook
        unpacks these axes as (stimulus, trial, neuron).
    """
    raw_responses = get_neural_data_gaya(
        dataset='tang',
        return_raw=True,
        start_offset=start_offset,
        end_offset=end_offset,
    )
    # For simplicity keep only the first 8 trials of every entry, so that
    # the entries stack into one regular array.
    responses = np.asarray([entry[:8] for entry in raw_responses])
    # Sanity check on the expected layout (2250 x 8 trials x 34 neurons).
    assert responses.shape == (2250, 8, 34)

    if not normalize:
        return responses

    # One mean per neuron, pooled over the first two axes.
    per_neuron_mean = responses.mean(axis=(0, 1))
    assert per_neuron_mean.shape == (34,)
    return responses / per_neuron_mean

In [31]:
def get_flat_data_label_and_group(data_all):
    """Flatten (n_stim, max_trial, n_neuron) data into per-sample rows.

    This is a rewrite of `get_n_trial_tang_data` from
    <https://github.com/leelabcnbc/sparse-coding-tang/blob/master/sct_code_python/preprocessing.py>

    Rows are ordered trial-major, matching the old reshaping convention:
    all stimuli of trial 0 first, then all stimuli of trial 1, and so on.

    Returns
    -------
    x_flat : ndarray, shape (n_stim * max_trial, n_neuron)
    y_label : ndarray, shape (n_stim * max_trial,)
        Stimulus index of each row.
    y_group : ndarray, shape (n_stim * max_trial,)
        Trial index of each row.
    """
    n_stim, max_trial, n_neuron = data_all.shape

    # Index grids of shape (max_trial, n_stim); raveling them in C order
    # yields the trial-major ordering described above.
    trial_grid, stim_grid = np.indices((max_trial, n_stim))
    y_group = trial_grid.ravel()
    y_label = stim_grid.ravel()

    # Iterating the transposed array yields one (n_stim, n_neuron) slab per trial.
    per_trial_slabs = tuple(data_all.transpose(1, 0, 2))
    x_flat = np.concatenate(per_trial_slabs, axis=0)

    return x_flat, y_label, y_group

In [32]:
def classification_wrapper_one_Xy(X, y, y_group, n_jobs=-1):
    """Cross-validated nearest-centroid predictions for one (X, y) pair.

    Parameters
    ----------
    X : samples, one row per sample.
    y : class label of every sample; must have the same length as ``X``.
    y_group : group id of every sample; each group is held out once.
    n_jobs : forwarded to ``cross_val_predict`` (default -1).

    Returns
    -------
    The predicted label for every sample. Only predictions are returned,
    no score.
    """
    assert len(X) == len(y)
    # Splits hold out one group at a time.
    splits = _parse_cv_obj(X, y, y_group)
    return _classifier_return_pred(X, y, splits, NearestCentroid(), n_jobs)

def _classifier_return_pred(X, y, cv_obj, classifier, n_jobs):
    """Run ``cross_val_predict`` and return one prediction per sample in ``y``."""
    predictions = cross_val_predict(classifier, X, y, cv=cv_obj, n_jobs=n_jobs)
    # Sanity check: exactly one prediction for every label.
    assert y.shape == predictions.shape
    return predictions

def _parse_cv_obj(X, y, y_group):
    """Return leave-one-group-out splits of (X, y), grouped by ``y_group``.

    The result is the iterator produced by ``LeaveOneGroupOut().split``,
    so it can be consumed only once.
    """
    splitter = LeaveOneGroupOut()
    return splitter.split(X, y, y_group)

In [33]:
def everything_for_one(start_offset, end_offset, normalize):
    """Decode stimulus identity for one response window and report accuracy.

    Loads the data for the window, flattens it into samples with stimulus
    labels and trial groups, obtains cross-validated predictions, and
    computes the fraction of correctly predicted samples.

    Parameters
    ----------
    start_offset, end_offset
        Window boundaries, forwarded to ``get_data``.
    normalize
        Whether ``get_data`` should normalize each neuron by its mean.

    Returns
    -------
    The accuracy over all samples. The setting and the accuracy are also
    printed as a side effect.
    """
    print(f'{start_offset}-{end_offset}, normalize {normalize}')
    data_all = get_data(start_offset, end_offset, normalize)
    x_flat, y_label, y_group = get_flat_data_label_and_group(data_all)
    y_pred = classification_wrapper_one_Xy(x_flat, y_label, y_group)
    # accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the
    # value is unchanged, but the documented order keeps this correct if the
    # metric is ever swapped for an asymmetric one.
    s = accuracy_score(y_label, y_pred)
    # Cross-check against a direct computation of the hit rate.
    assert s == (y_pred == y_label).mean()
    print(s)
    return s

In [34]:
import pandas as pd

In [38]:
def everything():
    """Sweep every response window with and without normalization.

    Returns
    -------
    pandas.DataFrame
        One row per (window, normalize) setting, in sweep order, with
        columns 'start', 'end', 'normalize' and 'accuracy'.
    """
    # Long windows first, then consecutive 100-wide windows.
    windows = [
        (0, 500), (0, 400), (100, 500),
        (0, 100), (100, 200), (200, 300), (300, 400), (400, 500),
    ]
    normalize_options = [True, False]

    records = [
        {
            'start': start,
            'end': end,
            'normalize': normalize,
            'accuracy': everything_for_one(start, end, normalize),
        }
        for (start, end), normalize in product(windows, normalize_options)
    ]
    return pd.DataFrame(records, columns=['start', 'end', 'normalize', 'accuracy'])

In [39]:
# Run the full sweep; every (window, normalize) setting prints its accuracy as it finishes.
dataframe = everything()

0-500, normalize True
0.6485
0-500, normalize False
0.7176111111111111
0-400, normalize True
0.6212222222222222
0-400, normalize False
0.7049444444444445
100-500, normalize True
0.5489444444444445
100-500, normalize False
0.6134444444444445
0-100, normalize True
0.3262777777777778
0-100, normalize False
0.5516666666666666
100-200, normalize True
0.25483333333333336
100-200, normalize False
0.4062222222222222
200-300, normalize True
0.18266666666666667
200-300, normalize False
0.2807222222222222
300-400, normalize True
0.12527777777777777
300-400, normalize False
0.19361111111111112
400-500, normalize True
0.10355555555555555
400-500, normalize False
0.1455


In [40]:
dataframe
# Observations from the accuracies above:
# 1. Normalizing (dividing each neuron by its mean response) is bad: it lowers accuracy in every window.
# 2. Later response windows carry less information: accuracy drops steadily from 0-100 to 400-500.
# 3. The full response (0-500) carries the most information: it has the highest accuracy.

Unnamed: 0,start,end,normalize,accuracy
0,0,500,True,0.6485
1,0,500,False,0.717611
2,0,400,True,0.621222
3,0,400,False,0.704944
4,100,500,True,0.548944
5,100,500,False,0.613444
6,0,100,True,0.326278
7,0,100,False,0.551667
8,100,200,True,0.254833
9,100,200,False,0.406222
