In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC  
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification  
from matplotlib import cm
from os.path import exists
import mat73


## Grid search linear SVC
* Search over SVC penalty values (`C`) for a binary classifier that decodes whether the stimulus on each trial was the likely or the unlikely one

In [None]:
# basic info about the model run (these fields also build the data file name below)
prob_split = '70_30'
afc = 2
coh = 'hi'

# which layer do we want? 
layer = 1

# average firing rate over this time window post stimulus
# this is in units of model time-steps and is used as a [start:stop] slice
# NOTE(review): a stop of -1 leaves out the final time step - confirm this is intended
t_win = [ 200,-1 ]

# number of cv folds
n_cvs = 5 # performance goes up with fewer cvs? looked at 3

# store the accuracy (one slot per cv fold, NaN until filled in)
acc = np.full( ( n_cvs ), np.nan )

# penalties to eval: num_cgs values log-spaced between 1e-5 and 1e1
num_cgs = 30
Cs = np.logspace( -5,1,num_cgs )

# set up the grid
param_grid = { 'C': Cs, 'kernel': ['linear'] }

# define object - use a SVC that balances class weights (because they are biased, e.g. 70/30)
# note that can also specify cv folds here, but I'm doing it by hand below in a loop
# refit=True means the best C is refit on all data passed to grid.fit
grid = GridSearchCV( SVC(class_weight = 'balanced'),param_grid,refit=True,verbose=0 )

# name of the data file, e.g. 'out_70_30_2AFC_hi_coh.mat'
f_data = f'out_{prob_split}_{afc}AFC_{coh}_coh.mat'

# read the file once and pull both variables from the same loaded contents
# (previously the file was parsed twice, once per variable)
mat_contents = mat73.loadmat(f_data)

# get the data from the layer that we want
# this is a [trial x time step x unit] matrix
data = mat_contents[f'fr{layer}']

# the labels (one for each trial)
labs = mat_contents['labs']

In [92]:
# avg over time window -> [trial x unit]
# only do this while data still has its time axis, so that re-running this
# cell does not try to slice the already-averaged 2D array with 3 indices
if data.ndim == 3:
    data = np.mean( data[ :,t_win[0]:t_win[1],: ], axis = 1 )

# get some info about structure of the data
tris = data.shape[0]             # number of trials
tri_ind = np.arange(0,tris)      # list from 0...tris
hold_out = int( tris / n_cvs )   # nominal number of trials to hold out per fold

# contiguous test folds that together cover every trial; when tris is not a
# multiple of n_cvs the first few folds get one extra trial instead of the
# leftover trials never being tested
folds = np.array_split( tri_ind, n_cvs )

# fresh accuracy store sized to the current n_cvs, so no values from an
# earlier run (or an earlier n_cvs) leak into the mean below
acc = np.full( n_cvs, np.nan )

# loop over cvs and do classification
for i, tst_ind in enumerate( folds ):

    # index into the training data on this cv fold (everything not held out)
    trn_ind = np.setdiff1d( tri_ind, tst_ind )

    # get the training data (X) and the training labels (y)
    # note that y is unbalanced unless prob is 50/50
    # HK verified: SVC(class_weight = 'balanced') follows the formula
    # n / (k * n_i) for class i
    X = data[trn_ind,:]
    y = labs[trn_ind]

    # fit the model (grid search over C, then refit with the best C)
    grid.fit( X,y )

    # progress report
    print(f'CV: {i}, {grid.best_estimator_}')

    # get the test data (X) and the test labels (y)
    X_test = data[tst_ind, :]
    y_test = labs[tst_ind]

    # score the refit best estimator on the held-out trials
    acc[ i ] = grid.score( X_test,y_test )


print( np.mean( acc ) )


CV: 0, SVC(C=0.0030391953823131978, class_weight='balanced', kernel='linear')
CV: 1, SVC(C=0.004893900918477494, class_weight='balanced', kernel='linear')
CV: 2, SVC(C=0.0030391953823131978, class_weight='balanced', kernel='linear')
0.9494949494949495


In [86]:
# inspect the per-class weights that the refit best estimator ended up with
# (reflects whichever data the most recent grid.fit call was given)
class_weights = grid.best_estimator_.class_weight_
best_svc = grid.best_estimator_
class_weights

array([0.70175439, 1.73913043])

In [87]:
# try it with manual class weights
# weight for class i = n / (k * n_i), with n_i taken from the nominal split
# encoded in prob_split (e.g. '70_30' -> 70 and 30) rather than hard-coded,
# so the weights follow the config if the split changes
split_pcts = [ int(p) for p in prob_split.split('_') ]
n_total = sum( split_pcts )      # n
n_classes = len( split_pcts )    # k
lab1 = n_total/(split_pcts[0]*n_classes)
lab2 = n_total/(split_pcts[1]*n_classes)

# assumes label 1 is the first (more likely) part of the split and label 2
# the second - TODO confirm against the values in labs
custom_cw = {1: lab1, 2: lab2}

# note: this rebinds `grid`, so any cell run after this one uses the manual
# weights rather than class_weight = 'balanced'
grid = GridSearchCV( SVC(class_weight = custom_cw),param_grid,refit=True,verbose=0 )

In [88]:
# bookkeeping for the hand-rolled cv split
tris = len( data )            # trial count (first axis of data)
tri_ind = np.arange( tris )   # trial indices 0 .. tris-1
hold_out = tris // n_cvs      # test-set size per fold (floor of tris / n_cvs)

In [89]:
# contiguous test folds that together cover every trial; when tris is not a
# multiple of n_cvs the first few folds get one extra trial instead of the
# leftover trials never being tested
folds = np.array_split( tri_ind, n_cvs )

# fresh accuracy store sized to the current n_cvs, so no values from an
# earlier loop are mixed into the mean below
acc = np.full( n_cvs, np.nan )

for i, tst_ind in enumerate( folds ):

    # index into the training data on this cv fold (everything not held out)
    trn_ind = np.setdiff1d( tri_ind, tst_ind )

    # get the training data (X) and the training labels (y)
    # note that y is unbalanced unless prob is 50/50; the class weights
    # configured on `grid` are what compensate for that
    X = data[trn_ind,:]
    y = labs[trn_ind]

    # fit the model (grid search over C, then refit with the best C)
    grid.fit( X,y )

    # progress report
    print(f'CV: {i}, {grid.best_estimator_}')

    # get the test data (X) and the test labels (y)
    X_test = data[tst_ind, :]
    y_test = labs[tst_ind]

    # score the refit best estimator on the held-out trials
    acc[ i ] = grid.score( X_test,y_test )


print( np.mean( acc ) )

CV: 0, SVC(C=0.0030391953823131978,
    class_weight={1: 0.7142857142857143, 2: 1.6666666666666667},
    kernel='linear')
CV: 1, SVC(C=0.004893900918477494,
    class_weight={1: 0.7142857142857143, 2: 1.6666666666666667},
    kernel='linear')
CV: 2, SVC(C=0.0030391953823131978,
    class_weight={1: 0.7142857142857143, 2: 1.6666666666666667},
    kernel='linear')
CV: 3, SVC(C=6.2101694189156165,
    class_weight={1: 0.7142857142857143, 2: 1.6666666666666667},
    kernel='linear')
CV: 4, SVC(C=0.004893900918477494,
    class_weight={1: 0.7142857142857143, 2: 1.6666666666666667},
    kernel='linear')
0.9400000000000001
