## SVM hyperparameter optimization
Hyperparameter (C) optimization for linear support vector machine

In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

from scipy import stats
import statsmodels.api as sm

import pingouin as pg

import matplotlib as mpl
import matplotlib.pyplot as plt  

from datetime import date
import time

from joblib import Parallel, delayed

import h5py

In [2]:
# no top and right spines in all plots
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False

In [3]:
mother_path = Path('D:/Multi-modal project/')
save_path = mother_path / 'analysis' / 'result' / '7. Population decoding'
os.makedirs(save_path,exist_ok=True)

fig_path = mother_path / 'analysis' / 'result' / 'figures'

### Parameter setting

In [4]:
# colors for multimodal, vis-only, aud-only conditions
color = ['tab:purple','tab:blue','tab:red','tab:green']
color2 = ['cyan','magenta','gray']
linestyle = ['-',':']

today = str(date.today())

### Data preparation

In [5]:
cell_path = mother_path/'analysis'/'result'/'zFR export'/'13-Jun-2023'
cell_list = os.listdir(cell_path)

# load hdf5 files containing shuffled results
data_path = mother_path /'analysis'/'result'/'3. ANOVA'/'2023-07-10'
data2_path = mother_path /'analysis'/'result'/'3.1 Multiple linear regression'/'2023-07-10'
f = h5py.File(data_path/'2023-07-10_ANOVA_result.hdf5','r')
f2 = h5py.File(data2_path/'2023-07-10_multiple_regression_result.hdf5','r')

In [6]:
def h5_to_df(f):
    keys = list(f.keys())
    attributes = {}
    
    for key in keys:
        attributes[key] = dict(f[key].attrs.items())
    
    df = pd.DataFrame.from_dict(attributes, orient='index')
    df.index.name = 'key'
    df = df.reset_index()
    
    return df

In [7]:
df = h5_to_df(f)
df = df[df['region']=='PER']
df.reset_index(inplace=True, drop=True)

for key in df['key']:
    if df.loc[df['key']==key, 'object cell'].item() == 0:
        df.loc[df['key']==key, 'category'] = 'none'
    else:
        if f2[key].attrs['response cell'] == 0:
            df.loc[df['key']==key, 'category'] = 'object'
        else:
            df.loc[df['key']==key, 'category'] = 'response'

### Check the number of neurons available for population analysis

In [8]:
rat_list = np.sort(df['rat'].unique())
print(rat_list)

['600' '602' '640' '647' '654' '679' '699']


In [9]:
# none cell + object cell (response cell excluded)
cell_array = np.zeros((len(rat_list),int(df['session'].max())))
for i in range(len(df)):
    rat = df.loc[i, 'rat']
    rat_id = np.where(rat_list==rat)[0][0]
    ss = int(df.loc[i, 'session'])

    if df.loc[i,'category'] != 'response':
        cell_array[rat_id, ss-1] += 1

print('===== Object cells + None =====')
print(cell_array)

===== Object cells + None =====
[[ 8. 11.  3. 11.  7.  0.  0.]
 [26. 19. 28. 16. 16.  0.  0.]
 [10. 13.  9.  4.  5.  0.  0.]
 [ 0.  0.  0. 15. 10.  8.  4.]
 [ 7.  4.  0.  0.  1.  0.  0.]
 [ 0.  2.  2.  3.  0.  1.  0.]
 [ 9.  9. 16. 15. 17.  7.  0.]]


In [10]:
# only object cell (response cell, none cell excluded)
cell_array = np.zeros((len(rat_list),int(df['session'].max())))
for i in range(len(df)):
    rat = df.loc[i, 'rat']
    rat_id = np.where(rat_list==rat)[0][0]
    ss = int(df.loc[i, 'session'])
    
    if df.loc[i,'category'] == 'object':
        cell_array[rat_id, ss-1] += 1

print('====== Only object cells ======')
print(cell_array)

[[ 5.  6.  2.  4.  5.  0.  0.]
 [15.  6. 10.  5.  8.  0.  0.]
 [ 7.  9.  4.  3.  3.  0.  0.]
 [ 0.  0.  0. 10.  4.  4.  4.]
 [ 3.  2.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  1.  0.  0.  0.]
 [ 4.  3.  6.  3.  8.  3.  0.]]


In [11]:
num_obj = np.sum(cell_array,axis=1)
for i in range(len(rat_list)):
    print(f'LE{rat_list[i]}: {int(num_obj[i])}')

LE600: 22
LE602: 44
LE640: 26
LE647: 22
LE654: 6
LE679: 2
LE699: 27


---

## Support vector machine

In [12]:
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [50]:
def subsample(df, rat, num_trial, bin_size, shuffle):
    scaler = StandardScaler()
    
    num_bin = int(90/bin_size)
    object_cell_list = df.loc[(df['rat']==rat)&(df['category']=='object'),'key'].to_numpy()
    
    # M by N array of subsampled firing rates
    # M = stimulus condition * number of subsampled trials
    # N = number of cells * number of time bins
    X = np.zeros((6*num_trial, len(object_cell_list)*num_bin))
    
    for i,cell_key in enumerate(object_cell_list):
    
        # load trial-by-trial firing rate data of a cell
        key = cell_key.zfill(4)
        cell_filename = list(cell_path.glob(f'{key}*.csv'))
        df_cell = pd.read_csv(cell_filename[0])
        df_cell.drop(df_cell[df_cell.Correctness==0].index,inplace=True)
        df_cell.reset_index(inplace=True,drop=True)
        
        # shuffling stimulus condition for permutation test
        if shuffle:
            cond_array = df_cell[['Type','RWD_Loc']].to_numpy()
            shuffled_cond_array = np.random.permutation(cond_array)
            df_cell[['Type','RWD_Loc']] = shuffled_cond_array
        
        # goal location (left or right) of each object condition
        boy_goal = df_cell.loc[df_cell['Visual']=='Boy','RWD_Loc'].values[0]
        egg_goal = df_cell.loc[df_cell['Visual']=='Egg','RWD_Loc'].values[0]

        # stimulus condition (identity x modality)
        cond = [(df_cell['Type']=='Multimodal')&(df_cell['RWD_Loc']==boy_goal),
                (df_cell['Type']=='Visual')&(df_cell['RWD_Loc']==boy_goal),
                (df_cell['Type']=='Auditory')&(df_cell['RWD_Loc']==boy_goal),
                (df_cell['Type']=='Multimodal')&(df_cell['RWD_Loc']==egg_goal),
                (df_cell['Type']=='Visual')&(df_cell['RWD_Loc']==egg_goal),
                (df_cell['Type']=='Auditory')&(df_cell['RWD_Loc']==egg_goal)]
    
        fr_id = df_cell.columns.get_loc('Var10')  # get the index of the first firing rate column
        FR = df_cell.iloc[:,fr_id:].to_numpy()    # convert firing rate data to numpy array
    
        # recalculate mean firing rates of each time bin
        FR_bin = np.zeros((len(df_cell),num_bin))
        for b in range(num_bin):
            FR_bin[:,b] = FR[:,b*bin_size:(b+1)*bin_size].mean(axis=1)
        
        for c in range(len(cond)):
            # subsample n trials from each stimulus condition
            subsampled_trial = df_cell[cond[c]].sample(num_trial).index
            # firing rates of subsampled trials
            FR_sub = FR_bin[subsampled_trial,:]
            # add subsampled firing rate data to the result array
            X[num_trial*c:num_trial*(c+1), num_bin*i:num_bin*(i+1)] = FR_sub
    
    X_norm = scaler.fit_transform(X)
        
    return X_norm

### Hyperparameter optimization
Using GridSearchCV, find the best C parameter in linear support vector machine

In [15]:
# pre-determined hyperparameter parameter list
#params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]},
#               {'kernel': ['linear'], 'C': [1e-5,1e-4,10e-3,10e-2,10e-1,1,10,100,1000,10000]}]
params_grid = [{'kernel': ['linear'], 'C': [1e-8,1e-7,1e-6,1e-5,1e-4,10e-3,10e-2,10e-1,1]}]
grid_search = GridSearchCV(svm.SVC(),params_grid,cv=5,scoring='accuracy')

In [16]:
%%time

rat_list = ['600', '602', '640', '647', '699']

num_iter = 1000
Y = np.repeat(np.arange(6), 5)

best_c = np.zeros((num_iter, len(rat_list)))
best_score = np.zeros((num_iter, len(rat_list)))

for i,rat in enumerate(rat_list):
    loop_start = time.time()
    for iter_run in range(num_iter):
        X = subsample(df, rat, 5, 10)
        grid_search.fit(X, Y)
        best_c[iter_run,i] = grid_search.best_estimator_.C
        best_score[iter_run,i] = grid_search.best_score_
    loop_end = time.time()
    loop_time = divmod(loop_end-loop_start,60)
    print(f'LE{rat} completed /// Elapsed time: {int(loop_time[0])} min {loop_time[1]:.2f} sec')

LE600 completed /// Elapsed time: 5 min 4.32 sec
LE602 completed /// Elapsed time: 9 min 13.74 sec
LE640 completed /// Elapsed time: 6 min 51.70 sec
LE647 completed /// Elapsed time: 6 min 7.45 sec
LE699 completed /// Elapsed time: 6 min 44.82 sec
Wall time: 34min 2s


In [33]:
for i in range(len(rat_list)):
    print(f'----- LE{rat_list[i]} result -----')
    for j in params_grid[0]['C']:
        n = sum(best_c[:,i]==j)
        print(f'C={j}: {n}')
    print('===========================')

----- LE600 result -----
C=1e-08: 520
C=1e-07: 0
C=1e-06: 0
C=1e-05: 0
C=0.0001: 0
C=0.01: 478
C=0.1: 2
C=1.0: 0
C=1: 0
----- LE602 result -----
C=1e-08: 440
C=1e-07: 0
C=1e-06: 0
C=1e-05: 0
C=0.0001: 0
C=0.01: 560
C=0.1: 0
C=1.0: 0
C=1: 0
----- LE640 result -----
C=1e-08: 435
C=1e-07: 0
C=1e-06: 0
C=1e-05: 0
C=0.0001: 0
C=0.01: 564
C=0.1: 1
C=1.0: 0
C=1: 0
----- LE647 result -----
C=1e-08: 459
C=1e-07: 0
C=1e-06: 0
C=1e-05: 0
C=0.0001: 0
C=0.01: 507
C=0.1: 34
C=1.0: 0
C=1: 0
----- LE699 result -----
C=1e-08: 497
C=1e-07: 0
C=1e-06: 0
C=1e-05: 0
C=0.0001: 0
C=0.01: 503
C=0.1: 0
C=1.0: 0
C=1: 0
