# Purpose:
- Develop lasso GLM for behavior data (whisker feature and angle)
- using sklearn or pyglmnet?

In [1]:
from pyglmnet import GLM
import numpy as np
import pandas as pd
from importlib import reload
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('default')
import xarray as xr
from utils import lda_angle_discrim as lad
from sklearn.linear_model import LogisticRegression

base_dir = Path(r'E:\TPM\JK\h5')
results_dir = base_dir / 'results'
wf_dir = results_dir / 'touch_whisker_features'
b_dir = Path(r'E:\TPM\JK\SoloData')

expert_mice_df = pd.read_csv(base_dir / 'expert_mice.csv', index_col=0)
use_mice_df = expert_mice_df.loc[expert_mice_df['depth_matched'].astype(bool) & 
                                 ~expert_mice_df['processing_error'].astype(bool) &
                                 ((expert_mice_df.session_type == 'training') |
                                  (expert_mice_df.session_type.str.contains('test')))]
use_volume_df = use_mice_df.query('plane in [1, 5]')
use_volume_df.loc[:, 'volume'] = use_volume_df['plane'].apply(lambda x: 1 if x==1 else 2)
training_volume_df = use_volume_df.query('session_type == "training"')
remove_ind = training_volume_df.query('mouse==27 and session=="15"')
training_volume_df = training_volume_df.drop(remove_ind.index)
remove_ind = training_volume_df.query('mouse==36 and session=="9"')
training_volume_df = training_volume_df.drop(remove_ind.index)

mice = [25,27,30,36,39,52]
test_sessions = [[4,19], [3,8], [3,21], [1,17], [1,23], [3,21]]
naive_sessions = [10, 4, 11, 6, 6, 11]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [2]:
def get_x_y_for_whisker_feature_angle_discrim(wf_dir, b_dir, mouse, session,
                                              touch_window='before_answer'):
    whisker_feature_names = ['theta_onset', 'phi_onset', 'kappaH_onset', 'kappaV_onset',
    'arcLength_onset', 'touch_count', 'delta_theta', 'delta_phi',
    'delta_kappaH', 'delta_kappaV', 'touch_duration', 'slide_distance']

    wf_fn = wf_dir / f'JK{mouse:03}S{session:02}_touch_whisker_features.pkl'
    b_fn = b_dir / f'JK{mouse:03}/JK{mouse:03}S{session:02}_behavior.pkl'
    
    wf_df = pd.read_pickle(wf_fn)
    b_df = pd.read_pickle(b_fn)
    
    if touch_window == 'before_answer':
        wf_df = wf_df.groupby('trialNum').apply(
            lambda x: x.query('touch_offset_time < answer_lick_time')).reset_index(
                drop=True)
    elif touch_window == 'after_answer':
        wf_df = wf_df.groupby('trialNum').apply(
            lambda x: x.query('pole_onset_time >= answer_lick_time')).reset_index(
                drop=True)
    elif touch_window == 'all':
        pass
    else:
        raise ValueError('Invalid touch_window')
    
    wf_mean = wf_df.groupby('trialNum').mean()
    wf_mean['touch_count'] = wf_df.groupby('trialNum').size()
    
    b_df['outcome'] = b_df.apply(lambda x: _get_outcome(x), axis=1)
    b_df['pole_angle'] = b_df['servoAngle']
    
    wf_mean = wf_mean.merge(b_df[['outcome','pole_angle']], on='trialNum')
    # remove rows with NaN values
    wf_mean = wf_mean.dropna()
    
    values = wf_mean.loc[:,whisker_feature_names].values
    values = (values - values.mean(axis=0)) / values.std(axis=0)

    X = xr.DataArray(values,
                    dims=['trialNum', 'whisker_feature'],
                    coords={'trialNum': wf_mean.index.values,
                            'whisker_feature': whisker_feature_names})
    y_temp = wf_mean['pole_angle'].values
    angles = np.sort(np.unique(y_temp))
    y = np.array([np.where(angles==a)[0][0] for a in y_temp])
    
    return X, y


def _get_outcome(x):
    if x.choice == 'm':
        return 'miss'
    elif x.choice == 'l':
        return 'correct' if x.trialType[0] == 'l' else 'wrong'
    elif x.choice == 'r':
        return 'correct' if x.trialType[0] == 'r' else 'wrong'
    else:
        return 'other'

In [45]:
mi = 0
mouse = mice[mi]
sessions = np.sort([int(s) for s in use_mice_df.query('mouse==@mouse and session_type=="training"')['session'].unique()])
si = 0
session = sessions[si]




In [63]:
num_splits = 4
lam_list = np.logspace(-2, 0, 10)
num_split_lam = 4
X, y = get_x_y_for_whisker_feature_angle_discrim(wf_dir, b_dir, mouse, session, 'before_answer')
splits_inds = lad.stratify_random_split(np.arange(len(y)), y, num_splits=num_splits)
split_accuracies = []
best_lam_list = []
coeffs = []
intercepts = []
for si in range(num_splits):
    train_inds = np.concatenate([splits_inds[i] for i in range(num_splits) if i != si])    
    test_inds = splits_inds[si]
    X_train = X[train_inds,:]
    y_train = y[train_inds]
    X_test = X[test_inds,:]
    y_test = y[test_inds]

    lam_splits_inds = lad.stratify_random_split(np.arange(len(y_train)), y_train,
                                            num_splits=num_split_lam)
    split_best_lam_list = []
    for lsi in range(num_split_lam):
        lam_train_inds = np.concatenate([lam_splits_inds[i] for i in range(num_split_lam) if i != lsi])
        X_train_lam = X_train[lam_train_inds,:]
        y_train_lam = y_train[lam_train_inds]
        X_test_lam = X_train[lam_splits_inds[lsi],:]
        y_test_lam = y_train[lam_splits_inds[lsi]]

        lam_accuracies = []
        for lam in lam_list:
            clf = LogisticRegression(penalty='l1', solver='liblinear', C=lam)
            clf.fit(X_train_lam, y_train_lam)
            y_pred = clf.predict(X_test_lam)
            lam_accuracies.append((y_pred == y_test_lam).mean())
        lam_accuracies = np.array(lam_accuracies)
        max_ind = np.argmax(lam_accuracies)
        best_lam = lam_list[max_ind]
        split_best_lam_list.append(best_lam)
    best_lam = np.mean(split_best_lam_list)
    best_lam_list.append(best_lam)
    clf = LogisticRegression(penalty='l1', solver='liblinear', C=best_lam)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    split_accuracies.append((y_pred == y_test).mean())
    coeffs.append(clf.coef_)
    intercepts.append(clf.intercept_)
mean_coeffs = np.mean(np.array(coeffs), axis=0)
mean_intercept = np.mean(np.array(intercepts), axis=0)
mean_accuracy = np.mean(split_accuracies)
classes = clf.classes_
clf = LogisticRegression(penalty='l1', solver='liblinear', C=np.mean(best_lam_list))
clf.classes_ = classes
clf.coef_ = mean_coeffs
clf.intercept_ = mean_intercept
y_pred = clf.predict(X)
final_accuracy = (y_pred == y).mean()

In [74]:
# feature importance
feature_importance = []
for feature in X.whisker_feature.values:
    X_temp = X.copy()
    X_temp.loc[:,feature] = 0
    y_pred_temp = clf.predict(X_temp)
    accuracy_temp = (y_pred_temp == y).mean()
    feature_importance.append((final_accuracy - accuracy_temp) / final_accuracy)


In [75]:
feature_importance = xr.DataArray(feature_importance, dims=['whisker_feature'], coords={'whisker_feature': X.whisker_feature.values})

In [79]:
def get_lasso_logistic_results(mouse, session, wf_dir, b_dir,
                               num_splits=4, num_split_lam=4,
                               lam_list=np.logspace(-2, 0, 10)):
    X, y = get_x_y_for_whisker_feature_angle_discrim(wf_dir, b_dir, mouse, session, 'before_answer')
    splits_inds = lad.stratify_random_split(np.arange(len(y)), y, num_splits=num_splits)
    split_accuracies = []
    best_lam_list = []
    coeffs = []
    intercepts = []
    lam_split_inds_list = []
    for si in range(num_splits):
        train_inds = np.concatenate([splits_inds[i] for i in range(num_splits) if i != si])    
        test_inds = splits_inds[si]
        X_train = X[train_inds,:]
        y_train = y[train_inds]
        X_test = X[test_inds,:]
        y_test = y[test_inds]

        lam_splits_inds = lad.stratify_random_split(np.arange(len(y_train)), y_train,
                                                num_splits=num_split_lam)
        lam_split_inds_list.append(lam_splits_inds)
        split_best_lam_list = []
        for lsi in range(num_split_lam):
            lam_train_inds = np.concatenate([lam_splits_inds[i] for i in range(num_split_lam) if i != lsi])
            X_train_lam = X_train[lam_train_inds,:]
            y_train_lam = y_train[lam_train_inds]
            X_test_lam = X_train[lam_splits_inds[lsi],:]
            y_test_lam = y_train[lam_splits_inds[lsi]]

            lam_accuracies = []
            for lam in lam_list:
                clf = LogisticRegression(penalty='l1', solver='liblinear', C=lam)
                clf.fit(X_train_lam, y_train_lam)
                y_pred = clf.predict(X_test_lam)
                lam_accuracies.append((y_pred == y_test_lam).mean())
            lam_accuracies = np.array(lam_accuracies)
            max_ind = np.argmax(lam_accuracies)
            best_lam = lam_list[max_ind]
            split_best_lam_list.append(best_lam)
        best_lam = np.mean(split_best_lam_list)
        best_lam_list.append(best_lam)
        clf = LogisticRegression(penalty='l1', solver='liblinear', C=best_lam)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        split_accuracies.append((y_pred == y_test).mean())
        coeffs.append(clf.coef_)
        intercepts.append(clf.intercept_)
    mean_coeffs = np.mean(np.array(coeffs), axis=0)
    mean_intercept = np.mean(np.array(intercepts), axis=0)
    mean_accuracy = np.mean(split_accuracies)
    classes = clf.classes_
    clf = LogisticRegression(penalty='l1', solver='liblinear', C=np.mean(best_lam_list))
    clf.classes_ = classes
    clf.coef_ = mean_coeffs
    clf.intercept_ = mean_intercept
    y_pred = clf.predict(X)
    final_accuracy = (y_pred == y).mean()

    # feature importance
    feature_importance = []
    for feature in X.whisker_feature.values:
        X_temp = X.copy()
        X_temp.loc[:,feature] = 0
        y_pred_temp = clf.predict(X_temp)
        accuracy_temp = (y_pred_temp == y).mean()
        feature_importance.append((final_accuracy - accuracy_temp) / final_accuracy)
    feature_importance = xr.DataArray(feature_importance,
                                      dims=['whisker_feature'],
                                      coords={'whisker_feature': X.whisker_feature.values})

    return final_accuracy, feature_importance, classes, mean_coeffs, mean_intercept, mean_accuracy, best_lam_list, splits_inds, lam_split_inds_list



In [81]:
final_accuracy, feature_importance, classes, mean_coeffs, mean_intercept, mean_accuracy, best_lam_list, splits_inds, lam_split_inds_list = \
    get_lasso_logistic_results(mouse, session, wf_dir, b_dir)