In [1]:
%load_ext autoreload
%autoreload 2
import glob
import numpy as np
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm, trange
from sglm.models import sglm
from sglm.features import gen_signal_df as gsd
from sglm.features import build_features as bf
from sglm.features import gen_signal_df as gsd
from sglm.features import build_features as bf

# Signal / channel column names that the per-file processing loop expects
# to find (or create) in every signal table.
y_col_lst_all = [
    'gACH', 'rDA', 'gDA',
    'Ch5', 'Ch6',
    'GP_1', 'GP_2', 'GP_5', 'GP_6',
    'SGP_1', 'SGP_2', 'SGP_5', 'SGP_6',
]

# Load Signal Data
# One glob pattern per animal; matches are concatenated in this order.
raw_signal_patterns = (
    '../../data/raw/GLM_SIGNALS_WT61_*',
    '../../data/raw/GLM_SIGNALS_WT63_*',
    '../../data/raw/GLM_SIGNALS_WT64_*',
)
signal_files = [
    matched_path
    for pattern in raw_signal_patterns
    for matched_path in glob.glob(pattern)
]

# Sessions to skip: any path containing one of these substrings is dropped.
ignore_files = ['WT61_10152021', 'WT61_10082021']
signal_files = [
    candidate
    for candidate in signal_files
    if not any(ignored in candidate for ignored in ignore_files)
]

# Each signal file has a sibling table file that differs only in this name token.
table_files = [path.replace('GLM_SIGNALS', 'GLM_TABLE') for path in signal_files]

# Column renames per animal: raw channel name -> name of the signal on that channel.
# NOTE(review): keys look like tuples of animal-ID prefixes matched against file
# names, and 'empty' presumably marks an unused channel -- confirm against
# bf.get_rename_columns_by_file.
channel_definitions = {
    ('WT61',): {'Ch1': 'gACH', 'Ch2': 'rDA'},
    ('WT64',): {'Ch1': 'gACH', 'Ch2': 'empty'},
    ('WT63',): {'Ch1': 'gDA', 'Ch2': 'empty'},
}
channel_assignments = bf.get_rename_columns_by_file(signal_files, channel_definitions)

# Preprocess every raw signal/table pair and write the results to data/interim as CSV.
for file_num in trange(len(signal_files)):

    # --- Resolve input paths and derive the interim output paths ---
    signal_path = signal_files[file_num]
    table_path = table_files[file_num]

    signal_fn = signal_path.split('/')[-1]
    table_fn = table_path.split('/')[-1]

    signal_filename_out = signal_fn.replace('GLM_SIGNALS', 'GLM_SIGNALS_INTERIM').replace('txt', 'csv')
    table_filename_out = table_fn.replace('GLM_TABLE', 'GLM_TABLE_INTERIM').replace('txt', 'csv')

    signal_path_out = f'../../data/interim/{signal_filename_out}'
    table_path_out = f'../../data/interim/{table_filename_out}'

    # --- Load ---
    # generate_signal_df is handed the two paths and returns both frames, so the
    # files are not pre-read with pd.read_csv here (those results were previously
    # discarded, parsing every file twice).
    signal_df, table_df = gsd.generate_signal_df(signal_path, table_path)

    # Keep only rows assigned to a trial (nTrial > 0) and zero-fill missing values.
    signal_df = signal_df[signal_df['nTrial'] > 0].fillna(0)

    # --- Column naming ---
    signal_df = bf.rename_consistent_columns(signal_df)

    # Apply the per-file channel renames (when this file has any) BEFORE adding
    # placeholder columns, so a placeholder can never collide with a channel
    # that has just been renamed to the same name (which would leave two
    # identically named columns in the frame and in the CSV).
    if signal_fn in channel_assignments:
        signal_df = signal_df.rename(channel_assignments[signal_fn], axis=1)

    # Guarantee every expected signal column exists; absent ones are all-NaN.
    for y_col in y_col_lst_all:
        if y_col not in signal_df.columns:
            signal_df[y_col] = np.nan

    ## Set Full Trial Reward Flags
    # 1.0 on every row of a trial that contains at least one flagged sample, else 0.0.
    # The 'sum' alias is used instead of np.sum to avoid the pandas FutureWarning
    # about NumPy callables in groupby.transform; the result is the same.
    signal_df['r_trial'] = (signal_df.groupby('nTrial')['photometrySideInIndexr'].transform('sum') > 0) * 1.0
    signal_df['nr_trial'] = (signal_df.groupby('nTrial')['photometrySideInIndexnr'].transform('sum') > 0) * 1.0

    ## Define Side Rewarded / Unrewarded Flags
    signal_df = bf.set_port_entry_exit_rewarded_unrewarded_indicators(signal_df)

    ## Define Side Agnostic Events
    signal_df = bf.define_side_agnostic_events(signal_df)

    ## Side port "off" events
    # An "off" event is one that is flagged in the event column (spnr, spxr, ...)
    # but not flagged (== 1) in the corresponding photometry*Index column.
    signal_df['spnrOff'] = ((signal_df['spnr'] == 1) & (signal_df['photometrySideInIndex'] != 1)).astype(int)
    signal_df['spxrOff'] = ((signal_df['spxr'] == 1) & (signal_df['photometrySideOutIndex'] != 1)).astype(int)
    spnnrOff_a = ((signal_df['spnnr'] == 1) & (signal_df['photometrySideInIndex'] != 1)).astype(int)
    spxnrOff_a = ((signal_df['spxnr'] == 1) & (signal_df['photometrySideOutIndex'] != 1)).astype(int)

    # If we have something listed as a rewarded "off" side entry labeled in the table as a side exit... it means it was a fast "out-in".
    # The latter "in" should be considered an unrewarded side port "off" entry.
    dualism_exen = ((signal_df['spnrOff'] == 1) & (signal_df['photometrySideOutIndex'] == 1)).astype(int)

    # Unrewarded side port entries should be the combination of those simply identified by checking spnnr & the table labels +
    # the dualism defined immediately prior. Then those dualism examples should be removed from the "off" rewarded entries.
    signal_df['spnnrOff'] = spnnrOff_a + dualism_exen
    signal_df['spnrOff'] = signal_df['spnrOff'] - dualism_exen

    signal_df['spxnrOff'] = spxnrOff_a

    ## Center port "off" events (same rule as the side port; no dualism correction is applied)
    signal_df['cpnOff'] = ((signal_df['cpn'] == 1) & (signal_df['photometryCenterInIndex'] != 1)).astype(int)
    signal_df['cpxOff'] = ((signal_df['cpx'] == 1) & (signal_df['photometryCenterOutIndex'] != 1)).astype(int)

    # --- Save ---
    # Both output paths are always non-empty strings built above, so the writes are unconditional.
    signal_df.to_csv(signal_path_out, index_label='index')
    table_df.to_csv(table_path_out, index_label='index')
    



('WT61',)
> GLM_SIGNALS_WT61_10042021.txt
> GLM_SIGNALS_WT61_10062021.txt
> GLM_SIGNALS_WT61_10132021.txt
> GLM_SIGNALS_WT61_10182021.txt
> GLM_SIGNALS_WT61_10112021.txt
('WT64',)
> GLM_SIGNALS_WT64_11122021.txt
> GLM_SIGNALS_WT64_11102021.txt
> GLM_SIGNALS_WT64_11082021.txt
> GLM_SIGNALS_WT64_11182021.txt
> GLM_SIGNALS_WT64_11222021.txt
> GLM_SIGNALS_WT64_11162021.txt
('WT63',)
> GLM_SIGNALS_WT63_11122021.txt
> GLM_SIGNALS_WT63_11102021.txt
> GLM_SIGNALS_WT63_11082021.txt
> GLM_SIGNALS_WT63_11182021.txt
> GLM_SIGNALS_WT63_11222021.txt
> GLM_SIGNALS_WT63_11162021.txt


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=17.0), HTML(value='')))

# of iterations 2 — Final max amount of duplicated Center Out Indices: 1
# of iterations 3 — Final max amount of duplicated Center Out Indices: 1
# of iterations 3 — Final max amount of duplicated Center Out Indices: 1
# of iterations 4 — Final max amount of duplicated Center Out Indices: 1
# of iterations 2 — Final max amount of duplicated Center Out Indices: 1
# of iterations 7 — Final max amount of duplicated Center Out Indices: 1
# of iterations 6 — Final max amount of duplicated Center Out Indices: 1
# of iterations 5 — Final max amount of duplicated Center Out Indices: 1
# of iterations 5 — Final max amount of duplicated Center Out Indices: 1
# of iterations 3 — Final max amount of duplicated Center Out Indices: 1
# of iterations 4 — Final max amount of duplicated Center Out Indices: 1
# of iterations 4 — Final max amount of duplicated Center Out Indices: 1
# of iterations 4 — Final max amount of duplicated Center Out Indices: 1
# of iterations 3 — Final max amount of duplicated 

In [2]:
# # df['spnnr'] = ((df['spnnr'] == 1)&(df['photometrySideInIndex'] != 1)).astype(int)
# # df['spxnr'] = ((df['spxnr'] == 1)&(df['photometrySideOutIndex'] != 1)).astype(int)

# X_cols = [_ for _ in X_cols_all if _ not in left_out]

# if len(leave_one_out_list) > 1:
#     run_id = f'{prefix}_{fn}_{y_col}_drop={"_".join(left_out)}'
# else:
#     run_id = f'{prefix}_{fn}_{y_col}'

# dfrel = df.copy()

# dfrel, X_cols_sftd = lpp.timeshift_vals(dfrel, X_cols, neg_order=neg_order, pos_order=pos_order)

# dfrel_setup, dfrel_holdout = holdout_splits(dfrel,
#                                             id_cols=['nTrial'],
#                                             perc_holdout=pholdout)
# dfrel_setup, dfrel_holdout = dfrel_setup.copy(), dfrel_holdout.copy()

# kfold_cv_idx = sglm_ez.cv_idx_by_trial_id(dfrel_setup,
#                                           trial_id_columns=['nTrial'],
#                                           num_folds=folds,
#                                           test_size=pgss)

# prediction_X_cols = [_ for _ in X_cols if _ not in ['nTrial']]
# prediction_X_cols_sftd = [_ for _ in X_cols_sftd if _ not in ['nTrial']]

# X_setup = get_x(dfrel_setup, prediction_X_cols_sftd, keep_rows=None)
# y_setup = get_y(dfrel_setup, y_col, keep_rows=None)
# X_setup_noiti = get_x(dfrel_setup, prediction_X_cols_sftd, keep_rows=dfrel_setup['wi_trial_keep'])
# y_setup_noiti = get_y(dfrel_setup, y_col, keep_rows=dfrel_setup['wi_trial_keep'])
# best_score, best_score_std, best_params, best_model, cv_results = sglm_ez.simple_cv_fit(X_setup, y_setup, kfold_cv_idx, glm_kwarg_lst, model_type='Normal', verbose=0, score_method=score_method)

# sglm_ez.print_best_model_info(X_setup, best_score, best_params, best_model, start)

# X_holdout_witi = get_x(dfrel_holdout, prediction_X_cols_sftd, keep_rows=None)
# y_holdout_witi = get_y(dfrel_holdout, y_col, keep_rows=None)
# X_holdout_noiti = get_x(dfrel_holdout, prediction_X_cols_sftd, keep_rows=dfrel_holdout['wi_trial_keep'])
# y_holdout_noiti = get_y(dfrel_holdout, y_col, keep_rows=dfrel_holdout['wi_trial_keep'])
# glm, holdout_score, holdout_neg_mse_score = sglm_ez.training_fit_holdout_score(X_setup, y_setup, X_holdout_noiti, y_holdout_noiti, best_params)

# dfrel['pred'] = glm.predict(dfrel[prediction_X_cols_sftd])
# dfrel_setup['pred'] = glm.predict(dfrel_setup[prediction_X_cols_sftd])
# dfrel_holdout['pred'] = glm.predict(dfrel_holdout[prediction_X_cols_sftd])

# # Collect
# results_dict[f'{run_id}'] = {'holdout_score':holdout_score,
#                             'holdout_neg_mse_score':holdout_neg_mse_score,
#                             'best_score':best_score,
#                             'best_params':best_params,
#                             'all_models':sorted([(_['cv_R2_score'],
#                                                     _['cv_mse_score'],
#                                                     sglm_ez.calc_l1(_['cv_coefs']),
#                                                     sglm_ez.calc_l2(_['cv_coefs']),
#                                                     _['glm_kwargs']) for _ in cv_results['full_cv_results']], key=lambda x: -x[0])
#                             }

# X_cols_plot = prediction_X_cols
# X_cols_sftd_plot = prediction_X_cols_sftd

# # print('X_setup.columns', list(X_setup.columns), len(list(X_setup.columns)))
# # print('X_setup_noiti.columns', list(X_setup_noiti.columns), len(list(X_setup_noiti.columns)))
# # print('X_holdout_witi.columns', list(X_holdout_witi.columns), len(list(X_holdout_witi.columns)))
# # print('X_holdout_noiti.columns', list(X_holdout_noiti.columns), len(list(X_holdout_noiti.columns)))


# holdout_score_rnd = np.round(holdout_score, 4)
# best_beta_fn = f'{best_coeffs_folder}/{run_id}_best_{all_betas_basename}_R2_{holdout_score_rnd}.png'
# splt.plot_all_beta_coefs(glm.coef_, X_cols_plot,
#                                 X_cols_sftd_plot,
#                                 plot_width=4,
#                                 # plot_width=2,
#                                 y_lims=(-2.5, 2.5),
#                                 # filename=f'{fn}_coeffs.png',
#                                 binsize=54,
#                                 filename=best_beta_fn,
#                                 plot_name=f'Best Coeffs - {run_id} — {best_params}'
#                                 )

# best_beta_fn = f'{best_reconstruct_folder}/{run_id}_best_{avg_reconstruct_basename}_R2_{holdout_score_rnd}.png'




# splt.plot_avg_reconstructions_v2(dfrel_holdout,
# # splt.plot_avg_reconstructions_v2(dfrel,
#                             channel=y_col,
#                             binsize = 54,
#                             plot_width=4,
#                             min_time = -20,
#                             max_time = 30,
#                             min_signal = -3.0,
#                             max_signal = 3.0,
#                             file_name=best_beta_fn,
#                             title=f'Best Average Reconstruction - {run_id} — {best_params}'
#                             )

# for fitted_model_dict in (cv_results['full_cv_results']):
#     fitted_model = fitted_model_dict['model']
#     kwarg_info = "_".join([f"{_k}_{fitted_model_dict['glm_kwargs'][_k]}" for _k in fitted_model_dict["glm_kwargs"]])

#     model_coef = fitted_model.coef_
#     model_intercept = fitted_model.intercept_

#     std_name = f'{run_id}_{kwarg_info}'
#     np.save(f'{all_models_folder}/coeffs/{std_name}_{model_c_basename}.npy', model_coef)
#     np.save(f'{all_models_folder}/intercepts/{std_name}_{model_i_basename}.npy', model_intercept)
    
#     tmp_holdout_score = fitted_model.r2_score(X_holdout_noiti, y_holdout_noiti)

#     glmsave.append_fit_results(y_col, fitted_model_dict["glm_kwargs"], glm_model=fitted_model, dropped_cols=left_out,
#                             scores={
#                                 'tr_witi':fitted_model.r2_score(X_setup, y_setup),
#                                 'tr_noiti':fitted_model.r2_score(X_setup_noiti, y_setup_noiti),
#                                 'gss_witi':fitted_model_dict['cv_R2_score'],
#                                 'gss_noiti':None,
#                                 'holdout_witi':fitted_model.r2_score(X_holdout_witi, y_holdout_witi),
#                                 'holdout_noiti':fitted_model.r2_score(X_holdout_noiti, y_holdout_noiti)
#                             },
#                             gssids=kfold_cv_idx)

#     tmp = dfrel_holdout.set_index('nTrial').copy()
#     tmp['pred'] = fitted_model.predict(get_x(dfrel_holdout, prediction_X_cols_sftd, keep_rows=None))
#     tmp = lpp.get_first_entry_time(tmp)
#     tmp_y = get_y(dfrel_holdout, y_col, keep_rows=None).copy()
#     tmp_y.index = tmp.index
#     tmp[y_holdout_noiti.name] = tmp_y

#     tmp.to_csv(f'{all_data_folder}/{std_name}_{tmp_data_basename}.csv')

#     holdout_score_rnd = np.round(tmp_holdout_score, 4)


#     splt.plot_all_beta_coefs(fitted_model.coef_, X_cols_plot,
#                                     X_cols_sftd_plot,
#                                     plot_width=4,
#                                     y_lims=(-3.0, 3.0),
#                                     # filename=f'{fn}_coeffs.png',
#                                     binsize=54,
#                                     filename=f'{all_coeffs_folder}/{std_name}_{all_betas_basename}_R2_{holdout_score_rnd}.png',
#                                     plot_name=f'Coeffs by Timeshift - {run_id} — {kwarg_info}'
#                                     # plot_name=f'{fn} — {y_col} — {kwarg_info}'
#                                     )
    
#     plt.close('all')
# plt.close('all')


# glmsave.save()

In [3]:
# t = df[['nTrial', 'r_trial', 'nr_trial', 'photometrySideInIndexr', 'photometrySideInIndexnr', 'photometryCenterInIndex']]
# t.loc[2295:2345]

In [4]:
# df