In [203]:
import os
import pandas as pd
import random
from scipy.stats import zscore
path = "/Users/jorgerag/Documents/UCSD/courses/capstone/data/photometry_analog"
os.chdir(path)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [159]:
# Import data
beha_data = pd.read_csv("processed/analog_data.csv")
photo_data = pd.read_csv("processed/photometry_data.csv")

In [177]:
# Duration of lever press
def get_duration_lp(beh_dict):
    start_time = 0
    for i in range(0, len(beh_dict)):
        if i != len(beh_dict)-1:
            if beh_dict[i]['lp'] == 1 and start_time == 0:
                start_time = beh_dict[i]['timestamp']
            elif beh_dict[i+1]['lp'] == 0 and start_time != 0:
                beh_dict[i]['lp_start_time'] = start_time
                beh_dict[i]['lp_end_time'] = beh_dict[i+1]['timestamp']
                beh_dict[i]['lp_duration'] = beh_dict[i+1]['timestamp'] - start_time - 20 # correction found in the matlab code
                start_time = 0
    return beh_dict

# Inter Press Interval
def get_ipi(beh_dict):
    start_time = 0
    ipi = float('NaN')
    for i in range(0, len(beh_dict)):
        if i != len(beh_dict)-1:
            if beh_dict[i]['lp'] == 1 and beh_dict[i+1]['lp'] == 0 and start_time == 0:
                start_time = beh_dict[i+1]['timestamp']
                beh_dict[i]['ipi'] = ipi
                ipi = float('NaN')
            elif beh_dict[i]['lp'] == 1 and start_time != 0:
                ipi = beh_dict[i]["timestamp"] - start_time
                start_time = 0
    return beh_dict

# head entry duration
def get_he(beh_dict):
    start_he = 0
    he_duration = 0
    for i in range(0, len(beh_dict)):
        if i != len(beh_dict)-1:
            if "lp_duration" in beh_dict[i]:
                beh_dict[i]["past_he_duration"] = he_duration
                he_duration = 0
            elif beh_dict[i]['he'] == 1 and start_he == 0:
                start_he = beh_dict[i]['timestamp']
            elif beh_dict[i]['he'] == 0 and start_he != 0:
                he_duration = he_duration + (beh_dict[i]['timestamp'] - start_he)
                start_he = 0
    return beh_dict

# reward dummy
def get_rew(beh_dict):
    rew_dummy = 0
    for i in range(0, len(beh_dict)):
        if "lp_duration" in beh_dict[i]:
            beh_dict[i]["rew_dummy"] = rew_dummy
            rew_dummy = 0
        elif beh_dict[i]['rew'] == 1:
            rew_dummy = 1
    return beh_dict

# Get successful lp
def met_lp(x):
    if x["lp_duration"] >= x["threshold"]:
        return 1
    else:
        return 0

# Cleaning lp_duration
def lp_duration_clean(x):
    if x["lp_duration"] < 0:
        return 0
    else:
        return x["lp_duration"]

# Get gcamp before lp
def get_gcamp_total(model_dict):
    for i in range(0, len(model_dict)):
        if i == 0:
            gcamp_df = photo_data[photo_data["timestamp"] < model_dict[i]["timestamp"]]
            duration = float('NaN')
        else:
            gcamp_df = photo_data[(photo_data["timestamp"] <= model_dict[i]["timestamp"]) & (model_dict[i-1]["timestamp"] < photo_data["timestamp"])]
            duration = (model_dict[i]["timestamp"] - model_dict[i-1]["timestamp"])/1000
        model_dict[i]["gcamp_total"] = gcamp_df["gcamp"].sum()
        model_dict[i]["gcamp_total_per_sec"] = gcamp_df["gcamp"].sum() / duration
    return model_dict

# Get gcamp during lp
def get_gcamp_lp(model_dict):
    for i in range(0, len(model_dict)):
        gcamp_df = photo_data[(photo_data["timestamp"]>= model_dict[i]["lp_start_time"]) & (photo_data["timestamp"] <= model_dict[i]["lp_end_time"])]
        model_dict[i]["gcamp_lp"] = gcamp_df["gcamp"].sum()
        model_dict[i]["gcamp_lp_mod"] = gcamp_df["gcamp_mod"].sum()
        model_dict[i]["gcamp_lp_per_sec"] = gcamp_df["gcamp"].sum()/(model_dict[i]["lp_duration"]/1000)
        model_dict[i]["gcamp_lp_mod_per_sec"] = gcamp_df["gcamp_mod"].sum()/(model_dict[i]["lp_duration"]/1000)
    return model_dict

In [179]:
# Get unique subject, threshold and day combinations
unique_obj = beha_data.groupby(["subject", "threshold", "day"]).size().reset_index()
unique_obj = unique_obj.to_dict('records')

# Iterate over every sesion to create final dataframe
final_model_data = pd.DataFrame()
for elem in unique_obj:
    beh_df = beha_data[(beha_data['subject'] == elem["subject"]) & (beha_data['threshold'] == elem["threshold"]) & (beha_data['day'] == elem["day"])]
    photo_df = photo_data[(photo_data['subject'] == elem["subject"]) & (photo_data['threshold'] == elem["threshold"]) & (photo_data['day'] == elem["day"])]

    beh_dict = beh_df.to_dict('records')
    beh_dict = get_duration_lp(beh_dict)
    beh_dict = get_ipi(beh_dict)
    beh_dict = get_he(beh_dict)
    beh_dict = get_rew(beh_dict)

    model_df = pd.DataFrame(beh_dict)
    
    # Filter everything but lp 
    model_df = model_df.loc[pd.notna(model_df['lp_duration'])]
    # LP met
    model_df['lp_duration'] = model_df.apply(lp_duration_clean, axis = 1)
    model_df["lp_met"] = model_df.apply(met_lp, axis=1)

    model_dict = model_df.to_dict('records')
    #model_dict = get_gcamp_total(model_dict)
    model_dict = get_gcamp_lp(model_dict)
    model_df = pd.DataFrame(model_dict)
    model_df['order'] = range(1, len(model_df) + 1)
    model_df = model_df[["order", "subject", "threshold", "day", "lp_duration", "lp_met", "ipi", "past_he_duration", "rew_dummy",  "gcamp_lp", "gcamp_lp_per_sec", "gcamp_lp_mod", "gcamp_lp_mod_per_sec"]]
    final_model_data = pd.concat([final_model_data, model_df])


  model_dict[i]["gcamp_lp_per_sec"] = gcamp_df["gcamp"].sum()/(model_dict[i]["lp_duration"]/1000)
  model_dict[i]["gcamp_lp_mod_per_sec"] = gcamp_df["gcamp_mod"].sum()/(model_dict[i]["lp_duration"]/1000)
  model_dict[i]["gcamp_lp_per_sec"] = gcamp_df["gcamp"].sum()/(model_dict[i]["lp_duration"]/1000)
  model_dict[i]["gcamp_lp_mod_per_sec"] = gcamp_df["gcamp_mod"].sum()/(model_dict[i]["lp_duration"]/1000)


In [183]:
# Get zcores
final_model_data['gcamp_zscore'] = final_model_data.groupby(['subject', 'day']).gcamp_lp.transform(lambda x : zscore(x,ddof=1))
final_model_data['gcamp_mod_zscore'] = final_model_data.groupby(['subject', 'day']).gcamp_lp_mod.transform(lambda x : zscore(x,ddof=1))
final_model_data['gcamp_lp_per_sec_zscore'] = final_model_data.groupby(['subject', 'day']).gcamp_lp_per_sec.transform(lambda x : zscore(x,ddof=1))
final_model_data['gcamp_lp_mod_per_sec_zscore'] = final_model_data.groupby(['subject', 'day']).gcamp_lp_mod_per_sec.transform(lambda x : zscore(x,ddof=1))

In [186]:
final_model_data[['lp_duration', 'lp_met', 'gcamp_lp', 'gcamp_lp_per_sec', 'gcamp_zscore', 'gcamp_mod_zscore', 'gcamp_lp_per_sec_zscore', 'gcamp_lp_mod_per_sec_zscore']].corr()

Unnamed: 0,lp_duration,lp_met,gcamp_lp,gcamp_lp_per_sec,gcamp_zscore,gcamp_mod_zscore,gcamp_lp_per_sec_zscore,gcamp_lp_mod_per_sec_zscore
lp_duration,1.0,0.6014,0.8622,-0.0647,0.6354,0.6412,-0.0996,-0.1076
lp_met,0.6014,1.0,0.5482,-0.0603,0.5489,0.5543,-0.1083,-0.1147
gcamp_lp,0.8622,0.5482,1.0,0.0086,0.6206,0.6143,-0.0337,-0.0591
gcamp_lp_per_sec,-0.0647,-0.0603,0.0086,1.0,-0.0311,-0.0343,0.4729,0.4487
gcamp_zscore,0.6354,0.5489,0.6206,-0.0311,1.0,0.9944,-0.0165,-0.0409
gcamp_mod_zscore,0.6412,0.5543,0.6143,-0.0343,0.9944,1.0,-0.0427,-0.0452
gcamp_lp_per_sec_zscore,-0.0996,-0.1083,-0.0337,0.4729,-0.0165,-0.0427,1.0,0.9343
gcamp_lp_mod_per_sec_zscore,-0.1076,-0.1147,-0.0591,0.4487,-0.0409,-0.0452,0.9343,1.0


In [187]:
final_model_data.describe()

Unnamed: 0,order,subject,threshold,day,lp_duration,lp_met,ipi,past_he_duration,rew_dummy,gcamp_lp,gcamp_lp_per_sec,gcamp_lp_mod,gcamp_lp_mod_per_sec,gcamp_zscore,gcamp_mod_zscore,gcamp_lp_per_sec_zscore,gcamp_lp_mod_per_sec_zscore
count,9637.0,9637.0,9637.0,9637.0,9637.0,9637.0,8970.0,9637.0,9637.0,9637.0,9416.0,9637.0,9416.0,9637.0,9637.0,6738.0,6738.0
mean,526.5334,3646.0884,1600.0,3.2396,905.7302,0.1785,14115.8637,266.5503,0.1643,13011405.541,inf,10776646.5671,inf,0.0,0.0,0.0,-0.0
std,764.7624,474.5952,0.0,2.2248,1717.551,0.3829,53442.2633,1122.7471,0.3705,26388669.32,,21621198.543,,0.9983,0.9983,0.9977,0.9977
min,1.0,3201.0,1600.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1488491.8523,0.0,1488491.8523,-1.2999,-1.2999,-2.7975,-3.0015
25%,74.0,3316.0,1600.0,1.0,12.7552,0.0,73.6128,0.0,0.0,353959.0,12483590.4621,352837.0,12052781.4645,-0.4697,-0.4729,-0.475,-0.4631
50%,158.0,3317.0,1600.0,3.0,442.8992,0.0,1011.7376,0.0,0.0,5036619.0,18112213.9751,4542871.1,14848796.363,-0.1979,-0.1978,-0.1417,-0.1173
75%,520.0,4268.0,1600.0,5.0,1217.0176,0.0,7680.7136,0.0,0.0,15394611.0,30226113.5303,13217323.0,21872217.6141,0.0884,0.0926,0.1931,0.2177
max,2899.0,4269.0,1600.0,8.0,85735.5456,1.0,1330728.32,51026.3552,1.0,910719258.0,inf,910719258.0,inf,40.511,40.511,18.1634,18.2559


In [197]:
final_model_data.to_csv("processed/model_data.csv", index=False)

In [193]:
unique_obj_df = pd.DataFrame(unique_obj)[['subject', 'day']]

In [209]:
# Split to train and test
## Take one trial per subject randomly as test, the rest will be for training
random.seed(123)
test_trials = unique_obj_df.groupby('subject').apply(lambda x: x.sample(1)).reset_index(drop=True)
test_trials

Unnamed: 0,subject,day
0,3201,6
1,3203,2
2,3204,6
3,3315,3
4,3316,6
5,3317,3
6,4268,3
7,4269,7


In [210]:
#Splitting 
keys = list(test_trials.columns.values)
i1 = final_model_data.set_index(keys).index
i2 = test_trials.set_index(keys).index

train_df = final_model_data[~i1.isin(i2)]
test_df = final_model_data[i1.isin(i2)]


In [211]:
len(train_df) + len(test_df) == len(final_model_data)

True

In [212]:
len(test_df)/len(final_model_data)

0.2054581301234824

In [213]:
len(train_df)/len(final_model_data)

0.7945418698765175

In [214]:
train_df.to_csv("processed/train_data.csv", index=False)
test_df.to_csv("processed/test_data.csv", index=False)

#