In [1]:
# !pip install tqdm
# !pip install joblib

In [2]:
%pylab inline

from datetime import datetime
from glob import glob
from os.path import basename, isdir, join, normpath

import numpy as np
import pandas as pd
import pytz
import seaborn as sns
from tqdm import tqdm
# Global plotting defaults: "ticks" style, viridis colormap, embeddable
# (TrueType) fonts in PDF output, and no image interpolation.
plot_rc = {
    'image.cmap': 'viridis',
    'font.size': 40,
    'pdf.fonttype': 42,
    'image.interpolation': 'none',
}
sns.set(style="ticks", rc=plot_rc, font_scale=1.5)
def loadx(f):
    """Read the CSV at path (or file-like object) `f` into a DataFrame."""
    return pd.read_csv(f)

# Legacy `fill_events` helper, disabled by wrapping it in a bare triple-quoted
# string. The string is never executed as code; because it is the last
# expression in this cell, Jupyter echoes it back as the cell's output.
# NOTE(review): a comment in the joining cell says this helper can be removed
# (annotations are forward-filled there instead) — confirm and delete.
'''
def fill_events(df):
    transitions = df[df.start_anno.isnull()==False][['start_anno','stop_anno']]
    transitions.sort(inplace=True)

    # Fill in event types
    df['anno'] = [np.nan]*len(df)
    df.anno[:transitions.index[0]] = transitions.start_anno[0]
    df.anno[transitions.index[-1]:] = transitions.stop_anno[-1]

    # Set the annotations
    for i in range(len(transitions)-1):
        tl,tr = transitions.iloc[i],transitions.iloc[i+1]
        df.anno[tl.name:tr.name] = tl.start_anno
    return df
    '''

Populating the interactive namespace from numpy and matplotlib


"\ndef fill_events(df):\n    transitions = df[df.start_anno.isnull()==False][['start_anno','stop_anno']]\n    transitions.sort(inplace=True)\n\n    # Fill in event types\n    df['anno'] = [np.nan]*len(df)\n    df.anno[:transitions.index[0]] = transitions.start_anno[0]\n    df.anno[transitions.index[-1]:] = transitions.stop_anno[-1]\n\n    # Set the annotations\n    for i in range(len(transitions)-1):\n        tl,tr = transitions.iloc[i],transitions.iloc[i+1]\n        df.anno[tl.name:tr.name] = tl.start_anno\n    return df\n    "

In [3]:
# Root folder whose immediate sub-folders are scanned for patient data.
base_dir_parts = (
    "C:/", "Users", "Owner", "Documents", "Baker Lab",
    "Embrace", "Data", "Data", "half1", "F1",
)
base_dir = join(*base_dir_parts)

In [4]:
# Every immediate sub-directory of `base_dir` is treated as one patient folder.
# glob() returns matches in arbitrary, filesystem-dependent order, so sort them
# to keep the patient order (and therefore the saved output) reproducible.
patient_dirs = sorted(d for d in glob(join(base_dir, '*')) if isdir(d))

Send help: What is patient_dirs supposed to be exactly? A folder? All folders in a larger folder? Where was that larger folder indicated? Is it base_dir?  What is base_dir?

Also, what's the best way to convert it such that it looks at *my* directories?

In [5]:
# `trans_all` is part of the event (annotation) file's name; make sure there is only one such file per patient.
# All the actigraphy files are split into multiple sessions: that's what the joining does: puts them together into one file
# Not putting them together, but keeping them together in the same structure

### Find the annotation & actigraphy files

In [6]:
# Build one metadata record per patient folder that has both an event
# (annotation) file and at least one file of every raw actigraphy kind.
# Folders missing any of these are skipped silently.
RAW_FILE_KINDS = ('acc', 'eda', 'temp', 'report')

metadata = []
for pdir in patient_dirs:
    # Use basename() rather than split("/"): on Windows glob() returns
    # backslash-separated paths, so splitting on "/" would not isolate the
    # patient folder name.
    patient = basename(normpath(pdir))

    event_files = glob(join(pdir, 'redcap/processed/*trans_all*'))
    assert len(event_files) <= 1, "Should only be one summary file"
    if not event_files:
        continue
    event_file = event_files[0]

    # Sorted so that multi-session files are concatenated in a deterministic
    # order later on (glob() order is arbitrary).
    raw_files = {
        kind: sorted(glob(join(pdir, 'actigraphy/raw/*{}*'.format(kind))))
        for kind in RAW_FILE_KINDS
    }
    if not all(raw_files.values()):
        continue

    metadata.append(
        dict(patient=patient,
             event_file=event_file,
             patient_dir=pdir,
             acc_files=raw_files['acc'],
             eda_files=raw_files['eda'],
             temp_files=raw_files['temp'],
             report_files=raw_files['report'])
    )

In [7]:
# Pandas is like excel but better -- convert to pandas asap
# load is just from the beginning: solves typing.

### Load the annotation & actigraphy files

In [8]:
EVENT_COLUMNS = ['patient', 'YMD', 'HMS',
                 'start_anno', 'start_rest', 'stop_anno', 'stop_rest']


def _load_concat(paths):
    """Read every CSV in `paths` with `loadx` and stack the frames row-wise."""
    return pd.concat([loadx(path) for path in paths], axis=0)


def _parse_event_line(line):
    """Parse one data line of the event file into a dict.

    The line is split on single spaces: the first three fields are the
    patient, the date (YMD) and the time (HMS); the remaining fields are
    re-joined into the annotation text. Braces are stripped from the
    annotation, it is split once on ':' into a start part and a stop part,
    and each part is split on ','. The first item of each part is the
    annotation label, the remaining items are kept as a list.

    Raises ValueError if the line has fewer than three fields or the
    annotation contains no ':'.
    """
    fields = line.split(' ')
    patient, ymd, hms = fields[:3]
    anno = ' '.join(fields[3:])
    start, stop = anno.replace('{', '').replace('}', '').split(':', maxsplit=1)
    start_parts = start.split(',')
    stop_parts = stop.split(',')
    return {
        'patient': patient,
        'YMD': ymd,
        'HMS': hms,
        'start_anno': start_parts[0],
        'start_rest': start_parts[1:],
        'stop_anno': stop_parts[0],
        'stop_rest': stop_parts[1:],
    }


for m in tqdm(metadata):

    # Load the actigraphy: each modality may be split across several session
    # files, which are concatenated into a single frame per modality.
    m['accdf'] = _load_concat(m['acc_files'])
    m['edadf'] = _load_concat(m['eda_files'])
    m['tempdf'] = _load_concat(m['temp_files'])
    m['actdf'] = _load_concat(m['report_files'])

    # Load the events: skip the header line, then parse every remaining line.
    events = []
    with open(m['event_file'], 'r') as f:
        next(f, None)  # header line
        for raw_line in f:
            line = raw_line.replace('\n', '')
            if not line.strip():
                # A blank line would otherwise crash the 3-field unpack.
                continue
            events.append(_parse_event_line(line))

    # Passing the columns keeps the schema stable even when no events exist.
    m['eventdf'] = pd.DataFrame(events, columns=EVENT_COLUMNS)

100%|████████████████████████████████████████████| 1/1 [00:03<00:00,  4.00s/it]


### Get the dates right

In [9]:
# Give every frame a timezone-aware DatetimeIndex named 'ts', all expressed
# in the same zone, so that events and sensor samples can be joined on time.
#
# Two pytz pitfalls are avoided here:
#   * Passing a pytz zone as `tzinfo=` (or via `.replace(tzinfo=tz)`) attaches
#     the zone's historical LMT offset (-4:56 for New York) instead of
#     EST/EDT; wall-clock times must go through `tz.localize(...)`.
#   * Device timestamps are UTC (epoch milliseconds / 'Timestamp (UTC)');
#     they must be interpreted as UTC and then *converted* to the target
#     zone, not merely relabelled.
#
# NOTE(review): this assumes the event file's YMD/HMS are New York local
# wall-clock times (the zone attached to them here) — confirm with whoever
# produced the annotations.
tz = pytz.timezone('America/New_York')


def _event_timestamps(ymd, hms, zone):
    """Combine 'Y-M-D' and 'H:M:S' strings into an aware DatetimeIndex.

    Each pair is treated as a wall-clock time in `zone` and localized with
    `zone.localize`. Returns a DatetimeIndex named 'ts' expressed in `zone`.
    """
    aware = []
    for date_str, time_str in zip(ymd, hms):
        year, month, day = map(int, date_str.split('-'))
        hour, minute, second = map(int, time_str.split(':'))
        aware.append(zone.localize(datetime(year, month, day, hour, minute, second)))
    return pd.to_datetime(aware, utc=True).tz_convert(zone).rename('ts')


def _utc_strings_to_index(utc_strings, zone):
    """Parse 'YYYY-mm-dd HH:MM:SS' UTC strings into an index in `zone`."""
    parsed = pd.to_datetime(np.asarray(utc_strings),
                            format='%Y-%m-%d %H:%M:%S', utc=True)
    return parsed.tz_convert(zone).rename('ts')


def _epoch_ms_to_index(epoch_ms, zone):
    """Convert epoch-millisecond values (UTC) into an index in `zone`."""
    parsed = pd.to_datetime(np.asarray(epoch_ms), unit='ms', utc=True)
    return parsed.tz_convert(zone).rename('ts')


for m in tqdm(metadata):

    # Events: local wall-clock date + time columns -> aware index.
    events = m['eventdf']
    event_index = _event_timestamps(events['YMD'], events['HMS'], tz)
    m['eventdf'] = events.drop(['HMS', 'YMD'], axis=1).set_index(event_index)

    # Actigraphy report: UTC timestamp strings -> aware index. The offset
    # column is dropped, as before.
    report = m['actdf']
    report_index = _utc_strings_to_index(report['Timestamp (UTC)'], tz)
    m['actdf'] = (report
                  .drop(['Timestamp (UTC)', 'Timezone offset'], axis=1)
                  .set_index(report_index))

    # Raw sensor streams: epoch milliseconds -> aware index.
    for key in ('edadf', 'accdf', 'tempdf'):
        raw = m[key]
        raw_index = _epoch_ms_to_index(raw['timestamp_milliseconds'], tz)
        m[key] = raw.drop(['timestamp_milliseconds'], axis=1).set_index(raw_index)
    

100%|████████████████████████████████████████████| 1/1 [00:44<00:00, 45.00s/it]


In [10]:
# Shorter, code-friendly names for the actigraphy report columns.
ACT_COLUMN_NAMES = {
    'Acceleration magnitude [normalised by g. The interval is between 0 and 28 g]': 'mean_acc_magnitude',
    'Skin temperature': 'skin_temperature',
    'EDA': 'eda',
    'MET': 'met',
}

for m in metadata:
    m['actdf'] = m['actdf'].rename(columns=ACT_COLUMN_NAMES)

In [11]:
# Derived columns are added here.
# Acceleration magnitude: Euclidean norm of the X, Y, Z accelerometer axes.
for m in tqdm(metadata):
    acc = m['accdf']
    squared_norm = acc['X']**2.0 + acc['Y']**2.0 + acc['Z']**2.0
    acc['acc_magnitude'] = np.sqrt(squared_norm)

100%|████████████████████████████████████████████| 1/1 [00:00<00:00,  1.27it/s]


### Do some joining

In [12]:
# Attach the event annotations to every sensor modality.
#
# For each modality and patient the sensor frame is outer-joined with the
# events on the time index, cropped to [first event, min(last sample,
# last event)], and forward-filled so that every row carries the most recent
# annotation (i.e. what the person was doing until a new label was recorded).
# Note that the forward-fill applies to all columns, not only `anno`.
# The per-patient frames are then stacked into one frame per modality.
mdf = {}
modalities = ['edadf','tempdf','accdf']
for modality in modalities:
    per_patient = []

    for m in tqdm(metadata):
        left = m[modality]                              # sensor samples
        right = m['eventdf'][['patient','start_anno']]  # annotations
        joined = left.join(right, how='outer', sort=True)

        # Crop window. min()/max() are used instead of index[0]/index[-1]
        # so the bounds stay correct even if the concatenated session files
        # (or the events) are not already in chronological order.
        start = right.index.min()
        stop = min(left.index.max(), right.index.max())

        annotated = (joined
                     .loc[start:stop]
                     .rename(columns={'start_anno': 'anno'})
                     .ffill())
        per_patient.append(annotated)

    # Concatenate once per modality instead of growing a frame in the loop.
    mdf[modality] = pd.concat(per_patient, axis=0) if per_patient else pd.DataFrame()

100%|████████████████████████████████████████████| 1/1 [00:00<00:00,  2.59it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00,  5.42it/s]
100%|████████████████████████████████████████████| 1/1 [00:01<00:00,  1.42s/it]


### Save it out

In [13]:
# High level goal: can we predict someone going into a vigorous state before they've done so
# But that might not be the problem to solve: doesn't necessarily mean you need to be restrained
# But we're trying to solve when people need to be restrained
# And note that the annotation accuracies are kind of... eh. We need to be beating "bored people"

In [14]:

# '/Users/Joanna/figure out your own way to find the path...
# Figure out a goal that you want done at a certain date?
# Ask Alex as a resource to check progress

In [15]:
# Preprocessing ends here: persist the per-modality frames with joblib so
# the analysis can continue in another notebook.
from joblib import dump

output_path = r"C:\Users\Owner\Documents\Baker Lab\Embrace\F1.bin"
with open(output_path, "wb") as data_file:
    dump(mdf, data_file)