In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import pandas as pd
import consolidateFiles as cf
import datacleaning as cl
import fragmentation as fr
import hervpd as hp

## ! Parse activity files and parse interval files are to be replaced with the corresponding database queries as soon as they are available 

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [None]:
# Verbosity switch for the pipeline.
verbose = True

# Base data directory; the raw and pre-processed locations are derived from it.
PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
RAW_PATH = f"{PATH}Raw"
PRE_PATH = f"{PATH}PreProcessed"

# Excel workbook that the sessions are read from.
sessfile = f"{PRE_PATH}\\sessions.xlsx"

# Seconds cropped from the beginning of each session, to account for
# stabilization and user adjustment.
crop = 90

# Length, in seconds, of each fragment sent to analysis.
duration = 300

# A fragment with more than `threshold` consecutive seconds without beats is discarded.
threshold = 3

## 1 - Read sessions

---

In [None]:
# Load the sessions workbook and preview five randomly sampled rows.
df = pd.read_excel(sessfile)
df.sample(5)

In [None]:
# Convert the frame to a list with one dict per row, and print the first
# record as a quick sanity check.
sessions = df.to_dict(orient='records')
print(sessions[0])

## 2 - Generate fragments from sessions

Splits each session's duration into fragments

Configurations:
* duration of each fragment in seconds;
* number of seconds to be discarded at the beginning of the session, accounting for user's stabilization and adjustment to posture and activity 

---


In [None]:
# Split the sessions into fragments using the configured fragment duration
# and initial crop (both in seconds).
frags = cf.fragment_sessions(sessions, duration, crop)

In [None]:
# Number of fragments generated, followed by the first fragment as a sample.
print(len(frags))
print(frags[0])

## 3 - Add and clean interval data to fragments
---

### 3.1 Extract beats in fragment

Retrieves from the heartbeat files all the intervals that fall within each fragment's time span and adds them to the fragment objects (in memory)

In [None]:
def fragall(frags, path):
    """Attach the beats of every fragment to the fragment itself, in place.

    Each fragment in `frags` gets an 'rr' entry holding whatever
    `cf.beats_in_fragment(fragment, path)` returns. A progress line
    ("<position> / <total>") is printed for every 100th fragment, starting
    with the first one. Nothing is returned.
    """
    for position, fragment in enumerate(frags):
        at_checkpoint = position % 100 == 0
        if at_checkpoint:
            print(position, '/', len(frags))
        beats = cf.beats_in_fragment(fragment, path)
        fragment['rr'] = beats

In [None]:
# Fill in 'rr' for every fragment using the files under RAW_PATH; %time reports how long it took.
%time fragall(frags, RAW_PATH)

In [None]:
# One row per fragment, plus a 'beatcount' column with the length of its 'rr' list.
df = pd.DataFrame(frags).assign(
    beatcount=lambda frame: frame['rr'].apply(len)
)
df.describe()

### 3.2 Remove outliers from RR series

In [None]:
# Clean every RR series, then refresh the beat count so it reflects the cleaned data.
cleaned_rr = df['rr'].apply(cl.clean_rr_series)
df['rr'] = cleaned_rr
df['beatcount'] = cleaned_rr.apply(len)
df.describe()

### 3.3 Remove fragments with too few beats (due to hardware malfunction or the software not recording beats)

In [None]:
# Keep only fragments holding more than 0.6 beats per second of fragment duration.
min_beats = 0.6 * duration
has_enough_beats = df['beatcount'] > min_beats
df = df.loc[has_enough_beats]
df.describe()

## 4 - Extract time and frequency domain features
---

***TODO*** do it using apply: `df[feature_list] = df.apply(lambda row: pd.Series(aggregate_function(row['rr'])), axis=1)`

In [None]:
# Work on plain records: each one is extended in place with the features
# computed from its own 'rr' series.
dic = df.to_dict(orient='records')
for record in dic:
    features = cf.features_from_dic(record['rr'])
    record.update(features)
# Spot-check one of the computed features on the first record.
print(dic[0]['rmssd'])

In [None]:
# Rebuild the frame from the feature-augmented records and preview three random rows.
df = pd.DataFrame(dic)
df.sample(3)

### Removing HF outliers caused by small gaps between the recorded intervals, to which HF is particularly sensitive

***TODO*** it is best to actually remove the cause by separating continuous sequences in the interval 

In [None]:
# Fragments whose 'hf' value reaches this limit are treated as outliers and dropped.
HF_OUTLIER_LIMIT = 7000

# .copy() makes dfr an independent frame rather than a filtered slice of df, so the
# 'ts' column assigned to dfr later in the notebook does not raise
# SettingWithCopyWarning or depend on view-vs-copy behaviour.
dfr = df[df['hf'] < HF_OUTLIER_LIMIT].copy()
print(len(df), 'original and', len(dfr), 'after pruning')

## 5 - Save

---

In [None]:
# The saved dataset leaves out the 'rr' column; summarise what remains.
df_output = dfr.drop(columns=['rr'])
df_output.describe()

In [None]:
# The workbook name encodes the fragment duration and the crop used to build it.
filename = f"{PRE_PATH}\\df_{duration}_{crop}.xlsx"
print(filename)
df_output.to_excel(filename)

## 6 - Save LDA Grover

---

In [None]:
# Write only the 'activity' column (plus the index) to ./classifications.
dfr.to_csv('./classifications', columns=['activity'])

In [None]:
def get_ints(beats):
    """Return the 'interval' value of every beat in `beats`, in order."""
    intervals = []
    for beat in beats:
        intervals.append(beat['interval'])
    return intervals

# dfr was produced by filtering df, so assigning a column to it directly triggers
# SettingWithCopyWarning; assign() returns a new frame with 'ts' appended instead.
dfr = dfr.assign(ts=dfr['rr'].apply(get_ints))


In [None]:
# Preview three random rows of the 'ts' column. The previous .loc[['ts']] looked up a
# ROW labelled 'ts' and raised KeyError; column selection uses plain [[...]].
dfr[['ts']].sample(3)

In [None]:
# Write only the 'ts' column (plus the index) to ./timeseries.
dfr.to_csv('./timeseries', columns=['ts'])

In [None]:
# Export the selection given by cl.features_all (from the datacleaning module) to ./features.
dfr[cl.features_all].to_csv('./features')

In [None]:
# Summary statistics of the pruned frame.
dfr.describe()

In [None]:
# Number of rows per activity (non-null 'user' entries in each group).
# The previous cell did not parse (missing ':' on the for line) and referenced the
# undefined names `d` and `column`; a single groupby already covers every activity,
# so no loop is needed.
# NOTE(review): assumes the intent was a per-activity count on df - confirm.
df.groupby('activity').count()['user']

# Applying all steps above to generate different datasets

In [None]:
# Alternative value sets, kept commented out for reference:
#durations = [300, 240, 180, 150, 120, 90, 60]
#crops = [120, 90, 60]

# Fragment durations and crops passed to multifrag below; one dataset is
# generated for every combination of the two.
durations = [450, 600]
crops = [90, 60]

def multifrag(sessions, durations, crops, path_in, path_out):
    """Generate and save one fragments dataset per (crop, duration) pair.

    Iterates over `crops` and, for each crop, over `durations`. Every dataset
    is built with `fr.gen_fragments_dataset(sessions, duration, crop, path_in)`
    and written with `to_excel` to `<path_out>\\df_<duration>_<crop>.xlsx`.
    Progress and the resulting record count are printed; nothing is returned.
    """
    for crop_value in crops:
        for frag_duration in durations:
            target = f"{path_out}\\df_{frag_duration}_{crop_value}.xlsx"
            print('generating', target, '...')
            dataset = fr.gen_fragments_dataset(sessions, frag_duration, crop_value, path_in)
            record_count = len(dataset)
            print('resulting dataset:', record_count, 'records')
            dataset.to_excel(target)

In [None]:
# Generate and save a dataset for every crop/duration combination; %time reports how long it took.
%time multifrag(sessions, durations, crops, RAW_PATH, PRE_PATH)