In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import os

import pandas as pd
import consolidateFiles as cf
import datacleaning as cl
import fragmentation as fr
import hervpd as hp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## ! The activity-file and interval-file parsers are to be replaced with the corresponding database queries as soon as those are available

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [3]:
# Pipeline configuration: verbosity flag, input/output locations and fragmenting parameters.
verbose = True

# NOTE(review): machine-specific absolute Windows path; adjust before running on another machine.
PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
RAW_PATH = PATH + "Raw"
PRE_PATH = PATH + "PreProcessed"

# spreadsheet holding the sessions, read from the pre-processed data directory
sessfile = PRE_PATH + "\\sessions.xlsx"
    
# duration (in seconds) to be cropped from the beginning of each session to account for stabilization and user adjustment
crop = 90
    
# duration (in seconds) of each fragment to be sent to analysis
duration = 60
    
# if any fragment has more than 'threshold' consecutive seconds with no beats, it will be discarded
threshold = 3    

## 1 - Read sessions

---

In [6]:
# Load the sessions spreadsheet (path held in `sessfile`) and preview five random rows.
df = pd.read_excel(sessfile)
df.sample(5)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
109,movement,2243,953,63.025625,27.061168,169.874985,2.695332,72.938832,139.069132,434.089166,...,stand,0,14.748668,35.248096,2017-11-01 11:29:55,2017-11-01 11:45:48,425.140328,0,192.239718,109
203,focused-passive,1228,3896,64.892556,13.183472,427.334035,6.585255,86.816528,113.622463,531.393322,...,sit,0,14.030122,42.734077,2018-04-17 21:07:27,2018-04-17 22:12:23,873.852664,0,381.626073,203
279,sleep,1215,1740,1608.040246,54.201058,1358.765741,0.844982,45.798942,56.123092,1087.138272,...,lie,3,69.467283,113.437871,2017-09-30 06:00:00,2017-09-30 06:29:00,5163.786967,2,2196.98098,279
19,sleep,27837,26160,15803.053319,79.900869,3975.271449,0.251551,20.099131,65.975786,955.910048,...,lying down,185,190.212028,203.053604,2017-10-04 00:44:00,2017-10-04 08:00:00,21842.918036,0,2064.593267,19
303,focused-passive,1087,900,802.501594,29.662046,1902.981356,2.371312,70.337954,72.105507,847.437902,...,sit,0,43.855058,107.318572,2017-10-06 11:16:00,2017-10-06 11:31:00,5278.123295,2,2572.640345,303


In [7]:
# Turn the sessions table into a list of dicts, one per row, and show the first record.
sessions = df.to_dict(orient='records')
print(sessions[0])

{'activity': 'focused-active', 'beatscount': 2862, 'duration': 2975, 'hf': 16632.76073930055, 'hfnu': 68.59709571934512, 'lf': 7614.272702685149, 'lf_hf': 0.457787665080388, 'lfnu': 31.40290428065488, 'mhr': 60.34981965901419, 'mrri': 1061.744933612858, 'nn50': 2234, 'notes': nan, 'pnn50': 78.08458580915763, 'posture': 'sitting', 'removed_artifacts': 25, 'rmssd': 239.3168704624504, 'sdnn': 233.7396537299742, 'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 13:15:03'), 'total_power': 29488.44338063871, 'user': 0, 'vlf': 5241.409938653003, 'sess_id': 0}


## 2 - Generate fragments from sessions

Breaks each session's duration into fragments

Configurations:
* duration of each fragment in seconds;
* number of seconds to be discarded at the beginning of the session, accounting for user's stabilization and adjustment to posture and activity 

---


In [None]:
# Split the sessions into fragments using the configured fragment duration and initial crop.
frags = cf.fragment_sessions(sessions, duration, crop)

In [None]:
# Sanity check: number of fragments produced, followed by the first fragment.
print(len(frags))
print(frags[0])

## 3 - Add and clean interval data to fragments
---

### 3.1 Extract beats in fragment

Retrieves from the heartbeat files all the intervals contained in each fragment's duration and adds them to the fragment objects (in memory)

In [None]:
def fragall(frags, path):
    """Attach the beats of every fragment to that fragment, in place.

    For each fragment in `frags`, the result of
    `cf.beats_in_fragment(fragment, path)` is stored under the key 'rr'.
    A progress line "<index> / <total>" is printed for every index that is
    a multiple of 1000 (including index 0).

    Parameters
    ----------
    frags : sequence of dict-like fragments; each one is mutated.
    path : value forwarded unchanged to `cf.beats_in_fragment`.

    Returns
    -------
    None
    """
    for idx, fragment in enumerate(frags):
        if not idx % 1000:
            print(idx, '/', len(frags))
        fragment['rr'] = cf.beats_in_fragment(fragment, path)

In [None]:
%time fragall(frags, RAW_PATH)

In [None]:
# Gather the fragments into a DataFrame and record how many beats each one holds.
df = pd.DataFrame(frags)
df = df.assign(beatcount=df['rr'].apply(len))
df.describe()

### 3.2 Remove outliers from RR series

In [None]:
# Replace each RR series with the output of cl.clean_rr_series, then recompute
# the per-fragment beat count so it reflects the cleaned series.
df['rr'] = df['rr'].apply(cl.clean_rr_series)
df['beatcount'] = df['rr'].apply(len)
df.describe()

### 3.3 Remove fragments with too few beats (due to hardware malfunction or the software not recording beats)

***TODO*** remove using the continuous sequence gaps instead; it is more reliable

In [None]:
# Keep only fragments holding more than 0.83 * duration beats.
# NOTE(review): 0.83 beats per second is roughly 50 bpm; presumably the lowest
# heart rate considered plausible. Confirm and consider naming the constant.
df = df[df['beatcount'] > 0.83 * duration]
df.describe()

## 4 - Extract time and frequency domain features
---

***TODO*** do it using apply: `df[feature_list] = df.apply(lambda row: pd.Series(aggregate_function(row['rr'])), axis=1)`

In [None]:
# Compute the features of every fragment from its RR series and merge them
# into the fragment's own record; then show one value as a quick check.
dic = df.to_dict(orient='records')
for record in dic:
    features = cf.features_from_dic(record['rr'])
    record.update(features)
print(dic[0]['rmssd'])

In [None]:
# Rebuild the DataFrame from the records, which now also carry the extracted features.
df = pd.DataFrame(dic)
df.sample(3)

### removing HF outliers caused by small gaps between the recorded intervals, to which HF is particularly sensitive

***TODO*** see item 3.3 above

In [None]:
# Keep only the fragments whose HF value is below the cut-off.
# The filter used to repeat the same `hf < 7000` test twice; the duplicate
# added nothing and has been removed.
# NOTE(review): if the second test was meant for another column, add it here.
hf_max = 7000
dfr = df[df['hf'] < hf_max]
print(len(df), 'original and', len(dfr), 'after pruning')

## 5 - Save

---

In [None]:
# Drop the 'rr' column (the per-fragment interval series) so that only the
# remaining columns are written out; then summarise what will be saved.
df_output = dfr.drop(['rr'], axis = 1)
df_output.describe()

In [None]:
# Output file name encodes the fragment duration and the crop: df_<duration>_<crop>.xlsx.
# os.path.join replaces the hard-coded '\\' separator so the path is also valid off Windows.
filename = os.path.join(PRE_PATH, 'df_' + str(duration) + '_' + str(crop) + '.xlsx')
print(filename)
df_output.to_excel(filename)

# Applying all steps above to generate different datasets

In [8]:
# Alternative grid, currently disabled (kept for reference):
#durations = [300, 240, 180, 150, 120, 90, 60]
#crops = [120, 90, 60]

# Grid in use: one dataset is generated for every (crop, duration) combination.
durations = [600, 450, 300, 240, 180, 120, 60]
crops = [90, 30]

def multifrag(sessions, durations, crops, path_in, path_out):
    """Generate and save one fragment dataset per (crop, duration) pair.

    For every crop in `crops` (outer loop) and every duration in `durations`
    (inner loop), builds a dataset with
    `fr.gen_fragments_dataset(sessions, duration, crop, path_in)` and writes
    it with `to_excel` to `df_<duration>_<crop>.xlsx` inside `path_out`.
    Progress is reported with `print`.

    Parameters
    ----------
    sessions : forwarded unchanged to `fr.gen_fragments_dataset`.
    durations : iterable of fragment durations; each value is used in the
        file name and forwarded to `fr.gen_fragments_dataset`.
    crops : iterable of crop values, used the same way.
    path_in : input location forwarded to `fr.gen_fragments_dataset`.
    path_out : directory in which the .xlsx files are written.

    Returns
    -------
    None
    """
    for cr in crops:
        for dr in durations:
            # os.path.join instead of a hard-coded '\\': the literal backslash
            # only produced a valid path on Windows.
            fname = os.path.join(path_out, 'df_' + str(dr) + '_' + str(cr) + '.xlsx')
            print('generating', fname, '...')
            ds = fr.gen_fragments_dataset(sessions, dr, cr, path_in)
            print('resulting dataset:', len(ds), 'records')
            ds.to_excel(fname)

In [9]:
%time multifrag(sessions, durations, crops, RAW_PATH, PRE_PATH)

generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_600_90.xlsx ...
378 valid sessions out of 449 total (at least one full fragment of 600 seconds after discarding first 90 seconds)
0 / 2078
1000 / 2078
2000 / 2078
2078 total frags and 1538 kept
resulting dataset: 1538 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_450_90.xlsx ...
402 valid sessions out of 449 total (at least one full fragment of 450 seconds after discarding first 90 seconds)
0 / 2833
1000 / 2833
2000 / 2833
2833 total frags and 2087 kept
resulting dataset: 2087 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_300_90.xlsx ...
412 valid sessions out of 449 total (at least one full fragment of 300 seconds after discarding first 90 seconds)
0 / 4349
1000 / 4349
2000 / 4349
3000 / 4349
4000 / 4349
4349 total frags and 3222 kept
resulting dataset: 3222 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_240_90.xlsx ...
419 valid sessions out

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


11231 total frags and 8457 kept
resulting dataset: 8457 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_60_90.xlsx ...
444 valid sessions out of 449 total (at least one full fragment of 60 seconds after discarding first 90 seconds)
0 / 22699
1000 / 22699
2000 / 22699
3000 / 22699
4000 / 22699
5000 / 22699
6000 / 22699
7000 / 22699
8000 / 22699
9000 / 22699
10000 / 22699
11000 / 22699
12000 / 22699
13000 / 22699
14000 / 22699
15000 / 22699
16000 / 22699
17000 / 22699
18000 / 22699
19000 / 22699
20000 / 22699
21000 / 22699
22000 / 22699


  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

22699 total frags and 17199 kept
resulting dataset: 17199 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_600_30.xlsx ...
389 valid sessions out of 449 total (at least one full fragment of 600 seconds after discarding first 30 seconds)
0 / 2122
1000 / 2122
2000 / 2122
2122 total frags and 1578 kept
resulting dataset: 1578 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_450_30.xlsx ...
405 valid sessions out of 449 total (at least one full fragment of 450 seconds after discarding first 30 seconds)
0 / 2875
1000 / 2875
2000 / 2875
2875 total frags and 2124 kept
resulting dataset: 2124 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_300_30.xlsx ...
419 valid sessions out of 449 total (at least one full fragment of 300 seconds after discarding first 30 seconds)
0 / 4435
1000 / 4435
2000 / 4435
3000 / 4435
4000 / 4435
4435 total frags and 3311 kept
resulting dataset: 3311 records
generating C:\Users\ju\GDrive\Project

  .format(nperseg, input_length))


23147 total frags and 17599 kept
resulting dataset: 17599 records
Wall time: 2h 24min 33s


## (Extra - save files for LDA Grover)

---

In [None]:
# Export the 'activity' column (with the frame's index) as CSV to ./classifications.
dfr[['activity']].to_csv('./classifications')

In [None]:
def get_ints(beats):
    """Return the 'interval' value of every beat, in the original order.

    Parameters
    ----------
    beats : iterable of mappings that each have an 'interval' key.

    Returns
    -------
    list
        One entry per beat; an empty list when `beats` is empty.
    """
    intervals = []
    for beat in beats:
        intervals.append(beat['interval'])
    return intervals

# Add a 'ts' column holding the plain list of intervals of each fragment.
# `assign` builds a new frame instead of writing into `dfr`, which is a
# filtered selection of `df`; the in-place column write triggered pandas'
# SettingWithCopyWarning.
dfr = dfr.assign(ts=dfr['rr'].apply(get_ints))


In [None]:
# Preview three random rows of the 'ts' column.
# `.loc[['ts']]` looked up a *row* labelled 'ts' and raised KeyError; plain
# column selection is what the export of this column also uses.
dfr[['ts']].sample(3)

In [None]:
# Export the 'ts' column (with the frame's index) as CSV to ./timeseries.
dfr[['ts']].to_csv('./timeseries')

In [None]:
# Export the columns listed in cl.features_all as CSV to ./features.
dfr[cl.features_all].to_csv('./features')

In [None]:
dfr.describe()

In [None]:
# Number of records per activity (count of non-null 'user' values in each group).
# The previous version of this cell could not run: the `for` line lacked its
# colon and it referenced the undefined names `d` and `column`. A single
# groupby already yields one count per activity, so no loop is needed.
# NOTE(review): grouping by 'activity' is inferred from the discarded loop; confirm.
df.groupby('activity').count()['user']