In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import pandas as pd
import consolidateFiles as cf
import datacleaning as cl
import fragmentation as fr
import hervpd as hp

## ! Parse activity files and parse interval files are to be replaced with the corresponding database queries as soon as they are available 

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [2]:
verbose = True

PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
RAW_PATH = PATH + "Raw"
PRE_PATH = PATH + "PreProcessed"

sessfile = PRE_PATH + "\\sessions.xlsx"
    
# duration (in seconds) to be cropped from the beginning of each second to account for stabilization and user adjustment
crop = 120
    
# duration (in seconds) of each fragment to be sent to analysis
duration = 600
    
# if any fragment has more than 'threshold' consecutive seconds with no beats, it will be discarded
threshold = 3    

## 1 - Read sessions

---

In [3]:
df = pd.read_excel(sessfile)
df.sample(5)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
220,leisure,6363,5700,1487.846968,43.388818,1941.255326,1.304741,56.611182,69.188192,880.517523,...,sit,18,71.337581,101.138014,2017-10-30 00:35:40,2017-10-30 02:10:40,5411.408348,1,1982.306055,220
20,rest-active,560,377,21894.297153,72.21408,8424.301528,0.384771,27.78592,100.126485,656.535714,...,sitting,57,234.657483,189.956239,2017-10-04 10:33:19,2017-10-04 10:39:36,33795.884647,0,3477.285966,20
210,focused-active,2912,1770,177.795281,24.648771,543.519704,3.056997,75.351229,117.859189,517.643201,...,sit,2,22.789939,68.210886,2018-05-06 18:58:53,2018-05-06 19:28:23,1223.716477,0,502.401491,210
367,leisure,1009,900,594.17441,35.984121,1057.038398,1.779004,64.015879,66.477347,910.994054,...,sit,0,43.235661,82.401504,2017-10-26 21:16:00,2017-10-26 21:31:00,3044.455597,2,1393.242789,367
117,household-chores,3789,2371,1081.916202,58.147454,778.726224,0.719766,41.852546,104.726705,589.499076,...,stand,85,92.168228,90.76841,2017-11-05 15:00:44,2017-11-05 15:40:15,2380.411983,0,519.769557,117


In [4]:
sessions = df.to_dict(orient='records')
print(sessions[0])

{'activity': 'focused-active', 'beatscount': 2862, 'duration': 2975, 'hf': 16632.76073930055, 'hfnu': 68.59709571934512, 'lf': 7614.272702685149, 'lf_hf': 0.457787665080388, 'lfnu': 31.40290428065488, 'mhr': 60.34981965901419, 'mrri': 1061.744933612858, 'nn50': 2234, 'notes': nan, 'pnn50': 78.08458580915763, 'posture': 'sitting', 'removed_artifacts': 25, 'rmssd': 239.3168704624504, 'sdnn': 233.7396537299742, 'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 13:15:03'), 'total_power': 29488.44338063871, 'user': 0, 'vlf': 5241.409938653003, 'sess_id': 0}


## 2 - Generate fragments from sessions

Breaks the sessions duration in fragments

Configurations:
* duration of each fragment in seconds;
* number of seconds to be discarded at the beginning of the session, accounting for user's stabilization and adjustment to posture and activity 

---


In [5]:
frags = cf.fragment_sessions(sessions, duration, crop)

376 valid sessions out of 450 total (at least one full fragment of 600 seconds after discarding first 120 seconds)


In [6]:
print(len(frags))
print(frags[0])

2050
{'start': Timestamp('2017-09-29 12:27:28'), 'stop': Timestamp('2017-09-29 12:37:28'), 'activity': 'focused-active', 'posture': 'sitting', 'user': 0, 'sess': 0, 'order': 0}


## 3 - Add and clean interval data to fragments
---

### 3.1 Extract beats in fragment

Retrieves from the heartbeat files all the intervals contained in each session's duration and adds them to the fragments objects (in memory)

In [7]:
def fragall(frags, path):
    for i, f in enumerate(frags):
        if (i % 1000 == 0):
            print (i, '/', len(frags))
        f['rr'] = cf.beats_in_fragment(f, path)

In [8]:
%time fragall(frags, RAW_PATH)

0 / 2050
1000 / 2050
2000 / 2050
Wall time: 2min 54s


In [9]:
df = pd.DataFrame(frags)
df['beatcount'] = df['rr'].apply(len)
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,2050.0,2050.0,2050.0,2050.0
mean,11.816098,200.081463,1.189756,745.574634
std,13.683048,139.962341,1.906705,260.301221
min,0.0,0.0,0.0,0.0
25%,1.0,68.0,0.0,625.0
50%,5.0,169.0,0.0,744.0
75%,20.0,317.75,2.0,844.75
max,56.0,449.0,6.0,1811.0


### 3.2 Remove outliers from RR series

In [10]:
df['rr'] = df['rr'].apply(cl.clean_rr_series)
df['beatcount'] = df['rr'].apply(len)
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,2050.0,2050.0,2050.0,2050.0
mean,11.816098,200.081463,1.189756,741.391707
std,13.683048,139.962341,1.906705,258.21902
min,0.0,0.0,0.0,0.0
25%,1.0,68.0,0.0,622.0
50%,5.0,169.0,0.0,742.0
75%,20.0,317.75,2.0,839.75
max,56.0,449.0,6.0,1811.0


### 3.3 Remove fragments with too few beats (due to hardware malfunction or software was not recording beats)

***TODO*** remove using the continuous sequence gaps instead, it is more reliable

In [11]:
df = df[df['beatcount'] > 0.83 * duration]
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,1914.0,1914.0,1914.0,1914.0
mean,11.376698,197.907524,1.150993,783.738767
std,13.354145,138.968342,1.869412,204.554063
min,0.0,0.0,0.0,500.0
25%,1.0,68.0,0.0,647.0
50%,5.0,169.0,0.0,754.0
75%,19.0,309.75,2.0,848.0
max,56.0,449.0,6.0,1811.0


## 4 - Extract time and frequency domain features
---

***TODO*** do it using apply: df[feature_list] = df.apply(lambda row: pd.Series(aggregate_function(row['rr']), axis=1)

In [12]:
dic = df.to_dict(orient='records')
for i in dic:
    i.update(cf.features_from_dic(i['rr']))
print(dic[0]['rmssd'])

296.586501388


In [13]:
df = pd.DataFrame(dic)
df.sample(3)

Unnamed: 0,activity,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,...,posture,rmssd,rr,sdnn,sess,start,stop,total_power,user,vlf
588,sleep,929,799.011872,60.181465,528.65916,0.661641,39.818535,95.43209,656.828848,58,...,lie,43.235235,"[{'date': 2017-10-31 19:45:20, 'interval': 731...",126.95093,102,2017-10-31 19:45:19,2017-10-31 19:55:19,2068.329593,0,740.658562
724,leisure,897,193.015429,23.203082,638.837114,3.309772,76.796918,88.242539,683.944259,34,...,sit,26.35485,"[{'date': 2017-11-15 14:20:01, 'interval': 723...",50.316201,143,2017-11-15 14:20:00,2017-11-15 14:30:00,1589.436861,0,757.584318
1759,movement,921,68.308306,28.294596,173.109902,2.534244,71.705404,90.231746,666.155266,8,...,stand,16.309906,"[{'date': 2017-12-27 15:55:01, 'interval': 630...",28.353721,426,2017-12-27 15:55:00,2017-12-27 16:05:00,541.134305,5,299.716097


### removing HF outliers caused by small gaps between the recorded intervals, to which HF is particularly sensitive

***TODO*** see item 3.3 above

In [14]:
dfr = df[(df['hf'] < 7000) & (df['hf'] < 7000)]
print(len(df), 'original and', len(dfr), 'after pruning')

1914 original and 1510 after pruning


## 5 - Save

---

In [15]:
df_output = dfr.drop(['rr'], axis = 1)
df_output.describe()

Unnamed: 0,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,order,pnn50,rmssd,sdnn,sess,total_power,user,vlf
count,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0,1510.0
mean,820.777483,1306.446175,46.244696,1417.784507,1.758384,53.755304,80.977161,784.319377,137.088079,7.862914,18.656947,56.7903,77.758846,224.22649,3985.868488,1.421192,1261.637806
std,208.646271,1414.291053,19.075609,2029.301009,1.710714,19.075609,18.49872,158.613543,100.942777,10.119584,14.921384,33.346825,35.398415,138.391763,3818.409276,1.974678,1273.711191
min,500.0,0.2966,7.268512,3.574621,0.120357,10.742739,49.159341,338.323026,0.0,0.0,0.0,1.836384,9.315941,6.0,20.511915,0.0,6.154952
25%,707.0,367.218345,30.680889,467.242337,0.629621,38.636035,70.933423,699.340104,51.0,1.0,6.090452,31.856955,52.289845,107.0,1486.644426,0.0,490.598371
50%,780.0,798.073554,46.467396,1000.557148,1.152047,53.532604,77.790417,781.69699,123.5,3.0,15.567908,48.62279,71.521238,196.0,3003.198015,0.0,888.934096
75%,871.0,1689.858098,61.363965,1778.248545,2.25937,69.319111,86.841409,853.681407,202.0,12.0,27.701604,74.008152,94.48322,367.75,5060.444885,2.0,1581.358851
max,1811.0,6984.772837,89.257261,25723.146082,12.757975,92.731488,177.430513,1225.612,524.0,56.0,68.6,182.535075,241.4624,449.0,34344.333719,6.0,11394.045463


In [16]:
filename = PRE_PATH + '\\df_' + str(duration) + '_' + str(crop) + '.xlsx'
print(filename)
df_output.to_excel(filename)

C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_600_120.xlsx


# Applying all steps above to generate different datasets

In [17]:
#durations = [300, 240, 180, 150, 120, 90, 60]
#crops = [120, 90, 60]

durations = [600, 450, 300, 240, 180, 120, 60]
crops = [90, 30]

def multifrag(sessions, durations, crops, path_in, path_out):
    for cr in crops:
        for dr in durations:
            fname = path_out + '\\df_' + str(dr) + '_' + str(cr) + '.xlsx'
            print ('generating', fname, '...')
            ds = fr.gen_fragments_dataset(sessions, dr, cr, path_in)
            print('resulting dataset:', len(ds), 'records' )
            ds.to_excel(fname)

In [18]:
%time multifrag(sessions, durations, crops, RAW_PATH, PRE_PATH)

generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_600_90.xlsx ...
379 valid sessions out of 450 total (at least one full fragment of 600 seconds after discarding first 90 seconds)
0 / 2064
1000 / 2064
2000 / 2064
2064 total frags and 1542 kept
resulting dataset: 1542 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_450_90.xlsx ...
403 valid sessions out of 450 total (at least one full fragment of 450 seconds after discarding first 90 seconds)
0 / 2814
1000 / 2814
2000 / 2814
2814 total frags and 2092 kept
resulting dataset: 2092 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_300_90.xlsx ...
413 valid sessions out of 450 total (at least one full fragment of 300 seconds after discarding first 90 seconds)
0 / 4321
1000 / 4321
2000 / 4321
3000 / 4321
4000 / 4321
4321 total frags and 3230 kept
resulting dataset: 3230 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_240_90.xlsx ...
420 valid sessions out

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


11162 total frags and 8479 kept
resulting dataset: 8479 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_60_90.xlsx ...
445 valid sessions out of 450 total (at least one full fragment of 60 seconds after discarding first 90 seconds)
0 / 22562
1000 / 22562
2000 / 22562
3000 / 22562
4000 / 22562
5000 / 22562
6000 / 22562
7000 / 22562
8000 / 22562
9000 / 22562
10000 / 22562
11000 / 22562
12000 / 22562
13000 / 22562
14000 / 22562
15000 / 22562
16000 / 22562
17000 / 22562
18000 / 22562
19000 / 22562
20000 / 22562
21000 / 22562
22000 / 22562


  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

22562 total frags and 17244 kept
resulting dataset: 17244 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_600_30.xlsx ...
390 valid sessions out of 450 total (at least one full fragment of 600 seconds after discarding first 30 seconds)
0 / 2108
1000 / 2108
2000 / 2108
2108 total frags and 1582 kept
resulting dataset: 1582 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_450_30.xlsx ...
406 valid sessions out of 450 total (at least one full fragment of 450 seconds after discarding first 30 seconds)
0 / 2856
1000 / 2856
2000 / 2856
2856 total frags and 2129 kept
resulting dataset: 2129 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_300_30.xlsx ...
420 valid sessions out of 450 total (at least one full fragment of 300 seconds after discarding first 30 seconds)
0 / 4407
1000 / 4407
2000 / 4407
3000 / 4407
4000 / 4407
4407 total frags and 3319 kept
resulting dataset: 3319 records
generating C:\Users\ju\GDrive\Project

  .format(nperseg, input_length))


23011 total frags and 17645 kept
resulting dataset: 17645 records
Wall time: 2h 34min


## ( Extra - save files for LDA Grover)

---

In [19]:
dfr[['activity']].to_csv('./classifications')

In [20]:
def get_ints(beats):
    return [beat['interval'] for beat in beats]

dfr['ts'] = dfr['rr'].apply(get_ints)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [21]:
dfr.loc[['ts']].sample(3)

KeyError: "None of [['ts']] are in the [index]"

In [None]:
dfr[['ts']].to_csv('./timeseries')

In [None]:
dfr[cl.features_all].to_csv('./features')

In [None]:
dfr.describe()

In [None]:
for a in d.activity.unique()
       df.groupby(column).count()['user']