# Labeling the Data

This notebook labels each second of each raw GSR stream as a 1 if it includes a peak or with a 0 if it doesn't.

In [36]:
from datetime import datetime, time, timedelta

import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [37]:
pd.set_option('display.max_rows', 500)

# Get Peak Data

In [252]:
def preprocess_frame(df):
    return df.drop(columns=['AlgorithmName', 'Respondent', 'Markers'])

def ingest_dir_as_dataframes(the_dir):
    '''
    ingests a directory of .csv files and returns them as a list of dataframes
    '''
    path = the_dir
    all_files = glob.glob(path + "/*.txt")
    the_frames = []
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, sep='\t')
        df = preprocess_frame(df)
        the_frames.append(df)
    return the_frames

In [253]:
frames = ingest_dir_as_dataframes('../../data/raw/Peaks/GSRPeaks/')
peaks = pd.concat(frames)
peaks.head()

Unnamed: 0,Segmentation,Stimulus,OnsetMs,PeakMs,OffsetMs,Amplitude,StimulusPeakSequenceNr,Unnamed: 10,TotalPeakSequenceNr,Unnamed: 11
0,D4S1,Forest,18461,20575,23759,0.468,1,0.111429,,
1,D4S1,Forest,30715,31495,32992,0.06,2,,,
2,D4S1,Forest,35438,35886,38369,0.0165,3,,,
3,D4S1,Forest,51013,53057,55676,0.1035,4,,,
4,D4S1,Forest,114536,115738,120750,0.021,5,,,


## Cleaning

Discard corrupted D1D2 data.

In [254]:
peaks = peaks[peaks['Segmentation'] != 'D1D2']

## Wrangling

In [255]:
# align peak col names with GSR col names
peaks['Segmentation'] = peaks['Segmentation'].apply(lambda x: x + 'P2')
peaks.head()

Unnamed: 0,Segmentation,Stimulus,OnsetMs,PeakMs,OffsetMs,Amplitude,StimulusPeakSequenceNr,Unnamed: 10,TotalPeakSequenceNr,Unnamed: 11
0,D4S1P2,Forest,18461,20575,23759,0.468,1,0.111429,,
1,D4S1P2,Forest,30715,31495,32992,0.06,2,,,
2,D4S1P2,Forest,35438,35886,38369,0.0165,3,,,
3,D4S1P2,Forest,51013,53057,55676,0.1035,4,,,
4,D4S1P2,Forest,114536,115738,120750,0.021,5,,,


In [256]:
peaks = peaks.replace('1 37988', '37988')

In [257]:
peaks['PeakMs'] = peaks['PeakMs'].astype(int)
peaks = peaks[peaks['PeakMs'] >= 0 ]

In [258]:
biomes = {}
for biome in ['Stream', 'Ocean', 'Mountain', 'Forest']:
    print(biome)
    biome_df = peaks[peaks['Stimulus'] == biome]
    biome_df = biome_df.set_index('PeakMs').sort_index()
    biomes[biome] = biome_df

Stream
Ocean
Mountain
Forest


### convert peakMs to Python datetime

In [259]:
peaks['PeakMs'] = pd.to_datetime(peaks['PeakMs'])

In [260]:
peaks.head()

Unnamed: 0,Segmentation,Stimulus,OnsetMs,PeakMs,OffsetMs,Amplitude,StimulusPeakSequenceNr,Unnamed: 10,TotalPeakSequenceNr,Unnamed: 11
0,D4S1P2,Forest,18461,1970-01-01 00:00:00.000020575,23759,0.468,1,0.111429,,
1,D4S1P2,Forest,30715,1970-01-01 00:00:00.000031495,32992,0.06,2,,,
2,D4S1P2,Forest,35438,1970-01-01 00:00:00.000035886,38369,0.0165,3,,,
3,D4S1P2,Forest,51013,1970-01-01 00:00:00.000053057,55676,0.1035,4,,,
4,D4S1P2,Forest,114536,1970-01-01 00:00:00.000115738,120750,0.021,5,,,


### Wrangle GSR Response Data

In [261]:
gsr = pd.read_csv('../../data/intermediary/normedGSRaudioSensorData.csv', index_col='Timestamp')
gsr.index = pd.to_datetime(gsr.index)

In [262]:
gsr.columns

Index(['D5S2P2', 'D5S1P2', 'D7P1', 'D10P1', 'D4S1P2', 'D3S2P2', 'D6S2P1',
       'D9S2P1', 'D8P2', 'D9S1P1', 'D4S2P2', 'D3S1P2', 'D6S1P1', 'D4S2P1',
       'D3S1P1', 'D6S1P2', 'D9S1P2', 'D9S2P2', 'D8P1', 'D4S1P1', 'D3S2P1',
       'D6S2P2', 'D7P2', 'D10P2', 'D5S1P1', 'D5S2P1'],
      dtype='object')

In [263]:
gsr.shape

(27261, 26)

In [264]:
gsr.head()

Unnamed: 0_level_0,D5S2P2,D5S1P2,D7P1,D10P1,D4S1P2,D3S2P2,D6S2P1,D9S2P1,D8P2,D9S1P1,...,D9S1P2,D9S2P2,D8P1,D4S1P1,D3S2P1,D6S2P2,D7P2,D10P2,D5S1P1,D5S2P1
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-27 00:00:00.000,0.995511,0.974168,0.184426,0.0,0.675496,0.995275,1.0,0.996656,0.965902,0.923713,...,0.730392,0.237066,0.429648,0.721657,0.693862,0.0,0.949416,0.405239,1.0,0.996313
2019-02-27 00:00:00.025,0.995735,0.974602,0.184174,0.001481,0.679067,0.995275,0.998031,1.0,0.965902,0.92811,...,0.733543,0.236243,0.427136,0.721356,0.69559,0.002288,0.948473,0.405239,0.999554,0.996928
2019-02-27 00:00:00.050,0.99596,0.975035,0.183922,0.001111,0.683462,0.997637,0.997232,0.998328,0.965246,0.932507,...,0.736695,0.23542,0.426382,0.721054,0.69559,0.001144,0.94753,0.405239,0.999108,0.996313
2019-02-27 00:00:00.075,0.996184,0.975469,0.18367,0.000741,0.68566,1.0,0.996432,0.996656,0.965464,0.936904,...,0.739846,0.234939,0.425628,0.720752,0.69559,0.002288,0.946586,0.405239,0.998662,0.996313
2019-02-27 00:00:00.100,0.996409,0.975903,0.183418,0.00037,0.697966,0.998819,0.995632,0.996656,0.965246,0.941301,...,0.742997,0.234457,0.424874,0.72045,0.69559,0.002288,0.945172,0.40483,0.999554,0.996518


#### Resolve Peak Segmentation Subject Names with GSR Subject Names

_"The left person is always P2 (person 2), and they were always wearing the GSR device 92B2."_

'D4S1P2',
 'D6S1P2',
 'D3S2P2',
 'D10P2',
 'D8P2',
 'D9S2P2',
 'D5S2P2',
 'D9S1P2',
 'D5S1P2',
 'D7P2',
 'D6S2P2',
 'D3S1P2',
 'D1D2P2',
 'D4S2P2']

In [265]:
keep_cols = [col for col in gsr.columns if 'P2' in col]
sorted(keep_cols)

['D10P2',
 'D3S1P2',
 'D3S2P2',
 'D4S1P2',
 'D4S2P2',
 'D5S1P2',
 'D5S2P2',
 'D6S1P2',
 'D6S2P2',
 'D7P2',
 'D8P2',
 'D9S1P2',
 'D9S2P2']

In [266]:
gsr = gsr[keep_cols]
gsr.head()

Unnamed: 0_level_0,D5S2P2,D5S1P2,D4S1P2,D3S2P2,D8P2,D4S2P2,D3S1P2,D6S1P2,D9S1P2,D9S2P2,D6S2P2,D7P2,D10P2
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-02-27 00:00:00.000,0.995511,0.974168,0.675496,0.995275,0.965902,0.937234,0.62614,0.62529,0.730392,0.237066,0.0,0.949416,0.405239
2019-02-27 00:00:00.025,0.995735,0.974602,0.679067,0.995275,0.965902,0.940087,0.452888,0.62413,0.733543,0.236243,0.002288,0.948473,0.405239
2019-02-27 00:00:00.050,0.99596,0.975035,0.683462,0.997637,0.965246,0.944081,0.476596,0.62413,0.736695,0.23542,0.001144,0.94753,0.405239
2019-02-27 00:00:00.075,0.996184,0.975469,0.68566,1.0,0.965464,0.950928,0.500304,0.62471,0.739846,0.234939,0.002288,0.946586,0.405239
2019-02-27 00:00:00.100,0.996409,0.975903,0.697966,0.998819,0.965246,0.954922,0.524012,0.62471,0.742997,0.234457,0.002288,0.945172,0.40483


### Wrangle PeakMs Column of Peaks

In [267]:
peaks.head()

Unnamed: 0,Segmentation,Stimulus,OnsetMs,PeakMs,OffsetMs,Amplitude,StimulusPeakSequenceNr,Unnamed: 10,TotalPeakSequenceNr,Unnamed: 11
0,D4S1P2,Forest,18461,1970-01-01 00:00:00.000020575,23759,0.468,1,0.111429,,
1,D4S1P2,Forest,30715,1970-01-01 00:00:00.000031495,32992,0.06,2,,,
2,D4S1P2,Forest,35438,1970-01-01 00:00:00.000035886,38369,0.0165,3,,,
3,D4S1P2,Forest,51013,1970-01-01 00:00:00.000053057,55676,0.1035,4,,,
4,D4S1P2,Forest,114536,1970-01-01 00:00:00.000115738,120750,0.021,5,,,


### Learn to Compare....

In [268]:
peaks['PeakMs'].iloc[0].time() < gsr.index[0].time()

False

In [269]:
peaks['PeakMs'].iloc[1].time()

datetime.time(0, 0, 0, 31)

In [270]:
gsr.index[5]

Timestamp('2019-02-27 00:00:00.125000')

In [271]:
gsr.index[5] + timedelta(milliseconds=25) # days, seconds, then other fields.

Timestamp('2019-02-27 00:00:00.150000')

### Wrangle GSR

For each row in gsr[timestamp], treat the value as the range_bottom and check every value in peaks['PeakMs'] for possible membership in the timestamp's 25 ms interval.

In [272]:
col_name = gsr.columns[1]
col_name

'D5S1P2'

In [273]:
col = gsr[col_name]
col.head()

Timestamp
2019-02-27 00:00:00.000    0.974168
2019-02-27 00:00:00.025    0.974602
2019-02-27 00:00:00.050    0.975035
2019-02-27 00:00:00.075    0.975469
2019-02-27 00:00:00.100    0.975903
Name: D5S1P2, dtype: float64

In [274]:
base_series = pd.DataFrame(col)

In [275]:
base_series.head()

Unnamed: 0_level_0,D5S1P2
Timestamp,Unnamed: 1_level_1
2019-02-27 00:00:00.000,0.974168
2019-02-27 00:00:00.025,0.974602
2019-02-27 00:00:00.050,0.975035
2019-02-27 00:00:00.075,0.975469
2019-02-27 00:00:00.100,0.975903


### Wrangle Peaks

#### Reframe Peaks as a Time Series

In [276]:
peaks = peaks.set_index('PeakMs')

In [277]:
peaks['Amplitude'].describe()

count    1184.000000
mean        0.106061
std         0.191567
min         0.005014
25%         0.015497
50%         0.043500
75%         0.119223
max         3.277800
Name: Amplitude, dtype: float64

In [278]:
peaks = peaks[peaks['Amplitude'] >= .119223]

In [279]:
peaks.head()

Unnamed: 0_level_0,Segmentation,Stimulus,OnsetMs,OffsetMs,Amplitude,StimulusPeakSequenceNr,Unnamed: 10,TotalPeakSequenceNr,Unnamed: 11
PeakMs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1970-01-01 00:00:00.000020575,D4S1P2,Forest,18461,23759,0.468,1,0.111429,,
1970-01-01 00:00:00.000051750,D4S1P2,Mountain,47885,53748,0.882,2,,,
1970-01-01 00:00:00.000024758,D4S1P2,Stream,23232,27109,0.174,2,,,
1970-01-01 00:00:00.000071525,D4S1P2,Stream,70249,75099,0.3564,6,,,
1970-01-01 00:00:00.000089199,D4S1P2,Stream,88317,91898,0.147,7,,,


In [280]:
peaks_one_subject = peaks[peaks['Segmentation'] == col_name]

In [281]:
peaks_one_subject.head()

Unnamed: 0_level_0,Segmentation,Stimulus,OnsetMs,OffsetMs,Amplitude,StimulusPeakSequenceNr,Unnamed: 10,TotalPeakSequenceNr,Unnamed: 11
PeakMs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1970-01-01 00:00:00.000108550,D5S1P2,Forest,106787,111059,0.125801,2,,,
1970-01-01 00:00:00.000007124,D5S1P2,Ocean,3742,8895,0.159,1,0.036509,,
1970-01-01 00:00:00.000104620,D5S1P2,Stream,103113,106473,0.1245,14,,,
1970-01-01 00:00:00.000014390,D5S1P2,Forest,14114,14744,0.149427,7,,,
1970-01-01 00:00:00.000019273,D5S1P2,Forest,18515,20469,0.129513,10,,,


#### Use GSR index to select the Peaks Data

In [282]:
peaks_one_subject = peaks_one_subject.resample('1second').count()

ValueError: Invalid frequency: 1second

In [283]:
peaks_one_subject.head()

Unnamed: 0_level_0,Segmentation,Stimulus,OnsetMs,OffsetMs,Amplitude,StimulusPeakSequenceNr,Unnamed: 10,TotalPeakSequenceNr,Unnamed: 11
PeakMs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1970-01-01 00:00:00.000108550,D5S1P2,Forest,106787,111059,0.125801,2,,,
1970-01-01 00:00:00.000007124,D5S1P2,Ocean,3742,8895,0.159,1,0.036509,,
1970-01-01 00:00:00.000104620,D5S1P2,Stream,103113,106473,0.1245,14,,,
1970-01-01 00:00:00.000014390,D5S1P2,Forest,14114,14744,0.149427,7,,,
1970-01-01 00:00:00.000019273,D5S1P2,Forest,18515,20469,0.129513,10,,,


In [284]:
c0.head()

Timestamp
2019-02-27 00:00:00.000    False
2019-02-27 00:00:00.025     True
2019-02-27 00:00:00.050     True
2019-02-27 00:00:00.075    False
2019-02-27 00:00:00.100    False
Name: Timestamp, dtype: bool

In [285]:
peaks[c0]

  """Entry point for launching an IPython kernel.


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).