In [0]:
import pandas as pd
import glob

In [0]:
path = r'/Users/juho/Downloads/Outdoor'
acc, gps, rss, label = [], [], [], []

sensors = ['Accelerometer', 'Real_GPS', 'Rss', 'Transition']

for sensor in sensors:
    all_files = glob.glob(f'{path}/*/{sensor}.csv')
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        print(df.shape)
        if sensor == 'Accelerometer':
            acc.append(df)
        elif sensor == 'Real_GPS':
            gps.append(df)
        elif sensor == 'Rss':
            rss.append(df)
        elif sensor == 'Transition':
            label.append(df)

(339378, 5)
(74992, 5)
(267889, 5)
(158980, 5)
(1618, 18)
(162, 18)
(1551, 18)
(412, 18)
(53354, 6)
(13306, 6)
(27825, 6)
(25510, 6)
(101, 2)
(30, 2)
(50, 2)
(48, 2)


### Convert date to unix timestamp

In [0]:
from datetime import datetime
GPS = []
for g in gps:
    GPS.append(g.assign(timeMillis = lambda x: x['time'].apply(lambda y: (datetime.strptime(y, "%Y-%m-%dT%H:%M:%S.%fZ").timestamp() + 32400) * 1000)))

### Select data after starting point (e.g. press -1 button)

In [0]:
for i, l in enumerate(label):
    startTime = l['timeMillis'][0]
    print(rss[i].shape)
    rss[i] = rss[i].loc[rss[i]['timeMillis']  > startTime]
    print(rss[i].shape)
    print(acc[i].shape)
    acc[i] = acc[i].loc[acc[i]['timeMillis']  > startTime]
    print(acc[i].shape)

(53354, 6)
(53354, 6)
(339378, 5)
(339378, 5)
(13306, 6)
(13306, 6)
(74992, 5)
(74992, 5)
(27825, 6)
(27825, 6)
(267889, 5)
(267889, 5)
(25510, 6)
(25510, 6)
(158980, 5)
(158980, 5)


### Select 5GHZ bandwidth

In [0]:
RSS_5GHZ = []
for r in rss:
    print(r.shape)
    RSS_5GHZ.append(r.loc[(r['frequency'] >= 5000) & (r['frequency'] < 6000)])
for r in RSS_5GHZ:
    print(r.shape)

(53354, 6)
(13306, 6)
(27825, 6)
(25510, 6)
(32482, 6)
(5974, 6)
(14560, 6)
(12583, 6)


### Select 2GHZ bandwidth

In [0]:
RSS_2GHZ = []
for r in rss:
    print(r.shape)
    RSS_2GHZ.append(r.loc[(r['frequency'] >= 2000) & (r['frequency'] < 3000)])
for r in RSS_2GHZ:
    print(r.shape)

(53354, 6)
(13306, 6)
(27825, 6)
(25510, 6)
(20872, 6)
(7332, 6)
(13265, 6)
(12927, 6)


In [0]:
print(acc[0]['timeMillis'].min())

1574385249607


In [0]:
start = []
for i in range(4):
    min_arr = [RSS_2GHZ[i]['timeMillis'].min(), RSS_5GHZ[i]['timeMillis'].min(), GPS[i]['timeMillis'].min(), acc[i]['timeMillis'].min()]
    start.append(max(min_arr))
end = []
for i in range(4):
    max_arr = [RSS_2GHZ[i]['timeMillis'].max(), RSS_5GHZ[i]['timeMillis'].max(), GPS[i]['timeMillis'].max(), acc[i]['timeMillis'].max()]
    end.append(min(max_arr))

In [0]:
print(start)
print(end)

[1574385251822, 1574764503287, 1574998672996.0, 1575247525741]
[1574388640923, 1574765894850, 1575001186231, 1575250157784]


In [0]:
GPS[0].head()['timeMillis'][0]

1574385242474.0

### Lowpass Filter (Moving Average)

In [0]:
from scipy import signal
import numpy as np
from pylab import *

def ma(x, n = 20):
    b = np.repeat(1.0/n, n) #Create impulse response
    xf = signal.lfilter(b, 1, x) #Filter the signal
    return(xf)

In [0]:
featured_rss = []
WIN_SIZE_IN_MS = 1000 
OVERLAP_RATIO = 0.5 

for i in range(4):
    START_TIME, END_TIME = start[i], end[i]
    
    # arange([start, ]stop, [step, ]dtype=None)
    # find the end time of each window by considering the overlapping ratio 
    WINDOWS = np.arange(START_TIME + WIN_SIZE_IN_MS, END_TIME, WIN_SIZE_IN_MS * (1 - OVERLAP_RATIO))

    # new dataframe 
    FEATURES_TIME = []
    for w in WINDOWS:
        # for a given window, set the start and end time stamps 
        win_start, win_end = w - WIN_SIZE_IN_MS, w
      
        for var in ['rss']:
            # select the rows that belong to the current window, w
            value_5GHZ = RSS_5GHZ[i].loc[
                lambda x: (x['timeMillis'] >= win_start) & (x['timeMillis'] < win_end), var
            ].values
            value_2GHZ = RSS_2GHZ[i].loc[
                lambda x: (x['timeMillis'] >= win_start) & (x['timeMillis'] < win_end), var
            ].values
            # extract basic features 
            #min_v = np.min(value) # min
            #max_v = np.max(value) # max
            #mean_v = np.mean(value) # mean
            #std_v = np.std(value) # std. dev.

            # append each result (w: current window's end-timestamp, extracted feature) as a new row
            #FEATURES_TIME.append((w - , '{}-{}'.format('Min', var), min_v))
            #FEATURES_TIME.append((w, '{}-{}'.format('Max', var), max_v))
            #FEATURES_TIME.append((w, '{}-{}'.format('Mean', var), mean_v))
            #FEATURES_TIME.append((w, '{}-{}'.format('Std', var), std_v))
            if (value_5GHZ.size > 0):
                FEATURES_TIME.append((w, f'{var}-5GHZ-MEAN', np.mean(value_5GHZ)))
                FEATURES_TIME.append((w, f'{var}-5GHZ-MAX', np.max(value_5GHZ)))
                FEATURES_TIME.append((w, f'{var}-5GHZ-MIN', np.min(value_5GHZ)))
                FEATURES_TIME.append((w, f'{var}-5GHZ-STD', np.std(value_5GHZ)))
            else:
                FEATURES_TIME.append((w, f'{var}-5GHZ-MEAN', np.nan))
                FEATURES_TIME.append((w, f'{var}-5GHZ-MAX', np.nan))
                FEATURES_TIME.append((w, f'{var}-5GHZ-MIN', np.nan))
                FEATURES_TIME.append((w, f'{var}-5GHZ-STD', np.nan))
            if (value_2GHZ.size > 0):
                FEATURES_TIME.append((w, f'{var}-2GHZ-MEAN', np.mean(value_2GHZ)))
                FEATURES_TIME.append((w, f'{var}-2GHZ-MAX', np.max(value_2GHZ)))
                FEATURES_TIME.append((w, f'{var}-2GHZ-MIN', np.min(value_2GHZ)))
                FEATURES_TIME.append((w, f'{var}-2GHZ-STD', np.std(value_2GHZ)))
            else:
                FEATURES_TIME.append((w, f'{var}-2GHZ-MEAN', np.nan))
                FEATURES_TIME.append((w, f'{var}-2GHZ-MAX', np.nan))
                FEATURES_TIME.append((w, f'{var}-2GHZ-MIN', np.nan))
                FEATURES_TIME.append((w, f'{var}-2GHZ-STD', np.nan))

    # FEATURES_TIME looks like this <timestamp, feature name, value> 
    # (4000.0, 'Min-x', -1.848), (4000.0, 'Max-x', -1.659), (4000.0, 'Mean-x', -1.7706786155747838), (4000.0, 'Std-x', 0.023614236975760955)

    # Create a dataframe w/ column names: timestamps, feature, value  
    FEATURES_TIME = pd.DataFrame(FEATURES_TIME, columns=['timestamps', 'feature', 'value'])

    # Reshape data to produce a pivot table based on column values
    FEATURES_TIME = FEATURES_TIME.pivot(index='timestamps', columns='feature', values='value').reset_index()
    featured_rss.append(FEATURES_TIME)
    

In [0]:
featured_gps = []
WIN_SIZE_IN_MS = 1000 
OVERLAP_RATIO = 0.5 

for i in range(4):
    START_TIME, END_TIME = start[i], end[i]
    
    # arange([start, ]stop, [step, ]dtype=None)
    # find the end time of each window by considering the overlapping ratio 
    WINDOWS = np.arange(START_TIME + WIN_SIZE_IN_MS, END_TIME, WIN_SIZE_IN_MS * (1 - OVERLAP_RATIO))

    # new dataframe 
    FEATURES_TIME = []
    for w in WINDOWS:
        # for a given window, set the start and end time stamps 
        win_start, win_end = w - WIN_SIZE_IN_MS, w
      
        for var in ['accuracy', 'satellites']:
            # select the rows that belong to the current window, w
            value = GPS[i].loc[
                lambda x: (x['timeMillis'] >= win_start) & (x['timeMillis'] < win_end), var
            ].values

            # extract basic features 
            #min_v = np.min(value) # min
            #max_v = np.max(value) # max
            #mean_v = np.mean(value) # mean
            #std_v = np.std(value) # std. dev.

            # append each result (w: current window's end-timestamp, extracted feature) as a new row
            #FEATURES_TIME.append((w - , '{}-{}'.format('Min', var), min_v))
            #FEATURES_TIME.append((w, '{}-{}'.format('Max', var), max_v))
            #FEATURES_TIME.append((w, '{}-{}'.format('Mean', var), mean_v))
            #FEATURES_TIME.append((w, '{}-{}'.format('Std', var), std_v))
            if (value.size > 0):
                FEATURES_TIME.append((w, f'{var}', np.mean(value)))
            else:
                FEATURES_TIME.append((w, f'{var}', np.nan))

    # FEATURES_TIME looks like this <timestamp, feature name, value> 
    # (4000.0, 'Min-x', -1.848), (4000.0, 'Max-x', -1.659), (4000.0, 'Mean-x', -1.7706786155747838), (4000.0, 'Std-x', 0.023614236975760955)

    # Create a dataframe w/ column names: timestamps, feature, value  
    FEATURES_TIME = pd.DataFrame(FEATURES_TIME, columns=['timestamps', 'feature', 'value'])

    # Reshape data to produce a pivot table based on column values
    FEATURES_TIME = FEATURES_TIME.pivot(index='timestamps', columns='feature', values='value').reset_index()
    featured_gps.append(FEATURES_TIME)
    

In [0]:
WIN_SIZE_IN_MS = 1000 
OVERLAP_RATIO = 0.5
BIN_SIZE = 128
featured_acc = []

for i in range(4):
    START_TIME, END_TIME = start[i], end[i]
    DURATION_IN_SEC = (END_TIME - START_TIME) / 1000
    F_s = len(acc[i].index) / DURATION_IN_SEC
    P = 1.0 / F_s
    # arange([start, ]stop, [step, ]dtype=None)
    # find the end time of each window by considering the overlapping ratio 
    WINDOWS = np.arange(START_TIME + WIN_SIZE_IN_MS, END_TIME, WIN_SIZE_IN_MS * (1 - OVERLAP_RATIO))

    # new dataframe 
    FEATURES_FREQ = []
    for w in WINDOWS:
        # for a given window, set the start and end time stamps 
        win_start, win_end = w - WIN_SIZE_IN_MS, w
      
        for var in ['x', 'y', 'z']:
            # select the rows that belong to the current window, w
            value = acc[i].loc[
                lambda x: (x['timeMillis'] >= win_start) & (x['timeMillis'] < win_end), var
            ].values
            if (value.size > 0):
                value = ma(value)

                fft = np.fft.fft(value * np.hamming(value.shape[0]), n=BIN_SIZE)[1:BIN_SIZE//2]
                freq = np.fft.fftfreq(BIN_SIZE, P)[1:BIN_SIZE//2]
                amp = np.abs(fft)
                energy = amp ** 2
                amp_norm = amp / BIN_SIZE
                energy_norm = energy / BIN_SIZE

                freq_max_amp = freq[np.argmax(amp_norm)]
                weight_amp_avg = np.sum(amp * freq) / np.sum(amp)
                weight_energy_avg = np.sum(energy * freq) / np.sum(energy)
                power_entropy = - np.sum((energy / np.sum(energy)) * np.log(energy / np.sum(energy)))
                FEATURES_FREQ.append((w, '{}-{}'.format('Freq. Max. Amp', var), freq_max_amp))
                FEATURES_FREQ.append((w, '{}-{}'.format('Weighted Avg. Amp', var), weight_amp_avg))
                FEATURES_FREQ.append((w, '{}-{}'.format('Weighted Avg. Energy', var), weight_energy_avg))
                FEATURES_FREQ.append((w, '{}-{}'.format('Power Spec. Entropy', var), power_entropy))
            else:
                FEATURES_FREQ.append((w, '{}-{}'.format('Freq. Max. Amp', var), np.nan))
                FEATURES_FREQ.append((w, '{}-{}'.format('Weighted Avg. Amp', var), np.nan))
                FEATURES_FREQ.append((w, '{}-{}'.format('Weighted Avg. Energy', var), np.nan))
                FEATURES_FREQ.append((w, '{}-{}'.format('Power Spec. Entropy', var), np.nan))

    FEATURES_FREQ = pd.DataFrame(FEATURES_FREQ, columns=['timestamps', 'feature', 'value'])
    FEATURES_FREQ = FEATURES_FREQ.pivot(index='timestamps', columns='feature', values='value').reset_index()
    featured_acc.append(FEATURES_FREQ)

In [0]:
for i in range(4):
    featured_acc[i].set_index('timestamps', drop=True, inplace=True)
    featured_rss[i].set_index('timestamps', drop=True, inplace=True)
    featured_gps[i].set_index('timestamps', drop=True, inplace=True)

In [0]:
for i in range(4):
    pd.concat([featured_acc[i], featured_rss[i], featured_gps[i]], axis=1, sort=False).to_csv(f'{path}/output/Data{i}.csv')

In [0]:
pd.read_csv(f'{path}/output/Data1.csv')

Unnamed: 0,timestamps,Freq. Max. Amp-x,Freq. Max. Amp-y,Freq. Max. Amp-z,Power Spec. Entropy-x,Power Spec. Entropy-y,Power Spec. Entropy-z,Weighted Avg. Amp-x,Weighted Avg. Amp-y,Weighted Avg. Amp-z,...,rss-2GHZ-MAX,rss-2GHZ-MEAN,rss-2GHZ-MIN,rss-2GHZ-STD,rss-5GHZ-MAX,rss-5GHZ-MEAN,rss-5GHZ-MIN,rss-5GHZ-STD,accuracy,satellites
0,1.574765e+12,0.421019,0.421019,0.421019,1.007134,0.205365,0.190349,1.784358,1.945713,1.544612,...,-48.0,-76.727273,-93.0,14.251577,-69.0,-80.642857,-94.0,8.173189,8.0,29.0
1,1.574765e+12,0.421019,0.421019,0.421019,0.801706,0.324261,0.155333,1.615358,1.284646,1.568717,...,,,,,,,,,8.0,29.0
2,1.574765e+12,0.421019,0.421019,0.421019,1.128230,0.234497,0.141259,1.665184,1.739788,1.623497,...,,,,,,,,,4.0,29.0
3,1.574765e+12,0.842039,0.421019,0.421019,1.085158,0.240752,0.196452,1.381500,1.607693,1.517893,...,,,,,,,,,4.0,29.0
4,1.574765e+12,0.421019,0.421019,0.421019,0.706371,0.210289,0.194802,1.249447,1.074657,1.629067,...,,,,,,,,,8.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777,1.574766e+12,0.421019,0.421019,0.421019,0.339592,0.190871,0.124617,1.177897,1.577223,1.684086,...,,,,,,,,,20.0,21.5
2778,1.574766e+12,0.421019,0.421019,0.421019,0.734564,0.454358,0.178359,1.530347,1.212711,1.652691,...,,,,,,,,,20.0,23.0
2779,1.574766e+12,0.421019,0.842039,0.421019,0.294216,1.202583,0.346011,2.084356,2.038058,1.383292,...,,,,,,,,,20.0,23.0
2780,1.574766e+12,0.421019,0.421019,0.421019,0.577602,0.315167,0.160445,1.115864,1.422802,1.483604,...,,,,,,,,,20.0,23.0


In [0]:
import pandas as pd
data = pd.read_csv('/content/Data0.csv')
data.head()

Unnamed: 0,timestamps,Freq. Max. Amp-x,Freq. Max. Amp-y,Freq. Max. Amp-z,Power Spec. Entropy-x,Power Spec. Entropy-y,Power Spec. Entropy-z,Weighted Avg. Amp-x,Weighted Avg. Amp-y,Weighted Avg. Amp-z,Weighted Avg. Energy-x,Weighted Avg. Energy-y,Weighted Avg. Energy-z,rss-2GHZ-MAX,rss-2GHZ-MEAN,rss-2GHZ-MIN,rss-2GHZ-STD,rss-5GHZ-MAX,rss-5GHZ-MEAN,rss-5GHZ-MIN,rss-5GHZ-STD,accuracy,satellites
0,1574385000000.0,0.782329,0.782329,0.782329,0.813958,0.39289,0.305882,1.878761,3.916973,2.816101,1.099623,0.905402,0.856118,-67.0,-86.6,-94.0,6.627217,-87.0,-89.333333,-93.0,2.624669,20.0,25.0
1,1574385000000.0,0.782329,0.782329,0.782329,0.777437,0.417163,0.279525,2.537347,2.635022,2.468075,1.105481,0.892526,0.846623,,,,,,,,,20.0,25.0
2,1574385000000.0,0.782329,0.782329,0.782329,0.764845,0.304458,0.108691,3.370011,2.416517,3.091354,1.097041,0.853496,0.812313,,,,,,,,,20.0,25.0
3,1574385000000.0,0.782329,0.782329,0.782329,0.590041,0.14641,0.223807,2.747106,3.04888,3.230398,0.97241,0.834071,0.835829,,,,,,,,,20.0,24.0
4,1574385000000.0,0.782329,0.782329,0.782329,0.906496,0.425345,0.284674,3.713616,2.482708,2.748236,1.255077,0.895182,0.848815,,,,,,,,,20.0,24.0


In [0]:
label = pd.read_csv('/content/Transition.csv')
label.head()

Unnamed: 0,floor,timeMillis
0,-1,1574385249558
1,-5,1574385292404
2,2,1574385297110
3,7,1574385333303
4,8,1574385346916


In [0]:
DATA_LABELED = data.assign(label = 'Undefined')
state = 'OUT'
previous = 0

for row in label.iterrows():
    DATA_LABELED.loc[
        lambda x: (x.timestamps <= row[1].timeMillis) & (x.timestamps >= previous), 'label'
    ] = state

    previous = row[1].timeMillis

    if (row[1].floor in (-5, 1, 6)):
        state = 'IN'
    if (row[1].floor in (-3, 3, 8)):
        state = 'OUT'