In [1]:
# Based on Vittorio's code: generates a dataset of labelled indoor/outdoor/commuting data
# by combining the datasets from John Palmer's indoor/outdoor classification
# with the dataset from Mihai Visuian's commuting classification.

In [2]:
import pandas as pd
import numpy as np
from os import path
import helper_functions_io as hf
import os
import data_process as dp
# Directory that holds the input CSV files (and receives the output CSV).
data_dir = "./data/indoor_outdoor"
# Names of the 16 bin columns: 'bin0' ... 'bin15'.
bins = ['bin%d' % bin_no for bin_no in range(16)]

In [165]:
# Read John's train/test splits and Mihai's data from data_dir.
john_train = pd.read_csv(path.join(data_dir, 'john_train.csv'))
john_test = pd.read_csv(path.join(data_dir, 'john_test.csv'))
mihai_data = pd.read_csv(path.join(data_dir,'mihai_training_data_gps_acc.csv'))

# Stack John's two splits, discard the 'Unnamed: 0' column and renumber the
# rows from 0.
john_data = (
    pd.concat([john_train, john_test])
    .drop('Unnamed: 0', axis=1)
    .reset_index(drop=True)
)
# Same clean-up for Mihai's frame.
mihai_data = mihai_data.drop('Unnamed: 0', axis=1).reset_index(drop=True)

In [166]:
mihai_data.head()

Unnamed: 0,bin0,bin1,bin2,bin3,bin4,bin5,bin6,bin7,bin8,bin9,...,pm10,environment_index,gpsLatitude,gpsLongitude,gpsAccuracy,humidity,temperature,phoneTimestamp,file label,total
0,631,230,75,35,20,17,14,2,0,0,...,8.180582,2,53.36033,-2.342155,8.4,59.0,26.300001,1507895577439,10,1024
1,1713,593,302,176,79,55,33,4,3,0,...,28.392464,2,53.357458,-2.341658,4.0,58.7,26.300001,1507895597696,10,2958
2,700,256,109,57,28,29,16,8,0,2,...,16.042948,2,53.343682,-2.337317,3.9,58.5,26.300001,1507895761413,10,1205
3,523,209,52,38,8,20,5,0,2,0,...,6.765331,2,53.339758,-2.33573,3.9,58.2,26.4,1507895782200,10,857
4,409,157,46,14,5,11,6,3,2,0,...,6.702554,2,53.335658,-2.334128,3.9,58.0,26.4,1507895804320,10,653


In [167]:
mihai_data.shape

(2870, 28)

In [168]:
# Turn Mihai's raw bin counts into per-row fractions (each bin divided by the
# row's sum over all bins) so both sources use the same bin format.
row_totals = mihai_data[bins].sum(axis=1)
mihai_data[bins] = mihai_data[bins].div(row_totals, axis=0)

# Give Mihai's columns the same names as John's.
mihai_data = mihai_data.rename(columns={'phoneTimestamp':'timestamp','environment_index':'i/o'})

# Label every row of Mihai's data as 2, indicating commuting.
mihai_data['i/o'] = 2

# Keep only the columns that exist in both frames.
cols = john_data.columns.intersection(mihai_data.columns)
john_data_n = john_data[cols]
mihai_data = mihai_data[cols]

In [169]:
john_data.columns.intersection(mihai_data.columns)

Index(['timestamp', 'pm1', 'pm2_5', 'pm10', 'temperature', 'humidity', 'bin0',
       'bin1', 'bin2', 'bin3', 'bin4', 'bin5', 'bin6', 'bin7', 'bin8', 'bin9',
       'bin10', 'bin11', 'bin12', 'bin13', 'bin14', 'bin15', 'total',
       'gpsLongitude', 'gpsLatitude', 'gpsAccuracy', 'i/o', 'file label'],
      dtype='object')

In [170]:
np.unique(mihai_data['file label'], return_counts=True)

(array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
 array([ 35, 241, 251, 432, 289, 146, 893, 150, 243, 190]))

In [171]:
np.unique(john_data['file label'], return_counts=True)

(array([ 2,  6,  8, 25]), array([  32, 1960,  181, 2109]))

In [172]:
# Concat data together to make one large dataset
full_data = pd.concat([mihai_data, john_data_n])
full_data.reset_index(drop=True, inplace=True)
full_data.head()

Unnamed: 0,timestamp,pm1,pm2_5,pm10,temperature,humidity,bin0,bin1,bin2,bin3,...,bin12,bin13,bin14,bin15,total,gpsLongitude,gpsLatitude,gpsAccuracy,i/o,file label
0,1507895577439,2.205355,5.798239,8.180582,26.300001,59.0,0.616211,0.224609,0.073242,0.03418,...,0.0,0.0,0.0,0.0,1024.0,-2.342155,53.36033,8.4,2,10
1,1507895597696,7.403782,19.711422,28.392464,26.300001,58.7,0.579108,0.200473,0.102096,0.0595,...,0.0,0.0,0.0,0.0,2958.0,-2.341658,53.357458,4.0,2,10
2,1507895761413,2.655042,8.001555,16.042948,26.300001,58.5,0.580913,0.212448,0.090456,0.047303,...,0.0,0.0,0.0,0.0,1205.0,-2.337317,53.343682,3.9,2,10
3,1507895782200,1.79913,4.281383,6.765331,26.4,58.2,0.610268,0.243874,0.060677,0.044341,...,0.0,0.0,0.0,0.0,857.0,-2.33573,53.339758,3.9,2,10
4,1507895804320,1.281362,3.25158,6.702554,26.4,58.0,0.62634,0.240429,0.070444,0.02144,...,0.0,0.0,0.0,0.0,653.0,-2.334128,53.335658,3.9,2,10


In [173]:
full_data.shape

(7152, 28)

In [174]:
# Convert data to csv
#full_data.to_csv(path.join(data_dir, "indoor_outdoor_commuting_data.csv"), index=False)
# count of indoor / outdoor / commuting data: 2141 / 2141 / 2870

In [175]:
np.unique(full_data['i/o'],return_counts=True)

(array([0, 1, 2]), array([2141, 2141, 2870]))

In [176]:
# Build a second dataset: John's indoor rows (i/o == 0) as the indoor class and
# all of Mihai's data as the commuting class.
# NOTE(review): the original note also listed "Stationary Sensors as outdoor",
# but no stationary-sensor data is read in the cells shown here — confirm.

new_indoor = john_data[john_data['i/o'] == 0]
new_commuting = mihai_data

# NOTE: cols is reassigned to an explicit list in a later cell before it is used.
cols = new_indoor.columns
# Disabled experiment (kept for reference):
#static_cols = cols.delete([22,23,24])
#static_cols

In [177]:
# Drop every commuting row that contains at least one missing value.
new_commuting = new_commuting.dropna()

In [178]:
# Columns (and their order) kept in the final dataset. Unlike the earlier
# column intersection, 'total' is not included.
cols = ['timestamp', 'pm1', 'pm2_5', 'pm10', 'temperature', 'humidity', 'bin0',
       'bin1', 'bin2', 'bin3', 'bin4', 'bin5', 'bin6', 'bin7', 'bin8', 'bin9',
       'bin10', 'bin11', 'bin12', 'bin13', 'bin14', 'bin15',
       'gpsLongitude', 'gpsLatitude', 'gpsAccuracy', 'file label', 'i/o']

In [179]:
# Stack the indoor and commuting rows, then count the rows per label.
# NOTE: the index is not reset here, so each source keeps its own row labels
# (both frames were previously renumbered from 0, so labels can repeat).
# Label key for 'i/o' — 0: indoor; 1: outdoor; 2: commuting
new_data_set = pd.concat([new_indoor, new_commuting])
np.unique(new_data_set['i/o'], return_counts=True)

(array([0, 2]), array([2141, 2410]))

In [180]:
# Set every row currently labelled 1 or 2 to 1 and leave label 0 untouched,
# so the commuting rows end up with label 1.
relabel_mask = new_data_set['i/o'].isin([2,1])
new_data_set.loc[relabel_mask, 'i/o'] = 1

In [181]:
# Keep only the columns named in cols (in that order), then check the shape.
new_data_set = new_data_set[cols]
new_data_set.shape

(4551, 27)

In [182]:
new_data_set.head()

Unnamed: 0,timestamp,pm1,pm2_5,pm10,temperature,humidity,bin0,bin1,bin2,bin3,...,bin11,bin12,bin13,bin14,bin15,gpsLongitude,gpsLatitude,gpsAccuracy,file label,i/o
26,1518035457738,1.245806,1.590959,1.605238,24.7,23.7,0.635514,0.17757,0.140187,0.028037,...,0.0,0.0,0.0,0.0,0.0,-3.183433,55.944689,21.232,6,0
27,1518038866288,0.558534,0.773462,0.949025,24.1,23.800001,0.617021,0.12766,0.191489,0.042553,...,0.0,0.0,0.0,0.0,0.0,-3.183416,55.944687,20.036,6,0
28,1518039131629,0.347164,0.445047,0.461267,24.300001,24.0,0.71875,0.15625,0.03125,0.03125,...,0.0,0.0,0.0,0.0,0.0,-3.183438,55.94469,21.311,6,0
29,1518041088275,0.46605,0.630702,0.819997,22.7,25.4,0.7,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,-3.183397,55.944689,20.114,6,0
30,1518046406113,3.238715,5.534687,13.598838,26.0,36.5,0.433213,0.176895,0.173285,0.079422,...,0.0,0.0,0.0,0.0,0.0,-3.183374,55.944709,23.329,6,0


In [183]:
# Feature engineering with helpers from data_process (imported as dp).
# The descriptions below are carried over from the notes that accompanied
# these calls.
#
# dp.distance_euclidean(data):
#   Calculates the Euclidean distance between two GPS points based on the
#   longitude and latitude.
#   :param data: DataFrame --> needs to include the gpsLongitude and gpsLatitude features
#   :return: DataFrame with the GPS distance included as an additional feature
#
# dp.calculate_std(data, column_name, k=10):
#   Calculates the standard deviation of a given column.
#   :param data: DataFrame --> data
#   :param column_name: String --> name of the column for which the standard deviation is calculated
#   :param k: Int --> window size
#   :return: DataFrame --> data with an additional column for the standard deviation (column_name_std)

data = dp.distance_euclidean(new_data_set) # adds the gps_dist feature
data = dp.calculate_std(data, 'gps_dist') # adds the gps_dist_std feature

In [184]:
def min_max_norm(data, col):
    """Rescale one column of ``data`` to the [0, 1] range (min-max scaling).

    The column is overwritten in place on ``data``; the same DataFrame object
    is also returned so the call can be used in an assignment.

    Missing values are ignored when finding the minimum and maximum and stay
    missing in the result. If every non-missing value is identical, the valid
    entries are set to 0.0 instead of dividing by zero. A column with no valid
    values is left as all-missing rather than raising.

    :param data: DataFrame --> data containing ``col``
    :param col: String --> name of the numeric column to rescale
    :return: DataFrame --> ``data`` with ``col`` replaced by its scaled values
    """
    target_col = data[col]
    # Series.min/max skip NaN by default and return NaN (instead of raising)
    # when the column has no valid values.
    min_num = target_col.min()
    max_num = target_col.max()
    value_range = max_num - min_num

    if value_range == 0:
        # Constant column: (x - min) is already 0 for valid entries and NaN
        # for missing ones, so no division is needed.
        scaled = (target_col - min_num).astype(float)
    else:
        scaled = (target_col - min_num) / value_range

    data[col] = scaled
    return data

In [185]:
# Rescale the temperature column with min_max_norm (min-max scaling).
data = min_max_norm(data, 'temperature')

In [186]:
# Rescale the humidity column with min_max_norm (min-max scaling).
data = min_max_norm(data, 'humidity')

In [187]:
data.head()

Unnamed: 0,timestamp,pm1,pm2_5,pm10,temperature,humidity,bin0,bin1,bin2,bin3,...,bin13,bin14,bin15,gpsLongitude,gpsLatitude,gpsAccuracy,file label,i/o,gps_dist,gps_dist_std
26,1518035457738,1.245806,1.590959,1.605238,0.721408,0.08815,0.635514,0.17757,0.140187,0.028037,...,0.0,0.0,0.0,-3.183433,55.944689,21.232,6,0,0.0,0.0
27,1518038866288,0.558534,0.773462,0.949025,0.703812,0.089595,0.617021,0.12766,0.191489,0.042553,...,0.0,0.0,0.0,-3.183416,55.944687,20.036,6,0,1.8e-05,0.0
28,1518039131629,0.347164,0.445047,0.461267,0.709677,0.092486,0.71875,0.15625,0.03125,0.03125,...,0.0,0.0,0.0,-3.183438,55.94469,21.311,6,0,2.3e-05,0.0
29,1518041088275,0.46605,0.630702,0.819997,0.662757,0.112717,0.7,0.125,0.125,0.0,...,0.0,0.0,0.0,-3.183397,55.944689,20.114,6,0,4.1e-05,0.0
30,1518046406113,3.238715,5.534687,13.598838,0.759531,0.273121,0.433213,0.176895,0.173285,0.079422,...,0.0,0.0,0.0,-3.183374,55.944709,23.329,6,0,3e-05,0.0


In [188]:
# Write the final dataset to new_data_set.csv inside data_dir, without the
# index column.
data.to_csv(path.join(data_dir, "new_data_set.csv"), index=False)