In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import bootstrap_plot
from sklearn import preprocessing
import datetime
import numpy as np

# Width of the resampling window used for every sensor stream (pandas offset alias).
seconds = "5s"

# Directory that is searched recursively for the raw CSV files.
path = "./src/Data/Test"

# Full path of every file found anywhere below `path`.
all_files = [
    os.path.join(dir_path, file_name)
    for dir_path, _sub_dirs, file_names in os.walk(path)
    for file_name in file_names
]

def getDFForDataSource(data_source, files=None):
    """Load every CSV of one data source and tag each row with its segment.

    For each segment letter ("A", "B", "C") the candidate paths that contain
    ``"<segment>-<data_source>.csv"`` are read as ``;``-delimited CSV files and
    concatenated; every row of that segment gets a ``segment`` column holding
    the letter.

    Parameters
    ----------
    data_source : str
        Name of the data source as it appears in the file names,
        e.g. ``"proximity"`` matches ``...A-proximity.csv``.
    files : iterable of str, optional
        Paths to search. Defaults to the module-level ``all_files`` list.

    Returns
    -------
    pandas.DataFrame
        Rows of segment A, then B, then C, with the columns sorted. The
        per-file row index is kept, so index values are NOT unique.

    Raises
    ------
    ValueError
        If no file is found for one of the segments.
    """
    candidate_files = all_files if files is None else files

    segment_frames = []
    for segment in ("A", "B", "C"):
        file_suffix = segment + "-" + data_source + ".csv"
        segment_files = [f for f in candidate_files if file_suffix in f]
        if not segment_files:
            # pd.concat([]) would fail with an opaque "No objects to concatenate".
            raise ValueError("no CSV file matching '%s' found" % file_suffix)
        segment_df = pd.concat(
            [pd.read_csv(f, delimiter=";") for f in segment_files], sort=True
        )
        segment_df["segment"] = segment
        segment_frames.append(segment_df)

    return pd.concat(segment_frames, sort=True)


# Read the raw CSV files of the four data sources (segments A, B and C each).
proximity_df, sensors_df, acceleration_df, floor_df = [
    getDFForDataSource(source_name)
    for source_name in ("proximity", "sensors", "acceleration", "floor")
]

# Activity labels, with the timestamp column parsed to datetimes.
activities = pd.read_csv("./src/labels_testing_set.csv", delimiter=",")
activities = activities.assign(TIMESTAMP=pd.to_datetime(activities["TIMESTAMP"]))
print(activities.head())

# Number of rows per frame, printed in the order: floor, labels, sensors, acceleration, proximity.
for loaded_frame in (floor_df, activities, sensors_df, acceleration_df, proximity_df):
    print(len(loaded_frame))


            TIMESTAMP     ACTIVITY SEGMENT
0 2017-11-09 12:02:30  no activity       A
1 2017-11-09 12:03:00  no activity       A
2 2017-11-09 12:03:30        Act24       A
3 2017-11-09 12:04:00        Act24       A
4 2017-11-09 12:04:30        Act24       A
37035
535
1115
774243
3908


In [2]:
# Index the labels by time so they can be resampled.
# NOTE: this rebinds `activities`; re-running the cell without re-running the
# loading cell fails because TIMESTAMP is then no longer a column.
activities = activities.set_index("TIMESTAMP")

# Upsample the labels to one row per second; every new row repeats the most
# recent label row before it (forward fill).
activities_resampled_in_sec = activities.resample("1s").ffill() # forward fill: carry the last observed label onward



In [3]:
# prepare dfs to similar column format


# proximity df
# Turn the long format (one row per reading: OBJECT, RSSI) into a wide format
# with one binary column per object. Rows with any missing value are dropped.
proximity_df_resampled = proximity_df.drop(["ID"], axis=1).dropna()
proximity_objects = proximity_df_resampled["OBJECT"].unique()

# Add one (still empty) column per object. ignore_index=True also replaces the
# duplicated per-file row numbers by a unique 0..n-1 index.
proximity_df_resampled = pd.concat([proximity_df_resampled, pd.DataFrame(columns=proximity_objects)], ignore_index=True, sort=True)

# A reading counts as "object in range" (1) when its RSSI is at least this value, else 0.
# https://www.speedguide.net/faq/how-does-rssi-dbm-relate-to-signal-quality-percent-439
RSSI_IN_RANGE_THRESHOLD = -97
proximity_in_range = (proximity_df_resampled["RSSI"] >= RSSI_IN_RANGE_THRESHOLD).astype(int)
for proximity_object in proximity_objects:
    # 0/1 on the rows that belong to this object, NaN on all other rows
    proximity_df_resampled[proximity_object] = proximity_in_range.where(
        proximity_df_resampled["OBJECT"] == proximity_object
    )

proximity_df_resampled = proximity_df_resampled.drop(["OBJECT", "RSSI"], axis=1)
proximity_df_resampled["TIMESTAMP"] = pd.to_datetime(proximity_df_resampled["TIMESTAMP"])



# sensors df
# Turn the long format (one row per event: OBJECT, STATE) into a wide format
# with one binary column per sensor.
sensors_df_resampled = sensors_df.drop(["HABITANT"], axis=1)
sensors_objects = ['SM4', 'C14', 'D07', 'C10', 'C09', 'SM3', 'SM1', 'D04', 'D01', 'D10', 'D02', 'D03',
 'C13', 'M01', 'C08', 'C12', 'D09', 'C04', 'C07', 'H01', 'D08', 'TV0', 'S09', 'SM5',
 'C02', 'C01', 'D05', 'C05', 'C03', 'C015']

# Add one (still empty) column per known sensor. ignore_index=True also replaces
# the duplicated per-file row numbers by a unique 0..n-1 index.
sensors_df_resampled = pd.concat([sensors_df_resampled, pd.DataFrame(columns=sensors_objects)], ignore_index=True, sort=True)

# preprocess the state column: "Open", "Movement" and "Pressure" = 1, else 0
sensor_active = sensors_df_resampled["STATE"].isin(["Movement", "Pressure", "Open"]).astype(int)
# Iterate over the sensors present in the data (not only `sensors_objects`) so a
# sensor missing from the list still gets its own column. Rows without an
# OBJECT are skipped instead of creating a NaN-named column.
for sensor_object in sensors_df_resampled["OBJECT"].dropna().unique():
    # 0/1 on the rows that belong to this sensor, NaN on all other rows
    sensors_df_resampled[sensor_object] = sensor_active.where(
        sensors_df_resampled["OBJECT"] == sensor_object
    )

sensors_df_resampled = sensors_df_resampled.drop(["OBJECT", "STATE"], axis=1)
sensors_df_resampled["TIMESTAMP"] = pd.to_datetime(sensors_df_resampled["TIMESTAMP"])
# Carry the last reported state of each sensor forward in row order; a sensor
# that has not reported anything yet counts as 0.
sensors_df_resampled = sensors_df_resampled.ffill()
sensors_df_resampled = sensors_df_resampled.fillna(0)




# acceleration df
# Work on a copy of the raw frame whose TIMESTAMP column is parsed to datetimes.
acceleration_df_resampled = acceleration_df.assign(
    TIMESTAMP=pd.to_datetime(acceleration_df["TIMESTAMP"])
)


# floor df
# Only the timestamp and the device that fired are used. Keep ONE row per
# (TIMESTAMP, DEVICE) pair: the previous keep=False removed every copy of a
# duplicated pair, which discarded those activations entirely.
floor_df_temp = floor_df[["TIMESTAMP", "DEVICE"]].drop_duplicates()
# one hot encoding the nominal device data: every column except TIMESTAMP
# becomes a set of "floor_<value>" indicator columns
non_dummy_cols = ['TIMESTAMP']
dummy_cols = [column for column in floor_df_temp.columns if column not in non_dummy_cols]
floor_df_resampled = pd.get_dummies(floor_df_temp, columns=dummy_cols, prefix=['floor'])

# For every source: truncate the timestamps to whole seconds, drop the
# "segment" helper column (the floor frame has none) and index by TIMESTAMP.
proximity_df_resampled = (
    proximity_df_resampled
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame["TIMESTAMP"]).dt.floor('1s'))
    .drop(["segment"], axis=1)
    .set_index("TIMESTAMP")
)

sensors_df_resampled = (
    sensors_df_resampled
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame["TIMESTAMP"]).dt.floor('1s'))
    .drop(["segment"], axis=1)
    .set_index("TIMESTAMP")
)

acceleration_df_resampled = (
    acceleration_df_resampled
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame["TIMESTAMP"]).dt.floor('1s'))
    .drop(["segment"], axis=1)
    .set_index("TIMESTAMP")
)

floor_df_resampled = (
    floor_df_resampled
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame["TIMESTAMP"]).dt.floor('1s'))
    .set_index("TIMESTAMP")
)


In [4]:
# Aggregate every source into consecutive, non-overlapping time slots of width
# `seconds`. Input: frames indexed by TIMESTAMP with one column per feature.

# All features must be numeric before aggregating; missing binary flags count as 0.
proximity_df_resampled = proximity_df_resampled.fillna(0).astype(float)
sensors_df_resampled = sensors_df_resampled.fillna(0.0).astype(float)
acceleration_df_resampled = acceleration_df_resampled.astype(float)

# see doc: https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html
# Binary features use max (1 if the flag was set anywhere in the slot),
# acceleration uses the mean. Slots without any value are dropped.
proximity_df_resampled_in_sec = proximity_df_resampled.resample(seconds).max().dropna(how='all')
sensors_df_resampled_in_sec = sensors_df_resampled.resample(seconds).max().dropna(how='all')
acceleration_df_resampled_in_sec = acceleration_df_resampled.resample(seconds).mean().dropna(how='all')
floor_df_resampled_in_sec = floor_df_resampled.resample(seconds).max().dropna(how='all')

# Number of remaining slots per source: proximity, sensors, acceleration, floor.
for resampled_frame in (
    proximity_df_resampled_in_sec,
    sensors_df_resampled_in_sec,
    acceleration_df_resampled_in_sec,
    floor_df_resampled_in_sec,
):
    print(len(resampled_frame))


1908
745
2711
1669


In [5]:
# merge floor data into areas to reduce features
# areas: entrance, living room, bed room, kitchen
# Maps every one-hot floor column to the area it belongs to; floor columns
# that are not listed here are left out of room_df.
room_mapping = {'floor_01,01': 'living room',
                'floor_01,02': 'living room',
                'floor_01,03': 'living room',
                'floor_01,04': 'living room',
                'floor_01,05': 'living room',
                'floor_01,06': 'living room',
                'floor_01,07': 'living room',
                'floor_01,08': 'entrance',
                'floor_01,09': 'entrance',
                'floor_02,01': 'living room',
                'floor_02,02': 'living room',
                'floor_02,03': 'living room',
                'floor_02,04': 'living room',
                'floor_02,05': 'living room',
                'floor_02,06': 'living room',
                'floor_02,07': 'living room',
                'floor_02,08': 'entrance',
                'floor_02,09': 'entrance',
                'floor_02,10': 'entrance',
                'floor_03,01': 'bed room',
                'floor_03,02': 'bed room',
                'floor_03,03': 'bed room',
                'floor_03,04': 'bed room',
                'floor_03,05': 'bed room',
                'floor_03,06': 'kitchen',
                'floor_03,07': 'kitchen',
                'floor_03,08': 'kitchen',
                'floor_03,09': 'kitchen',
                'floor_04,01': 'bed room',
                'floor_04,02': 'bed room',
                'floor_04,03': 'bed room',
                'floor_04,04': 'bed room',
                'floor_04,05': 'bed room',
                'floor_04,06': 'kitchen',
                'floor_04,07': 'kitchen',
                'floor_04,08': 'kitchen',
                'floor_04,09': 'kitchen',
                'floor_05,06': 'kitchen',
                'floor_05,07': 'kitchen'
                }
# One column per area: 1 if any floor device of that area fired in the slot.
# groupby(..., axis=1) is deprecated in recent pandas, so the columns are
# grouped on the transposed frame instead; the result is the same.
room_df = floor_df_resampled_in_sec.T.groupby(room_mapping).max().T
#room = pd.DataFrame(columns=["TIMESTAMP", "entrance", "living room", "bedroom", "kitchen"])


In [6]:
# merge the resampled sensor time slots (slot width = `seconds`) into one frame with a column for every sensor (= feature)
# add sensor data from all 4 sources


# https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range



# Pre-Processing of data

# normalisation
def normalise(df, column_names_to_not_normalize=("TIMESTAMP",)):
    """Min-max scale the columns of ``df`` to the range [0, 1].

    Every column is scaled independently with
    ``sklearn.preprocessing.MinMaxScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric feature columns. It is modified IN PLACE.
    column_names_to_not_normalize : iterable of str, optional
        Columns that are left untouched. Defaults to ``("TIMESTAMP",)``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the scaled columns.
    """
    column_names_to_normalize = [x for x in list(df) if x not in column_names_to_not_normalize]
    # MinMaxScaler rejects input without rows or without columns; there is
    # nothing to scale in that case.
    if not column_names_to_normalize or df.empty:
        return df

    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    x_scaled = min_max_scaler.fit_transform(df[column_names_to_normalize].values)
    # Rebuild a frame with the original index so the assignment aligns row by row.
    df[column_names_to_normalize] = pd.DataFrame(
        x_scaled, columns=column_names_to_normalize, index=df.index
    )
    return df


# Rescale the acceleration features to values between 0 and 1 (only the
# acceleration frame is normalised here).
acceleration_df_resampled_in_sec = normalise(acceleration_df_resampled_in_sec)


# TODO: add commonly used features for acceleration sensor data features: Mean, Variance, Skewness, and Kurtosis sklearn


# Union of all four sources on the time index. combine_first keeps the values
# already present and only fills gaps / adds columns from the next frame.
merged_sensor_data = proximity_df_resampled_in_sec
for next_source_frame in (
    acceleration_df_resampled_in_sec,
    sensors_df_resampled_in_sec,
    floor_df_resampled_in_sec,  # room_df is the per-area alternative to the raw floor columns
):
    merged_sensor_data = merged_sensor_data.combine_first(next_source_frame)



# Encode the label columns as numbers. Values missing from a map become NaN.
# NOTE: this overwrites the columns, so the cell cannot be re-run on its own.
segment_map = {"A": 0, "B": 1, "C": 2}
# "Act01" .. "Act24" -> 1 .. 24, "no activity" -> 0
activity_map = {"Act%02d" % activity_number: activity_number for activity_number in range(1, 25)}
activity_map["no activity"] = 0

activities_resampled_in_sec["SEGMENT"] = activities_resampled_in_sec["SEGMENT"].map(segment_map)
activities_resampled_in_sec["ACTIVITY"] = activities_resampled_in_sec["ACTIVITY"].map(activity_map)


# Bring the labels to the same slot width as the sensor data. The codes are
# categorical, so take the first label of each slot: a median could average two
# different codes into a label that does not exist when a slot spans a label
# change. The cast keeps the float dtype that median() used to produce.
activities_resampled_in_sec = activities_resampled_in_sec.resample(seconds).first().astype(float)
#activities_resampled_in_sec = activities_resampled_in_sec.reset_index()


# Attach the labels to the sensor slots; slots without a label row keep NaN.
samples_training = pd.merge(merged_sensor_data, activities_resampled_in_sec, on="TIMESTAMP", how='left')

# deal with slots that have no label / no sensor value:
# a missing activity means "no activity" (code 0)
samples_training["ACTIVITY"] = samples_training["ACTIVITY"].fillna(0)
# a missing segment takes the segment of the next labelled slot
# (assigned back instead of a chained inplace fillna, which recent pandas
# deprecates and which does not modify the frame under Copy-on-Write)
samples_training["SEGMENT"] = samples_training["SEGMENT"].bfill()
# missing acceleration values repeat the last valid observation (forward fill)
acceleration_columns = ["X", "Y", "Z"]
samples_training[acceleration_columns] = samples_training[acceleration_columns].ffill()
# everything still missing (proximity, sensors, floor, leading acceleration gaps) becomes 0
samples_training = samples_training.fillna(0.0)

# normalisation
def normalise_whole_df(df):
    """Min-max scale every column of ``df`` except TIMESTAMP and ACTIVITY to [0, 1].

    Same procedure as ``normalise`` but with ACTIVITY excluded as well.
    ``df`` is modified in place and also returned.
    """
    skipped_columns = ["TIMESTAMP", "ACTIVITY"]
    feature_columns = [name for name in df.columns if name not in skipped_columns]

    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(df[feature_columns].values)

    # Wrap the scaled array with the original index so the assignment aligns row by row.
    df[feature_columns] = pd.DataFrame(scaled_values, columns=feature_columns, index=df.index)
    return df


# Rescale all feature columns (including SEGMENT) to values between 0 and 1;
# normalise_whole_df leaves only TIMESTAMP and ACTIVITY untouched.
samples_training = normalise_whole_df(samples_training)

# Quick inspection: first rows and total number of samples.
print("Testing sample result is: ")
print(samples_training.head())
print(len(samples_training))

# Per activity code: number of non-null values in each column.
print(samples_training.groupby("ACTIVITY").count())












Testing sample result is: 
                     BATHROOM TAP  BED  BOOK  C01  C015  C02  C03  C04  C05  \
TIMESTAMP                                                                     
2017-11-09 12:02:10           0.0  0.0   0.0  0.0   0.0  0.0  0.0  0.0  0.0   
2017-11-09 12:02:15           0.0  0.0   0.0  0.0   0.0  0.0  0.0  0.0  0.0   
2017-11-09 12:02:20           0.0  0.0   0.0  0.0   0.0  0.0  0.0  0.0  0.0   
2017-11-09 12:02:25           0.0  0.0   0.0  0.0   0.0  0.0  0.0  0.0  0.0   
2017-11-09 12:02:30           0.0  1.0   0.0  0.0   0.0  0.0  0.0  0.0  0.0   

                     C07   ...     floor_04,04  floor_04,05  floor_04,06  \
TIMESTAMP                  ...                                             
2017-11-09 12:02:10  0.0   ...             0.0          0.0          0.0   
2017-11-09 12:02:15  0.0   ...             0.0          0.0          0.0   
2017-11-09 12:02:20  0.0   ...             0.0          0.0          0.0   
2017-11-09 12:02:25  0.0   ...         

In [7]:
# Replace the binary sensor codes used as column names by readable sensor names.


# sensor code -> human readable sensor name
sensors_map = {"M01": "Door Sensor", "TV0": "TV Sensor", "SM1": "Motion Sensor Kitchen", "SM3": "Motion Sensor Bathroom", 
               "SM4": "Motion Sensor Bedroom", "C01": "Medication Box Sensor", "SM5": "Motion Sensor Sofa", "D01": "Refrigerator Sensor", 
               "D02": "Microwave  Sensor", "D03": "Wardrobe Sensor", "D04": "Cupboard Cups Sensor", "D05": "Dishwasher Sensor", "D07": "Top WC Sensor", 
               "D08": "Closet Sensor", "D09": "Washing Machine  Sensor", "D10": "Pantry Sensor", "H01": "Kettle Sensor", "C02": "Fruit Platter Sensor", 
               "C03": "Cutlery Sensor", "C04": "Pots Sensor", "C05": "Water Bottle Sensor", "C07": "Remote XBOX Sensor", "C08": "Trash Sensor", 
               "C09": "Tap Sensor", "C10": "Tank Sensor", "C12": "Laundry Basket Sensor", "C13": "Pyjamas drawer Sensor", "C14": "Bed Sensor",
               "C015": "Contact Sensor Kitchen Faucet", "S09": "Sofa Pressure Sensor"}
# index=str additionally converts every index label to its string form.
samples_training = samples_training.rename(index=str, columns=sensors_map)


# Binary sensor features that are not used. A KeyError is raised if one of
# them is missing from the frame.
unused_sensor_columns = [
    "Contact Sensor Kitchen Faucet",
    "Remote XBOX Sensor",
    "Cutlery Sensor",
    "Laundry Basket Sensor",
]
samples_training = samples_training.drop(unused_sensor_columns, axis=1)

# Further drop candidates according to the threshold variance analysis of the
# training data (currently kept):
#   "TV Sensor", "Dishwasher Sensor", "Top WC Sensor", "Closet Sensor",
#   "Washing Machine  Sensor", "Kettle Sensor", "Fruit Platter Sensor",
#   "Pots Sensor", "Water Bottle Sensor", "Trash Sensor", "Tank Sensor",
#   "Pyjamas drawer Sensor", "Medication Box Sensor", "Cupboard Cups Sensor"

# drop errors from floor data
floor_error_columns = ["floor_01,0A", "floor_02,0A"]
samples_training = samples_training.drop(floor_error_columns, axis=1)








# Turn the TIMESTAMP index back into a regular column.
samples_training = samples_training.reset_index()

# Inspection output: first rows, column names and dtype summary
# (info() prints its report itself and returns None, which is printed as well).
print(samples_training.head())
print(samples_training.columns)
print(samples_training.info())

# Move the label to the end of the frame as an integer column named "activity";
# `y` keeps the original (float) label series.
y = samples_training.pop("ACTIVITY")
samples_training = samples_training.assign(activity=y.astype(int))
samples_training = samples_training.rename(index=str, columns={"SEGMENT": "segment"})

# store data into CSV
samples_training.to_csv("./src/samples-testing.csv", sep=';', encoding='utf-8', index=False)

             TIMESTAMP  BATHROOM TAP  BED  BOOK  Medication Box Sensor  \
0  2017-11-09 12:02:10           0.0  0.0   0.0                    0.0   
1  2017-11-09 12:02:15           0.0  0.0   0.0                    0.0   
2  2017-11-09 12:02:20           0.0  0.0   0.0                    0.0   
3  2017-11-09 12:02:25           0.0  0.0   0.0                    0.0   
4  2017-11-09 12:02:30           0.0  1.0   0.0                    0.0   

   Fruit Platter Sensor  Pots Sensor  Water Bottle Sensor  Trash Sensor  \
0                   0.0          0.0                  0.0           0.0   
1                   0.0          0.0                  0.0           0.0   
2                   0.0          0.0                  0.0           0.0   
3                   0.0          0.0                  0.0           0.0   
4                   0.0          0.0                  0.0           0.0   

   Tap Sensor   ...     floor_04,04  floor_04,05  floor_04,06  floor_04,07  \
0         0.0   ...       