# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import math

# Read Activity Information

In [2]:
# Load the 'Activity-Info' sheet of the ARAS information workbook.
# Later cells read its "Activity ID" and "Zone ID" columns.
activities = pd.read_excel('Aras-Information.xlsx', sheet_name='Activity-Info')

# Accessing Activity-Zone Mapping Information

In [3]:
# Map each activity ID to the zone ID listed on the same row of the activity
# sheet. The two columns are walked positionally with zip() rather than looked
# up by index label, so the mapping does not rely on `activities` carrying a
# default 0..n-1 index. Both values are coerced to plain ints so they can be
# matched against integer activity codes.
activity_zone_map = {
    int(activity_id): int(zone_id)
    for activity_id, zone_id in zip(activities["Activity ID"], activities["Zone ID"])
}

# Constants

In [4]:
# Number of per-day raw files (DAY-1 .. DAY-NUM_DAYS) processed for each house.
NUM_DAYS = 30
# Number of rows in the `activities` table.
NUM_ACTIVITIES = len(activities)

# Dataset Cleaning Function

In [5]:
# Every 60th row of a raw file is sampled and treated as one minute.
# Assumes the raw files hold one row per second -- TODO confirm.
_ROWS_PER_MINUTE = 60

# Leaving time given to the final visit of every day (minute index 1439).
_LAST_MINUTE_OF_DAY = 1439

# Names given to the last two columns of a raw file, keyed by occupant number.
_OCCUPANT_COLUMNS = {1: 'Occ-1', 2: 'Occ-2'}

# Column headers of the cleaned CSV, in record order.
_CLEANED_COLUMNS = [
    'Day',
    "Occupant's Activity",
    "Occupant's Zone",
    'Zone Arrival Time (Minute)',
    'Zone Leaving Time (Minute)',
    'Stay Duration (Minute)',
]


def _extract_zone_visits(per_row_activities, zone_map):
    """Collapse one day's activity codes into a list of zone visits.

    Args:
        per_row_activities: Sequence of activity codes, one per raw-file row.
        zone_map: Mapping from activity code to zone ID.

    Returns:
        A list of ``[activity, zone, arrival_minute, leaving_minute,
        stay_minutes]`` lists in chronological order, where ``activity`` is
        the code sampled at the minute the zone was entered.

    Raises:
        KeyError: If a sampled activity code is missing from ``zone_map``.
    """
    arrivals = []
    zones = []
    entry_activities = []
    previous_zone = None  # None (not -1) so no real zone ID can collide with it

    # NOTE(review): the stop of ``len - 60`` means the row at index
    # ``len - 60`` is never sampled; for an 86400-row file the last sampled
    # minute is 1438, not 1439. Kept as-is to preserve the existing output.
    sample_stop = len(per_row_activities) - _ROWS_PER_MINUTE
    for row in range(0, sample_stop, _ROWS_PER_MINUTE):
        activity = per_row_activities[row]
        zone = zone_map[activity]
        if zone != previous_zone:
            arrivals.append(row // _ROWS_PER_MINUTE)
            zones.append(zone)
            entry_activities.append(activity)
            previous_zone = zone

    # A visit ends one minute before the next one starts; the final visit is
    # closed at the fixed end-of-day minute.
    leavings = [next_arrival - 1 for next_arrival in arrivals[1:]]
    leavings.append(_LAST_MINUTE_OF_DAY)

    visits = []
    last_index = len(arrivals) - 1
    absorb_previous_minute = False
    for index, (activity, zone, arrival, leaving) in enumerate(
        zip(entry_activities, zones, arrivals, leavings)
    ):
        if absorb_previous_minute:
            # The preceding visit was dropped; its minute is attributed to
            # this visit by moving the arrival one minute earlier.
            arrival -= 1
            absorb_previous_minute = False

        stay = leaving - arrival
        if stay == 0 and index != last_index:
            # Seen at a single sampled minute only: drop the visit and hand
            # its minute to the following one.
            absorb_previous_minute = True
            continue

        visits.append([activity, zone, arrival, leaving, stay])

    return visits


def dataset_cleaning(house, occupant, filename_cleaned_dataset, *,
                     num_days=None, zone_map=None, raw_dir='raw'):
    """Build and save the zone-visit table of one occupant of one house.

    For each day, ``<raw_dir>/house-<house>/DAY-<day>.txt`` is read as a
    space-separated file without header. Its last two columns are taken as
    the activity codes of occupant 1 and occupant 2. The selected occupant's
    codes are sampled every 60 rows, mapped to zones, and condensed into one
    record per zone visit. All records are written to
    ``filename_cleaned_dataset`` as CSV (no index column).

    Args:
        house: House identifier used in the raw directory name (e.g. 'A').
        occupant: 1 or 2, selecting which occupant column to process.
        filename_cleaned_dataset: Destination path of the cleaned CSV.
        num_days: Number of days to process, starting at day 1. Defaults to
            the module-level ``NUM_DAYS``.
        zone_map: Mapping from activity code to zone ID. Defaults to the
            module-level ``activity_zone_map``.
        raw_dir: Directory containing the ``house-<house>`` folders.

    Raises:
        ValueError: If ``occupant`` is neither 1 nor 2.
        KeyError: If a sampled activity code is missing from ``zone_map``.
    """
    # Validate before touching any file; previously an unknown occupant only
    # failed later with an unbound-variable error.
    try:
        occupant_column = _OCCUPANT_COLUMNS[occupant]
    except (KeyError, TypeError):
        raise ValueError(
            f'occupant must be 1 or 2, got {occupant!r}'
        ) from None

    if num_days is None:
        num_days = NUM_DAYS
    if zone_map is None:
        zone_map = activity_zone_map

    records = []
    for day in range(1, num_days + 1):
        raw_filename = f'{raw_dir}/house-{house}/DAY-{day}.txt'
        # Only the last two columns (the occupants' activity codes) are kept.
        raw_dataframe = pd.read_csv(raw_filename, header=None, sep=' ').iloc[:, -2:]
        raw_dataframe.columns = list(_OCCUPANT_COLUMNS.values())

        day_activities = raw_dataframe[occupant_column].to_list()
        records.extend(
            [day, *visit]
            for visit in _extract_zone_visits(day_activities, zone_map)
        )

    cleaned_dataset = pd.DataFrame(records, columns=_CLEANED_COLUMNS)
    cleaned_dataset.to_csv(filename_cleaned_dataset, index=False)

# Saving Cleaned Datasets

In [6]:
# One cleaning job per (house, occupant) pair, with the CSV each is written to.
cleaning_jobs = [
    ('A', 1, 'cleaned/Cleaned-Dataframe_House-A_Occupant-1.csv'),
    ('A', 2, 'cleaned/Cleaned-Dataframe_House-A_Occupant-2.csv'),
    ('B', 1, 'cleaned/Cleaned-Dataframe_House-B_Occupant-1.csv'),
    ('B', 2, 'cleaned/Cleaned-Dataframe_House-B_Occupant-2.csv'),
]

# Run the jobs in the listed order.
for job_house, job_occupant, job_output in cleaning_jobs:
    dataset_cleaning(job_house, job_occupant, job_output)