# Import libraries

In [2]:
import pandas as pd
import numpy as np
import math
import os

# Current and parent directories

In [4]:
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)

# Read activity information

In [6]:
activities = pd.read_excel(str(parent_directory) + '/data/Aras-Information.xlsx', sheet_name='Activity-Info')
activities.head()

Unnamed: 0,Activity ID,Acitivity,Zone ID,Zone,Physical activity levels (M) (met),Scale of CO2 Generation for 21-30 age group (L/s),CO2 Emission by Occupant (L/s),CO2 Emission by Occupant (CFM),Heat Radiation by Occupant (kW),Heat Radiation by Occupant (W)
0,0,Other,0,Outside,0.0,-0.9961,0.0,0.0,0.0,0.0
1,1,Fill medication dispenser,3,Kitchen,2.8,0.0039,0.01092,0.023139,0.113268,113.267755
2,2,Hang up clothes,1,Bedroom,2.2,0.0039,0.00858,0.018181,0.088996,88.996093
3,3,Move the couch,2,Livingroom,4.0,0.0039,0.0156,0.033056,0.161811,161.811078
4,4,Sit on the couch,2,Livingroom,1.5,0.0039,0.00585,0.012396,0.060679,60.679154


# Accessing activity-zone mapping information

In [8]:
activity_zone_map = dict()
for i in range(len(activities)):
    activity_zone_map[int(activities["Activity ID"][i])] = int(activities["Zone ID"][i])

# Fixed parameters

In [10]:
NUM_DAYS = 16
NUM_ACTIVITIES = len(activities)

# Dataset cleaning function

In [12]:
def dataset_cleaning(input_directory, house_name, occupant_id, output_filename):
    
    records = []

    for day in range(1, NUM_DAYS + 1):    
        print("Cleaned ...", "House", house_name, "Day", day)
        # Importing dataset
        raw_data_filename = str(input_directory) + str(day) + '.txt'
        raw_dataframe =  pd.read_csv(str(raw_data_filename), header=None, sep = ' ').iloc[:, -2:]  

        # Establishing column names to make it easier to work with
        raw_dataframe.columns = ['Occ-1','Occ-2']

        # We will just work with the specified occupant in the function argument
        activities_occupant = raw_dataframe['Occ-' + str(occupant_id)].to_list()
        
        arrival_times_occupant = []
        zones_occupant = []
        zone_activities_occupant = []

        prev_zone_occupant = -1

        # Checking at each minute sampling time
        for second in range(0, len(activities_occupant) - 60, 60):
            minute = int(math.ceil(second / 60))
            current_zone_occupant = activity_zone_map[activities_occupant[second]]
            
            
            if prev_zone_occupant != current_zone_occupant:
                arrival_times_occupant.append(minute)
                zones_occupant.append(current_zone_occupant)
                zone_activities_occupant.append(activities_occupant[second])
                prev_zone_occupant = current_zone_occupant

            prev_activity_occupant = activities_occupant[second]

        exit_times_occupant = []

        for et in range(1, len(arrival_times_occupant)):
            exit_times_occupant.append(arrival_times_occupant[et] - 1)
        
        # The final exit time will be 1439 (last minute of the day since starting from 0 minutes)
        exit_times_occupant.append(1439)

        for et in range(len(arrival_times_occupant)):
            stay_duration = exit_times_occupant[et] - arrival_times_occupant[et] + 1
            if stay_duration == 0 and et != len(arrival_times_occupant) - 1:
                arrival_times_occupant[et + 1] -= 1
            else:
                records.append([day, zone_activities_occupant[et], zones_occupant[et], arrival_times_occupant[et], exit_times_occupant[et], stay_duration])      
    
    cleaned_dataframe = pd.DataFrame(columns = ['Day', 'Occupant\'s Activity', 'Occupant\'s Zone', 'Zone Arrival Time (Minute)', 'Zone Leaving Time (Minute)', 'Stay Duration (Minute)'], data = records)
    cleaned_dataframe.to_csv(output_filename, index=False)

# Creating cleaned dataframe for both houses and all occupants

In [14]:
for house_name in ['A']:
    for occupant_id in ['1', '2']:
        input_directory = str(parent_directory) + '\data\\raw\\house-' + house_name + '\\DAY-'
        output_filename = str(parent_directory) + '\data\\cleaned\\' + 'Cleaned-Dataframe_House-' + str(house_name) + '_Occupant-' + str(occupant_id) + '.csv'
        dataset_cleaning(input_directory, house_name, occupant_id, output_filename)

  input_directory = str(parent_directory) + '\data\\raw\\house-' + house_name + '\\DAY-'
  output_filename = str(parent_directory) + '\data\\cleaned\\' + 'Cleaned-Dataframe_House-' + str(house_name) + '_Occupant-' + str(occupant_id) + '.csv'


Cleaned ... House A Day 1
Cleaned ... House A Day 2
Cleaned ... House A Day 3
Cleaned ... House A Day 4
Cleaned ... House A Day 5
Cleaned ... House A Day 6
Cleaned ... House A Day 7
Cleaned ... House A Day 8
Cleaned ... House A Day 9
Cleaned ... House A Day 10
Cleaned ... House A Day 11
Cleaned ... House A Day 12
Cleaned ... House A Day 13
Cleaned ... House A Day 14
Cleaned ... House A Day 15
Cleaned ... House A Day 16
Cleaned ... House A Day 1
Cleaned ... House A Day 2
Cleaned ... House A Day 3
Cleaned ... House A Day 4
Cleaned ... House A Day 5
Cleaned ... House A Day 6
Cleaned ... House A Day 7
Cleaned ... House A Day 8
Cleaned ... House A Day 9
Cleaned ... House A Day 10
Cleaned ... House A Day 11
Cleaned ... House A Day 12
Cleaned ... House A Day 13
Cleaned ... House A Day 14
Cleaned ... House A Day 15
Cleaned ... House A Day 16
Cleaned ... House B Day 1


  input_directory = str(parent_directory) + '\data\\raw\\house-' + house_name + '\\DAY-'
  output_filename = str(parent_directory) + '\data\\cleaned\\' + 'Cleaned-Dataframe_House-' + str(house_name) + '_Occupant-' + str(occupant_id) + '.csv'


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\nurim\\Downloads\\casas-dataset\\data\\raw\\house-B\\DAY-1.txt'

# Cheacking stay durations are all non zero

In [None]:
for house_name in ['A']:
    for occupant_id in ['1', '2']:
        dataframe = pd.read_csv(str(parent_directory) + '\data\\cleaned\\' + 'Cleaned-Dataframe_House-' + str(house_name) + '_Occupant-' + str(occupant_id) + '.csv')
        # Check if all values in column 'Stay Duration (Minute)' are non-zero
        print("Checked House", house_name, "Occupant", occupant_id, "Dataframe... Is  it non zero?", (dataframe['Stay Duration (Minute)'] != 0).all())