### Performance Metadata
* Runtime: 38 mins
* Computer specs: 32GB RAM and Intel Core i7-12700H 2.30 GHz Processor

In [1]:
from msilib import Directory
import os
import datetime
import pandas as pd
from decimal import Decimal

# Notebook-wide pandas display settings, applied through the attribute API:
# show floats with 10 decimals, let frames use the full output width, and
# allow up to 150 rows before pandas truncates the printed frame.
pd.options.display.precision = 10
pd.options.display.width = None
pd.options.display.max_rows = 150

### Function
* Convert Unix timestamp to UTC datetime

In [2]:
def convert_unix_to_utc_datetime(unix_time):
    """Convert a Unix timestamp to a naive ``datetime`` expressed in UTC.

    Parameters
    ----------
    unix_time : str | int | float
        Seconds since the Unix epoch. Anything accepted by ``float()`` works,
        which is what lets this be used as a ``pd.read_csv`` converter (the
        parser hands the raw cell text to the converter).

    Returns
    -------
    datetime.datetime
        Timezone-naive datetime whose fields are the UTC wall-clock time.

    Raises
    ------
    ValueError
        If ``unix_time`` cannot be parsed by ``float()``.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an
    # aware UTC datetime instead and strip tzinfo so the returned value stays
    # naive, exactly as before.
    aware_utc = datetime.datetime.fromtimestamp(float(unix_time), tz=datetime.timezone.utc)
    return aware_utc.replace(tzinfo=None)

### Function
* Receives dataframe (i.e. channel_x, mains)
1. Create column to store elapsed time in between each reading
2. Handle null values: set elapsed time to 6 seconds
3. Convert elapsed time to an integer number of seconds
4. Convert W/x_secs to Wh and add column to store
* Return modified dataframe

In [3]:
def Wh(df, default_elapsed_seconds=6):
    """Add elapsed-time and energy (Wh) columns to a frame of power readings.

    The frame is modified in place and also returned, so both
    ``Wh(df)`` and ``df = Wh(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column ``'DateTime'`` and a numeric column
        ``'Watts'``. Rows are assumed to be in chronological order, since the
        elapsed time is the difference to the previous row.
    default_elapsed_seconds : int or float, default 6
        Duration assigned to the first row, which has no previous reading to
        diff against.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with three added columns:
        ``'Elapsed Time'`` (whole seconds since the previous reading),
        ``'sec/hour'`` (that duration as a fraction of an hour) and
        ``'Wh'`` (``Watts`` multiplied by that fraction).
    """
    # How long each reading represents (6-8 secs). The first row has no
    # predecessor, so diff() yields NaT there; substitute the default.
    # The filled Series is assigned back rather than calling
    # fillna(inplace=True) on df['Elapsed Time']: that chained in-place form
    # has no effect under pandas Copy-on-Write and would leave the NaT in
    # place, which then breaks the integer cast below.
    elapsed = df['DateTime'].diff().fillna(pd.Timedelta(seconds=default_elapsed_seconds))

    # Whole seconds (fractions are truncated by the int cast).
    df['Elapsed Time'] = elapsed.dt.total_seconds().astype(int)

    # Watts * hours = Wh
    df['sec/hour'] = df['Elapsed Time'] / 3600
    df['Wh'] = df['Watts'] * df['sec/hour']

    return df

### Function
* Receives input_file (a UK-DALE .dat file) and output_file (where the processed CSV is saved)
1. Read file into pandas dataframe
2. Select timeframe to keep data
    * Some devices were removed or added at different time points, so a timeframe is chosen where there are no changes
3. Call Wh function
4. Change to 15 min readings: sum readings within every 15 min timeframe
5. Export dataframe as csv file
* Remove any 'print' when running, this is just to display in GitHub Repo
* When running also uncomment last 2 lines: 'file_path = output_file' & 'df.to_csv(file_path, index=True)'

In [4]:
def clean_data(input_file, output_file, verbose=True, save=False,
               start_date='2013-04-12', end_date='2015-01-05'):
    """Load one space-delimited readings file and aggregate it to 15-minute bins.

    Steps: read the first two columns (Unix timestamp, Watts), keep only rows
    inside ``[start_date, end_date]`` (both ends inclusive), add the energy
    columns via ``Wh``, then sum every column over 15-minute windows.

    Parameters
    ----------
    input_file : str
        Path of the file to read (no header row, columns separated by a
        single space).
    output_file : str
        Path of the CSV to write. Only used when ``save`` is True.
    verbose : bool, default True
        Print a preview of the frame after each stage. The default keeps the
        preview output this function has always produced.
    save : bool, default False
        Write the aggregated frame to ``output_file`` (creating its parent
        directory if needed). The default keeps the preview-only behaviour;
        pass ``save=True`` for a real run instead of editing the function.
    start_date, end_date : str or pandas.Timestamp
        Bounds of the window of readings to keep.

    Returns
    -------
    pandas.DataFrame
        The 15-minute aggregated frame, indexed by ``'DateTime'``.
    """
    # Import file
    column_names = ["DateTime", "Watts"]
    # NOTE(review): the converter is a Python call per row, which is probably
    # a large share of the runtime on big files; a vectorised
    # pd.to_datetime(..., unit='s') after loading may be faster. Confirm it
    # gives identical timestamps before switching.
    df = pd.read_csv(
        input_file,
        delimiter=' ',
        usecols=[0, 1],
        converters={0: convert_unix_to_utc_datetime},
        header=None,
        names=column_names,
    )

    if verbose:
        print('INITIAL INGESTION')
        print(df.head())
        print('\n'*3)

    # Select timeframe
    window_start = pd.Timestamp(start_date)
    window_end = pd.Timestamp(end_date)
    in_window = (df['DateTime'] >= window_start) & (df['DateTime'] <= window_end)
    # Wh() adds columns in place; copy so it works on an independent frame
    # rather than on a slice of the unfiltered data (avoids
    # SettingWithCopyWarning).
    df = df.loc[in_window].copy()

    # Watt to Wh
    df = Wh(df)

    if verbose:
        print('CONVERT TO Wh')
        print(df.head(150))
        print('\n'*3)

    # Change to 15 min readings. '15min' is the supported spelling of the
    # deprecated '15T' alias and selects the same bins.
    df = df.set_index('DateTime').resample('15min').sum()

    if verbose:
        print('CHANGE TO 15 min READINGS')
        print(df.head())
        print('\n'*3)

    # Export file
    if save:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_file, index=True)

    return df

### Cycle through files in UK-DALE DB
* Only go through .dat files (disregards README.txt)
* Does not go through 'channel_x_button_press' files
* Does not go through 'labels' file
* Remove 'break' when running, this is just to display in GitHub Repo

In [5]:
# Change directories below
input_directory = "../UK-DALE DB/house_1"
output_directory = "../UK-DALE CLEAN/H1"

# True: stop after the first file that is actually processed (keeps the
# notebook output short for display). Set to False to process every file.
PREVIEW_FIRST_FILE_ONLY = True

# sorted() makes the processing order deterministic; os.listdir() returns
# entries in arbitrary order.
for filename in sorted(os.listdir(input_directory)):

    # Only .dat data files; skip button-press logs and the labels file.
    if not filename.endswith(".dat"):
        continue
    if ('button_press' in filename) or (filename == 'labels.dat'):
        continue

    # INPUT FILENAME
    input_file = os.path.join(input_directory, filename)

    # OUTPUT FILENAME: change .dat to .csv (i.e. 'channel_3.dat' to 'channel_3.csv')
    file_root, _ = os.path.splitext(filename)
    output_file = os.path.join(output_directory, file_root + '.csv')

    clean_data(input_file, output_file)

    # Break only after a file has really been processed. A break at loop
    # level would stop after the first directory entry even if that entry
    # was skipped, leaving nothing processed.
    if PREVIEW_FIRST_FILE_ONLY:
        break

INITIAL INGESTION
             DateTime  Watts
0 2012-11-09 22:28:15    599
1 2012-11-09 22:28:21    582
2 2012-11-09 22:28:27    600
3 2012-11-09 22:28:33    586
4 2012-11-09 22:28:40    596




CONVERT TO Wh
                   DateTime  Watts  Elapsed Time      sec/hour            Wh
1467642 2013-04-12 00:00:03    167             6  0.0016666667  0.2783333333
1467643 2013-04-12 00:00:09    167             6  0.0016666667  0.2783333333
1467644 2013-04-12 00:00:15    166             6  0.0016666667  0.2766666667
1467645 2013-04-12 00:00:21    168             6  0.0016666667  0.2800000000
1467646 2013-04-12 00:00:28    166             7  0.0019444444  0.3227777778
1467647 2013-04-12 00:00:34    166             6  0.0016666667  0.2766666667
1467648 2013-04-12 00:00:40    166             6  0.0016666667  0.2766666667
1467649 2013-04-12 00:00:46    166             6  0.0016666667  0.2766666667
1467650 2013-04-12 00:00:52    166             6  0.0016666667  0.2766666667
1467651 2013-04-12 0