In [1]:
# Import Libraries
import os
import glob
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

## Load Data

In [2]:
# Read the processed NOAA file for NYC and preview the first rows.
# The path is assembled with os.path.join so the same notebook resolves the file
# on Windows, Linux and macOS; hard-coded '\\' separators only work on Windows.
noaaCsvPath = os.path.join('..', '..', 'data', 'processed', 'weather', 'noaa',
                           'nycHistorical.csv')
# DATE is parsed into a DatetimeIndex; later cells group on index.year/.month/.hour.
df = pd.read_csv(noaaCsvPath, index_col='DATE', parse_dates=True)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,TempVals,TempCodes,DewVals,DewCodes,SkyCoverVals,SkyCoverCodes,WindSpeedVals,WindSpeedCodes,StationID,StationLocation
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2005-01-01 00:51:00,120,1,40,1,,99,82.0,1,72505394728,"NY CITY CENTRAL PARK, NY US"
2005-01-01 01:51:00,120,1,50,1,,99,,99,72505394728,"NY CITY CENTRAL PARK, NY US"
2005-01-01 02:51:00,120,1,50,1,,99,108.0,1,72505394728,"NY CITY CENTRAL PARK, NY US"
2005-01-01 03:51:00,120,1,60,1,,99,93.0,1,72505394728,"NY CITY CENTRAL PARK, NY US"
2005-01-01 04:51:00,110,1,60,1,,99,,99,72505394728,"NY CITY CENTRAL PARK, NY US"


In [3]:
# Scaling of temperature values is required as per the documentation:
# both value columns are divided by 10.
# NOTE: this cell rescales `df` in place, so re-running it without re-running
# the load cell divides the values by 10 again.
for col in ('TempVals', 'DewVals'):
    df[col] = df[col] / 10.0

# Remove noisy values: keep only rows where both scaled readings are below 999.
# Rows with NaN in either column are dropped as well (the comparison is False for NaN).
isValid = df['TempVals'].lt(999) & df['DewVals'].lt(999)
df = df.loc[isValid].copy()

# Print top rows of the processed data frame
df.head()

Unnamed: 0_level_0,TempVals,TempCodes,DewVals,DewCodes,SkyCoverVals,SkyCoverCodes,WindSpeedVals,WindSpeedCodes,StationID,StationLocation
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2005-01-01 00:51:00,12.0,1,4.0,1,,99,82.0,1,72505394728,"NY CITY CENTRAL PARK, NY US"
2005-01-01 01:51:00,12.0,1,5.0,1,,99,,99,72505394728,"NY CITY CENTRAL PARK, NY US"
2005-01-01 02:51:00,12.0,1,5.0,1,,99,108.0,1,72505394728,"NY CITY CENTRAL PARK, NY US"
2005-01-01 03:51:00,12.0,1,6.0,1,,99,93.0,1,72505394728,"NY CITY CENTRAL PARK, NY US"
2005-01-01 04:51:00,11.0,1,6.0,1,,99,,99,72505394728,"NY CITY CENTRAL PARK, NY US"


# Data Analytics

In [4]:
# Task: for each month, compute the median temperature (in degC) and the median
# dew point (in degC).
# Why?:
# As expected, the median temperature follows a bell curve over a calendar year.
# The information is helpful in putting each month in one of the following categories:
# Cold: <5degC; Cool: 5degC-10degC; Warm: 10degC-20degC; Hot: >20degC.
# Accordingly,
# Jan, Feb, and Dec are Cold months,
# Mar and Nov are Cool months,
# Apr, May, and Oct are Warm months, and
# Jun, Jul, Aug, Sep are the Hot months
dfValidVals = df[['TempVals', 'DewVals']]

# Step 1: one median per (year, month) pair.
yearMonthKeys = [dfValidVals.index.year, dfValidVals.index.month]
dfGrouped = dfValidVals.groupby(yearMonthKeys).median()
dfGrouped = dfGrouped.rename_axis(index=["Year", "Month"]).reset_index()

# Step 2: median of those per-year monthly medians, per calendar month.
dfGrouped[["Month", "TempVals", "DewVals"]].groupby("Month").median()

Unnamed: 0_level_0,TempVals,DewVals
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.35,-5.8
2,1.85,-4.2
3,5.0,-3.05
4,11.05,3.0
5,16.0,11.1
6,21.1,16.0
7,24.4,19.0
8,23.6,18.35
9,21.05,14.2
10,14.7,9.4


### Deeper Analysis
1. Time-of-day pricing from ConEdison further validates the categorisation above: in the hot months (June-September), the price during the day (8 AM till midnight) is at its highest (23.07 cents/kWh) [ref - https://www.coned.com/en/save-money/energy-saving-programs/time-of-use].

2. Why do we need this categorisation? - To recommend a set-point temperature for the thermostat, it is important that we are aware of the internal load. One important parameter for the internal load is the clothing level, which depends on the season. As of now, we are using the CBE Thermal Comfort Tool to compute the clothing level [ref - https://comfort.cbe.berkeley.edu/]. For any particular day, the tool uses the temperature at 6 AM to predict the dynamic clothing level. Based on the median temperature at 6 AM for each month, we used the tool to compute the clothing level for each season, which is as follows: 

        Cold (0.78 clo) > Cool (0.63 clo) > Warm (0.56 clo) > Hot (0.5 clo).


3. Another important parameter denoting the internal load is the metabolic activity, i.e. the heat generated by the occupants due to their activities within the home. We don't want to deploy sensors on the user or heavily instrument the home; an approximation is acceptable. Our hypothesis is that data from an occupancy sensor, when combined with metadata (such as the type of occupants), can provide a good estimate of human activity over the day. Once we know the intensity of human activity, we can estimate the internal load due to human activities. 
    
4. Since we don't have occupancy sensors installed in the homes, we are using the ASHRAE table to compute the metabolic activity depending on the type of occupants living in that apartment.

        Married Couple w kids (2.0 met) > Married Couple wo kids (1.7 met) > Single Bachelor (1.3 met) > Old Couple wo kids (1.0 met)
        

5. I do see a challenge even in estimating occupants' activity based on a single motion sensor. The questions are: How many occupancy sensors might we need? Where should we install those sensors? Can we use additional information from other sensors (such as a smart meter) to minimise the number of occupancy sensors and still accurately estimate the occupants' activity level? We must do a literature survey before attempting this estimation.  

In [None]:
# Task:
# For each month, compute the mean, median and standard deviation of the air
# temperature and the dew point temperature for every hour of the day.
# Why?:
# Our goal is to understand typical temperature variation across a day for different settings and
# then recommend set-point, lighting, and appliance settings to the user for every hour of the day.
# Therefore, here we are doing this computation. This data will be used for the web-tool
monthHourKeys = [dfValidVals.index.month, dfValidVals.index.hour]
dfHourlyVariations = (
    dfValidVals.groupby(monthHourKeys)
    .agg(['mean', 'median', 'std'])
    .rename_axis(index=["Month", "Hour"])
)

# Flatten the (variable, statistic) column pairs into names such as 'TempVals_mean'.
dfHourlyVariations.columns = [
    "{}_{}".format(variable, statistic)
    for variable, statistic in dfHourlyVariations.columns
]

# Month and Hour become ordinary columns.
dfHV = dfHourlyVariations.reset_index()

In [None]:
# Task:
# Compute interval bounds for air temperature and dew point temperature
# Why?:
# We need this information for data plotting purposes on the web tool
# NOTE: the bounds are mean -/+ 1.96 * std of the observations, i.e. a band
# describing the spread of the readings, not a confidence interval of the mean
# (that would use std / sqrt(n)).
zScore = 1.96
for col in ('TempVals', 'DewVals'):
    halfWidth = zScore * dfHV[col + '_std']
    dfHV[col + '_cileft'] = dfHV[col + '_mean'] - halfWidth
    dfHV[col + '_ciright'] = dfHV[col + '_mean'] + halfWidth

dfHV.head()

In [None]:
# Save hourly temperature, dew-point, and the interval data to a CSV.
# os.path.join keeps the output location portable; the hard-coded '\\'
# separators only resolve to the intended folder on Windows.
monthlyHoDCsvPath = os.path.join('..', '..', 'data', 'processed', 'weather', 'noaa',
                                 'nycMonthlyHoD.csv')
# index=False: Month and Hour are already regular columns of dfHV.
dfHV.to_csv(monthlyHoDCsvPath, index=False)

In [None]:
# Task:
# For every month, compute median air temperature for each hour
# Why?:
# Generic data exploration
dfValidTemp = df['TempVals']
hourMonthKeys = [dfValidTemp.index.hour, dfValidTemp.index.month]
# Rows are hours of the day; unstack() moves Month (the last index level) into columns.
dfHourlyTemp = (
    dfValidTemp.groupby(hourMonthKeys)
    .median()
    .rename_axis(index=["Hour", "Month"])
    .unstack()
)
dfHourlyTemp.head()

In [None]:
# Task:
# For each season, compute median air temperature for each hour
# Why?:
# Generic data exploration
# Season -> calendar months; the order here fixes the column order of the result.
seasonMonths = [
    ('Cold', [1, 2, 12]),
    ('Cool', [3, 11]),
    ('Warm', [4, 5, 10]),
    ('Hot', [6, 7, 8, 9]),
]
# For every hour, take the median across the monthly columns belonging to each season.
dfHourlyTempVal = pd.concat(
    [dfHourlyTemp[months].median(axis=1) for _, months in seasonMonths],
    axis=1,
)
dfHourlyTempVal.columns = [season for season, _ in seasonMonths]
dfHourlyTempVal.head()

In [None]:
# Saving data for hourly median temperature for each season.
# os.path.join keeps the output location portable; the hard-coded '\\'
# separators only resolve to the intended folder on Windows.
hourlyTempCsvPath = os.path.join('..', '..', 'data', 'processed', 'weather', 'noaa',
                                 'nycHourlyTemp.csv')
# The index is written too (to_csv default), so the hour stays in the file.
dfHourlyTempVal.to_csv(hourlyTempCsvPath)

In [None]:
# Task:
# For every month, compute median dew-point temperature for each hour
# Why?:
# Generic data exploration
dfValidDew = df['DewVals']
hourMonthKeys = [dfValidDew.index.hour, dfValidDew.index.month]
# Rows are hours of the day; unstack() moves Month (the last index level) into columns.
dfHourlyDew = (
    dfValidDew.groupby(hourMonthKeys)
    .median()
    .rename_axis(index=["Hour", "Month"])
    .unstack()
)
dfHourlyDew.head()

In [None]:
# Task:
# For each season, compute median dew-point temperature for each hour
# Why?:
# Generic data exploration
# Season -> calendar months; the order here fixes the column order of the result.
seasonMonths = [
    ('Cold', [1, 2, 12]),
    ('Cool', [3, 11]),
    ('Warm', [4, 5, 10]),
    ('Hot', [6, 7, 8, 9]),
]
# For every hour, take the median across the monthly columns belonging to each season.
dfHourlyDewVal = pd.concat(
    [dfHourlyDew[months].median(axis=1) for _, months in seasonMonths],
    axis=1,
)
dfHourlyDewVal.columns = [season for season, _ in seasonMonths]
dfHourlyDewVal.head()

In [None]:
# Saving data for hourly median dew-point temperature for each season.
# os.path.join keeps the output location portable; the hard-coded '\\'
# separators only resolve to the intended folder on Windows.
hourlyDewCsvPath = os.path.join('..', '..', 'data', 'processed', 'weather', 'noaa',
                                'nycHourlyDew.csv')
# The index is written too (to_csv default), so the hour stays in the file.
dfHourlyDewVal.to_csv(hourlyDewCsvPath)