# DYD Thermostat Data 

## Preprocess

1. Generated CSV file from queries in BigQuery

2. Data separated into states

3. Aggregated the data in Pandas by month

4. Combine the yearly files (2017–2021)

5. Group by Identifier



In [1]:
# Dependencies
import pandas as pd
import os
import numpy as np
from pathlib import Path
from datetime import datetime

---
## January

### 2017 January Day

In [2]:
# Load the "2017-jan-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
jan_2017 = pd.read_csv("../data_large/NY-day/2017-jan-day-NY.csv")

# Uncomment to preview the loaded frame:
# jan_2017

In [3]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
jan_2017.drop(
    index=jan_2017.index[
        jan_2017[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | jan_2017[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

jan_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,f2d6f5bf42ad3231b4afa0f0f4e66cb0479fdd80,2017-01-08 14:40:00 UTC,heat,auto,688,820,640,NY,brooklyn,110,False,False,False,Gas
1,6fe88e4afccba0e89b2e4620233243177633b15b,2017-01-13 19:40:00 UTC,heat,hold,734,730,730,NY,OZONE PARK,95,False,False,False,Gas
2,98a4ceee9104cf34fc022aa9cfb3b11994dc5acd,2017-01-08 07:20:00 UTC,heat,auto,647,650,650,NY,Montauk,16,False,False,False,Gas
3,aacc1a51eff7e0330fcc646d755cf087efe89ada,2017-01-10 16:55:00 UTC,heat,auto,698,690,690,NY,Southeast,10,False,False,False,Gas
4,809312e9e6805aff2e44b25238c0abfe8b895977,2017-01-26 19:50:00 UTC,heat,hold,648,650,647,NY,Valley Stream,30,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474521,f9a949bb81e7ccb8053533eb589766ae5180766d,2017-01-05 12:05:00 UTC,heat,hold,639,650,650,NY,Irvington,35,False,False,False,Gas
474522,f9a949bb81e7ccb8053533eb589766ae5180766d,2017-01-01 13:30:00 UTC,heat,hold,665,670,670,NY,Irvington,35,False,False,False,Gas
474523,f9a949bb81e7ccb8053533eb589766ae5180766d,2017-01-28 13:20:00 UTC,heat,hold,661,660,660,NY,Irvington,35,False,False,False,Gas
474524,f9a949bb81e7ccb8053533eb589766ae5180766d,2017-01-24 14:05:00 UTC,heat,hold,662,660,660,NY,Irvington,35,False,False,False,Gas


In [4]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
jan_2017 = jan_2017.assign(Year="2017", Month="Jan")

In [5]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
jan_2017 = jan_2017.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [6]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
jan_2017_ave = jan_2017.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

jan_2017_ave

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
Identifier,Month,Year,HvacMode,CalendarEvent,City,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00996606dc340bb60b08b0704486388082dc5b1f,Jan,2017,heat,auto,Buffalo,672.236801,676.035461,675.707250,110.0,False,False,False
00996606dc340bb60b08b0704486388082dc5b1f,Jan,2017,heat,hold,Buffalo,701.757576,707.984848,707.984848,110.0,False,False,False
01451ba531cf9fec46a95a9c67269b0907a6a11b,Jan,2017,heat,auto,Irvington,696.141414,757.813131,698.924242,35.0,False,False,False
01451ba531cf9fec46a95a9c67269b0907a6a11b,Jan,2017,heat,hold,Irvington,705.951049,708.146853,707.363636,35.0,False,False,False
01abbe67487f0559d85464ca7c072ccd0fdd6a54,Jan,2017,heat,auto,New Hartford,690.733333,780.000000,710.000000,55.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
fe215c4eaf2b530973c3e77ba881e61d8661f97c,Jan,2017,heat,hold,Ripley,663.380952,683.444444,683.444444,120.0,True,False,False
fe395721c98e578a1fd308c3a313fd96b36ee10c,Jan,2017,auto,auto,Staten Island,706.494624,768.064516,665.806452,10.0,False,False,False
fe395721c98e578a1fd308c3a313fd96b36ee10c,Jan,2017,auto,hold,Staten Island,704.710145,730.130435,680.130435,10.0,False,False,False
fe395721c98e578a1fd308c3a313fd96b36ee10c,Jan,2017,heat,auto,Staten Island,709.908350,707.340122,669.197556,10.0,False,False,False


In [7]:
# Export CSV file.
# index=True writes the group keys (the index of jan_2017_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/jan/" must already exist.

jan_2017_ave.to_csv("data/day/NY/jan/jan_2017_ave.csv", header=True, index=True)

### 2018 January Day

In [8]:
# Load the "2018-jan-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
jan_2018 = pd.read_csv("../data_large/NY-day/2018-jan-day-NY.csv")

# Uncomment to preview the loaded frame:
# jan_2018

In [9]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
jan_2018.drop(
    index=jan_2018.index[
        jan_2018[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | jan_2018[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

jan_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,4f8d4b18797f0188195d346d1a253ed89b34e744,2018-01-04 19:35:00 UTC,heat,hold,779,780,780,NY,Brooklyn,95,False,False,False,Gas
1,18b6a0ec3bf47c6ed08336c43b71d125c0c3fb4b,2018-01-25 18:05:00 UTC,heat,hold,702,697,697,NY,Canandaigua,17,False,False,False,Gas
2,1511d6a176bdaeef6b4d3a89c13cab355ad2a08f,2018-01-12 12:05:00 UTC,auxHeatOnly,hold,652,653,653,NY,Yorktown Heights,67,False,False,True,Electric
4,bece9ddd3432fa9044bf13d5ab743e368eb4782f,2018-01-06 15:55:00 UTC,heat,hold,699,705,705,NY,New York,10,False,False,True,Electric
6,d9fca5e8fd5a51d6827b6fefb2236cad12fbedab,2018-01-18 15:10:00 UTC,heat,hold,742,743,743,NY,New York,10,False,False,True,Electric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1167731,3480c3fec8cdc0814006da09280d2938eccee8f2,2018-01-05 12:40:00 UTC,heat,hold,757,760,760,NY,OZONE PARK,100,False,False,False,Gas
1167732,4960ffb8ec94ed4dd996fb62068da16e2ca81c70,2018-01-04 17:30:00 UTC,heat,hold,758,760,760,NY,East Fishkill,15,False,False,False,Gas
1167733,4e677e725d6937adbcabc8eae12f65cc0f943578,2018-01-31 12:35:00 UTC,heat,auto,764,760,760,NY,Clifton Park,30,True,False,False,Gas
1167734,1b163b0c4215d5321cdd20f25267d8a046210965,2018-01-02 14:10:00 UTC,heat,auto,574,760,760,NY,Queens,0,False,False,False,Gas


In [10]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
jan_2018 = jan_2018.assign(Year="2018", Month="Jan")


In [11]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
jan_2018 = jan_2018.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [12]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
jan_2018_ave = jan_2018.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jan_2018_ave

In [13]:
# Export CSV file.
# index=True writes the group keys (the index of jan_2018_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/jan/" must already exist.

jan_2018_ave.to_csv("data/day/NY/jan/jan_2018_ave.csv", header=True, index=True)

### 2019 January Day

In [14]:
# Load the "2019-jan-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
jan_2019 = pd.read_csv("../data_large/NY-day/2019-jan-day-NY.csv")

# Uncomment to preview the loaded frame:
# jan_2019

In [15]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
jan_2019.drop(
    index=jan_2019.index[
        jan_2019[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | jan_2019[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

jan_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,937ef1884f1e5c7eae3e7034a40c37520c8ff548,2019-01-08 15:35:00 UTC,heat,hold,704,708,708,NY,New York,0,False,False,False,Gas
1,b3d16b3937388b7d9720ad8ac07f0e069ff97970,2019-01-09 12:45:00 UTC,heat,auto,672,673,670,NY,Orchard Park,40,False,False,False,Gas
3,761f3ad0c640cdc06453908cbec5cce6620104ba,2019-01-22 12:20:00 UTC,heat,hold,717,725,725,NY,Rye,9,True,False,False,Gas
4,e8d1742bb9fbd1e70123b0138100ec00c0c1a015,2019-01-28 16:00:00 UTC,heat,hold,683,689,689,NY,Syracuse,98,True,False,False,Gas
5,d26f144aa05b44f73c1122b4aec5bc616df6c3b4,2019-01-02 19:55:00 UTC,heat,hold,720,725,725,NY,Hicksville,70,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1712166,afa1ae0ef1348f1bc51e51353eff38e9b632e2cb,2019-01-27 14:00:00 UTC,auto,hold,717,760,720,NY,Armonk,25,False,False,False,Gas
1712167,a18b0cf00d643dca5627523ee35429a2a222489c,2019-01-14 17:00:00 UTC,heat,hold,679,760,760,NY,Port Jefferson,0,True,False,False,Gas
1712168,662c5e4a6ced6a4448590ea79d9b6e4741416b65,2019-01-28 13:10:00 UTC,heat,auto,722,760,720,NY,Montauk,40,False,False,False,Gas
1712169,a6053132e8c6f402412e2a4dccb10dca2c34e9c8,2019-01-29 15:05:00 UTC,heat,auto,757,760,760,NY,Oceanside,0,False,False,False,Gas


In [16]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
jan_2019 = jan_2019.assign(Year="2019", Month="Jan")


In [17]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
jan_2019 = jan_2019.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [18]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
jan_2019_ave = jan_2019.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jan_2019_ave

In [19]:
# Export CSV file.
# index=True writes the group keys (the index of jan_2019_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/jan/" must already exist.

jan_2019_ave.to_csv("data/day/NY/jan/jan_2019_ave.csv", header=True, index=True)

### 2020 January Day

In [20]:
# Load the "2020-jan-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
jan_2020 = pd.read_csv("../data_large/NY-day/2020-jan-day-NY.csv")

# Uncomment to preview the loaded frame:
# jan_2020

In [21]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
jan_2020.drop(
    index=jan_2020.index[
        jan_2020[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | jan_2020[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

jan_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,18b6a0ec3bf47c6ed08336c43b71d125c0c3fb4b,2020-01-23 16:10:00 UTC,heat,hold,716,717,717,NY,Canandaigua,17,False,False,False,Gas
1,378ce7c187c2056313b45dd68fc1652a7f6adeb7,2020-01-05 19:10:00 UTC,heat,hold,674,685,685,NY,Buffalo,40,False,False,False,Gas
2,96d79c7d0c202772d63aa45845feb732911481a9,2020-01-23 18:35:00 UTC,heat,auto,657,662,662,NY,Harlem,5,False,False,False,Gas
4,378ce7c187c2056313b45dd68fc1652a7f6adeb7,2020-01-14 19:55:00 UTC,heat,hold,709,685,685,NY,Buffalo,40,False,False,False,Gas
6,fdc1e969ebe3060c96e0df20804dfa3d37c127d3,2020-01-17 13:30:00 UTC,heat,auto,673,735,641,NY,Brewster,5,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1775172,bb4f937ab5558992b4fe1cff519adfd2dcfaab0d,2020-01-31 13:00:00 UTC,heat,auto,762,760,760,NY,New York,0,False,False,False,Gas
1775173,ecc56bc0099290b2259481e4732fad35662aba03,2020-01-23 12:00:00 UTC,heat,hold,754,760,760,NY,New York,0,True,False,False,Gas
1775174,bb4f937ab5558992b4fe1cff519adfd2dcfaab0d,2020-01-30 14:55:00 UTC,heat,auto,755,760,760,NY,New York,0,False,False,False,Gas
1775175,6a6355128a6081d0a047847d45cefa79b93efa9a,2020-01-15 11:50:00 UTC,heat,auto,753,760,760,NY,Orangeburg,50,False,False,False,Gas


In [22]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
jan_2020 = jan_2020.assign(Year="2020", Month="Jan")


In [23]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
jan_2020 = jan_2020.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [24]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
jan_2020_ave = jan_2020.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jan_2020_ave

In [25]:
# Export CSV file.
# index=True writes the group keys (the index of jan_2020_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/jan/" must already exist.

jan_2020_ave.to_csv("data/day/NY/jan/jan_2020_ave.csv", header=True, index=True)

### 2021 January Day

In [26]:
# Load the "2021-jan-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
jan_2021 = pd.read_csv("../data_large/NY-day/2021-jan-day-NY.csv")

# Uncomment to preview the loaded frame:
# jan_2021

In [27]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
jan_2021.drop(
    index=jan_2021.index[
        jan_2021[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | jan_2021[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

jan_2021

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,28af4185c9f7a9f0973b8f3dcab239968019c7e0,2021-01-10 11:30:00 UTC,heat,hold,710,716,716,NY,Buffalo,120,False,False,False,Gas
1,3ff6fb3564432eba06e3fead30ffe1354a919a11,2021-01-11 19:40:00 UTC,heat,hold,785,761,770,NY,Arverne,0,False,False,False,Gas
2,62f9f026a92adbd52fbd6f1f865df77637bbc418,2021-01-01 08:15:00 UTC,cool,hold,698,679,679,NY,East Hills,0,False,False,False,Gas
3,fd0fbd1db1b34f2b16145a0d4b7b30e5159f7bdd,2021-01-15 10:45:00 UTC,heat,hold,714,800,700,NY,New York,0,False,False,False,Gas
4,4cbd6fd57b1eb4e3fa43ffaa55edbdf0181b473c,2021-01-21 17:00:00 UTC,heat,hold,673,689,680,NY,East Syracuse,0,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108657,3749a00867560be8d5a8ba88b04dcb650719db02,2021-01-16 19:35:00 UTC,heat,hold,759,760,760,NY,Farmingvilli,0,False,False,False,Gas
1108658,52ad7445432e1349a3dc144410080e3b696c9d3c,2021-01-09 16:30:00 UTC,auto,hold,681,760,690,NY,Clifton Park,10,False,False,False,Gas
1108659,0e2cdb98f751cecf959c5f8aad4e183ad01523ec,2021-01-30 17:20:00 UTC,auto,hold,704,760,700,NY,Medford,10,False,False,False,Gas
1108660,b5c2455d936b515f8549ac90073b490f2187a5ec,2021-01-13 19:45:00 UTC,heat,hold,754,760,760,NY,Rockville Centre,20,True,False,False,Gas


In [28]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
jan_2021 = jan_2021.assign(Year="2021", Month="Jan")


In [29]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
jan_2021 = jan_2021.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [30]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
jan_2021_ave = jan_2021.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jan_2021_ave

In [31]:
# Export CSV file.
# index=True writes the group keys (the index of jan_2021_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/jan/" must already exist.

jan_2021_ave.to_csv("data/day/NY/jan/jan_2021_ave.csv", header=True, index=True)

---

### Combine month CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [32]:
# List the CSV files in the January output folder.
# os.listdir returns names in arbitrary order, so sort them to keep the
# row order of the combined frame reproducible across runs and machines.
files = sorted(f for f in os.listdir("data/day/NY/jan/") if f.endswith(".csv"))

# files

In [33]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call;
# concatenating inside the loop would re-copy the accumulated frame each time.
# Each file's own row numbers are kept as the index (no ignore_index).
NY_jan = (
    pd.concat([pd.read_csv("data/day/NY/jan/" + file) for file in files])
    if files
    else pd.DataFrame()  # no input files: same empty frame the loop version produced
)

NY_jan

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,00996606dc340bb60b08b0704486388082dc5b1f,Jan,2017,heat,auto,Buffalo,672.236801,676.035461,675.707250,110.0,False,False,False
1,00996606dc340bb60b08b0704486388082dc5b1f,Jan,2017,heat,hold,Buffalo,701.757576,707.984848,707.984848,110.0,False,False,False
2,01451ba531cf9fec46a95a9c67269b0907a6a11b,Jan,2017,heat,auto,Irvington,696.141414,757.813131,698.924242,35.0,False,False,False
3,01451ba531cf9fec46a95a9c67269b0907a6a11b,Jan,2017,heat,hold,Irvington,705.951049,708.146853,707.363636,35.0,False,False,False
4,01abbe67487f0559d85464ca7c072ccd0fdd6a54,Jan,2017,heat,auto,New Hartford,690.733333,780.000000,710.000000,55.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1325,ff7af27aaa6950c6ba52b7a7585516304d996fa9,Jan,2021,heat,hold,Greenfield,697.498804,699.366029,699.370813,40.0,False,False,False
1326,ff848700b24cba3114384af05b08ea5ed914966c,Jan,2021,heat,hold,SYRACUSE,702.731765,705.649412,705.609412,65.0,True,False,False
1327,ffac8c736a7d77c718747fc8eec9a847103d077f,Jan,2021,heat,hold,New York,724.732558,717.381395,717.381395,45.0,False,False,False
1328,ffb2e05ac5e004ce171be4cfdd44498227d90db8,Jan,2021,heat,hold,New City,729.231387,731.960237,731.960237,45.0,False,False,False


In [34]:
# Write the combined January frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): to_csv does not create missing folders — the target folder must already exist.
NY_jan.to_csv("Scraper_Output/State_Month_Day/NY/NY_jan.csv", header=True, index=False)

---

## February

### 2017 February Day

In [35]:
# Load the "2017-feb-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
feb_2017 = pd.read_csv("../data_large/NY-day/2017-feb-day-NY.csv")

# Uncomment to preview the loaded frame:
# feb_2017

In [36]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
feb_2017.drop(
    index=feb_2017.index[
        feb_2017[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | feb_2017[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

feb_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,373d1acc2a138e469c2a080a92ea7e8d71ca5309,2017-02-19 19:45:00 UTC,heat,hold,801,680,680,NY,New York,0,False,False,True,Electric
1,38e7f5d85b86741ca683a5910e0bfe82641e3fbb,2017-02-26 19:00:00 UTC,auto,auto,696,750,700,NY,Staten Island,20,False,False,False,Gas
2,83c728d5aab136a7b2223ce39c3c3b2527d09969,2017-02-20 12:15:00 UTC,heat,auto,675,680,680,NY,Freeville,10,True,False,True,Electric
3,c2c69c7aa26a4f6028da7354b2f6099d4747e8c4,2017-02-06 16:50:00 UTC,heat,hold,650,650,650,NY,Webster,25,False,False,False,Gas
4,4f8d4b18797f0188195d346d1a253ed89b34e744,2017-02-04 13:30:00 UTC,heat,hold,756,760,760,NY,Brooklyn,95,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443764,f9a949bb81e7ccb8053533eb589766ae5180766d,2017-02-01 16:05:00 UTC,heat,hold,661,660,660,NY,Irvington,35,False,False,False,Gas
443765,f9a949bb81e7ccb8053533eb589766ae5180766d,2017-02-22 11:45:00 UTC,heat,hold,636,650,640,NY,Irvington,35,False,False,False,Gas
443766,f9a949bb81e7ccb8053533eb589766ae5180766d,2017-02-07 17:05:00 UTC,heat,hold,664,700,700,NY,Irvington,35,False,False,False,Gas
443767,f9a949bb81e7ccb8053533eb589766ae5180766d,2017-02-08 18:15:00 UTC,heat,hold,667,650,650,NY,Irvington,35,False,False,False,Gas


In [37]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
# NOTE(review): the January cells label their month "Jan" (capitalized) while this
# uses lowercase "feb" — confirm which casing downstream consumers expect.
feb_2017 = feb_2017.assign(Year="2017", Month="feb")

In [38]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
feb_2017 = feb_2017.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [39]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
feb_2017_ave = feb_2017.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# feb_2017_ave

In [40]:
# Export CSV file.
# index=True writes the group keys (the index of feb_2017_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/feb/" must already exist.

feb_2017_ave.to_csv("data/day/NY/feb/feb_2017_ave.csv", header=True, index=True)

### 2018 February Day

In [41]:
# Load the "2018-feb-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
feb_2018 = pd.read_csv("../data_large/NY-day/2018-feb-day-NY.csv")

# Uncomment to preview the loaded frame:
# feb_2018

In [42]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
feb_2018.drop(
    index=feb_2018.index[
        feb_2018[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | feb_2018[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

feb_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,f390bab7051f2f279cea1420ece35bd1442dddce,2018-02-13 13:50:00 UTC,heat,auto,720,768,705,NY,Webster,5,False,False,False,Gas
1,ecc56bc0099290b2259481e4732fad35662aba03,2018-02-08 18:45:00 UTC,heat,hold,775,780,780,NY,New York,0,True,False,False,Gas
2,bd20773964db1df10eae3e78858ee62d91db6137,2018-02-08 17:25:00 UTC,heat,hold,720,735,735,NY,New York,0,True,False,True,Electric
3,c2d8f69f9aba99fb724899ccfd888385e4fc4e34,2018-02-28 13:10:00 UTC,heat,hold,701,684,684,NY,North Babylon,60,False,False,False,Gas
5,d213a200b264fd505b1bb5b7f0b1b5fdf14a161e,2018-02-16 18:45:00 UTC,heat,hold,676,665,665,NY,New York,0,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1052054,7b6aef864993dc8380a4ec2a3b892dad24495b9b,2018-02-03 12:50:00 UTC,heat,hold,759,760,760,NY,New York,70,False,False,False,Gas
1052055,6bd334447c20aeec019b5dda12c2cd88727bdf16,2018-02-07 17:35:00 UTC,heat,hold,726,760,760,NY,Brooklyn,95,False,False,False,Gas
1052056,4a29ab43010b65122a233694fae98d2b39c28cf7,2018-02-10 17:50:00 UTC,heat,hold,759,760,760,NY,Valley Stream,87,False,False,False,Gas
1052057,4960ffb8ec94ed4dd996fb62068da16e2ca81c70,2018-02-06 15:20:00 UTC,heat,hold,757,760,760,NY,East Fishkill,15,False,False,False,Gas


In [43]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
# NOTE(review): the January cells label their month "Jan" (capitalized) while this
# uses lowercase "feb" — confirm which casing downstream consumers expect.
feb_2018 = feb_2018.assign(Year="2018", Month="feb")


In [44]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
feb_2018 = feb_2018.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [45]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
feb_2018_ave = feb_2018.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# feb_2018_ave

In [46]:
# Export CSV file.
# index=True writes the group keys (the index of feb_2018_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/feb/" must already exist.

feb_2018_ave.to_csv("data/day/NY/feb/feb_2018_ave.csv", header=True, index=True)

### 2019 February Day

In [47]:
# Load the "2019-feb-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
feb_2019 = pd.read_csv("../data_large/NY-day/2019-feb-day-NY.csv")

# Uncomment to preview the loaded frame:
# feb_2019

In [48]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
feb_2019.drop(
    index=feb_2019.index[
        feb_2019[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | feb_2019[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

feb_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,0ad5e280dc8ec647e0ebafbcc88b0a1d8684bc9d,2019-02-02 19:15:00 UTC,heat,auto,594,610,610,NY,Athens,20,True,False,True,Electric
3,a945c091d5d04a7577e15c6a1ec4b096af1184c2,2019-02-09 18:30:00 UTC,heat,auto,719,765,720,NY,Pittsford,5,False,False,False,Gas
4,6bd334447c20aeec019b5dda12c2cd88727bdf16,2019-02-27 15:20:00 UTC,heat,auto,698,840,650,NY,Brooklyn,95,False,False,False,Gas
5,84ddb3b84c8a6d938c369d7c0cd9fe7350e20724,2019-02-27 15:00:00 UTC,heat,hold,688,678,678,NY,Orchard Park,60,False,False,False,Gas
6,4f368c761421ba02782e2ed5c38d0b484e15819b,2019-02-23 15:30:00 UTC,heat,hold,746,745,745,NY,holtsville,10,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1189371,58441970c3828396192ae03dd5e25ec27ec9b534,2019-02-17 12:20:00 UTC,heat,auto,619,760,630,NY,Pittsford,45,False,False,False,Gas
1189372,a5513966329b8bd03e7892d35a20de78f09f3e9d,2019-02-18 17:55:00 UTC,heat,hold,760,760,760,NY,Port Jefferson,120,False,False,False,Gas
1189373,3fc156aec8dc86267d78f812576ef8d289bf7d2b,2019-02-04 15:40:00 UTC,auto,hold,693,760,700,NY,Medford,10,False,False,False,Gas
1189374,cdd3557e5b33327dfa25cde860566962fc259a45,2019-02-28 19:10:00 UTC,auto,hold,696,760,700,NY,Pittsford,38,False,False,False,Gas


In [49]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
# NOTE(review): the January cells label their month "Jan" (capitalized) while this
# uses lowercase "feb" — confirm which casing downstream consumers expect.
feb_2019 = feb_2019.assign(Year="2019", Month="feb")


In [50]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
feb_2019 = feb_2019.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [51]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
feb_2019_ave = feb_2019.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# feb_2019_ave

In [52]:
# Export CSV file.
# index=True writes the group keys (the index of feb_2019_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/feb/" must already exist.

feb_2019_ave.to_csv("data/day/NY/feb/feb_2019_ave.csv", header=True, index=True)

### 2020 February Day

In [53]:
# Load the "2020-feb-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
feb_2020 = pd.read_csv("../data_large/NY-day/2020-feb-day-NY.csv")

# Uncomment to preview the loaded frame:
# feb_2020

In [54]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
feb_2020.drop(
    index=feb_2020.index[
        feb_2020[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | feb_2020[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

feb_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,06878ebc6479aa003be1e7868bfa8f6272448662,2020-02-05 13:40:00 UTC,heat,hold,729,729,729,NY,East Hills,0,False,False,False,Gas
1,a620bc6c2f2873b67f3da539f783a97333b87e21,2020-02-17 12:50:00 UTC,heat,hold,633,662,630,NY,Southampton,50,False,False,False,Gas
2,c9f0e9631890d1b0593f44480d16bfb445d17670,2020-02-26 16:05:00 UTC,heat,hold,644,640,640,NY,East Rochester,60,False,False,False,Gas
3,18b6a0ec3bf47c6ed08336c43b71d125c0c3fb4b,2020-02-27 11:50:00 UTC,heat,hold,694,717,717,NY,Canandaigua,17,False,False,False,Gas
4,de1b5b35af5d3391bb5d0abd6cb47b2f27f80c55,2020-02-08 11:55:00 UTC,auto,hold,705,830,710,NY,Armonk,26,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1601777,292869d03577168770811bd3caa7d597e6dbb4c5,2020-02-24 14:15:00 UTC,heat,hold,756,760,760,NY,Depew,60,False,False,False,Gas
1601778,cad4a255ea99121256ec2be08cc2c9ca567f6ba2,2020-02-12 14:45:00 UTC,heat,hold,752,760,760,NY,Buffalo,95,False,False,False,Gas
1601779,2d5bcf13f17796c08a9aa423a0344a3e74136c72,2020-02-15 13:30:00 UTC,heat,auto,761,760,760,NY,Staten Island,20,False,False,False,Gas
1601780,9ce3c880c38d6ae737a3b7dc8c57d663e5a24a3b,2020-02-11 13:50:00 UTC,auto,hold,698,760,700,NY,Somers,0,True,False,False,Gas


In [55]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
# NOTE(review): the January cells label their month "Jan" (capitalized) while this
# uses lowercase "feb" — confirm which casing downstream consumers expect.
feb_2020 = feb_2020.assign(Year="2020", Month="feb")


In [56]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
feb_2020 = feb_2020.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [57]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
feb_2020_ave = feb_2020.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# feb_2020_ave

In [58]:
# Export CSV file.
# index=True writes the group keys (the index of feb_2020_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/feb/" must already exist.

feb_2020_ave.to_csv("data/day/NY/feb/feb_2020_ave.csv", header=True, index=True)

### 2021 February Day

In [59]:
# Load the "2021-feb-day-NY.csv" file for this month into a DataFrame.
# The relative path resolves against the notebook's working directory.
feb_2021 = pd.read_csv("../data_large/NY-day/2021-feb-day-NY.csv")

# Uncomment to preview the loaded frame:
# feb_2021

In [60]:
# Remove predetermined outliers before aggregating: a row is dropped when either
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
# The drop happens in place; NaN values compare False and are therefore kept.
# NOTE(review): thresholds are presumably in tenths of a degree (850 ~ 85.0) — confirm.
feb_2021.drop(
    index=feb_2021.index[
        feb_2021[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].ge(850).any(axis=1)
        | feb_2021[['TemperatureExpectedCool', 'TemperatureExpectedHeat']].le(600).any(axis=1)
    ],
    inplace=True,
)

feb_2021

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,657ed79c75c8b0cb0c3c0e9feb73569415e5ff17,2021-02-23 17:30:00 UTC,heat,hold,694,704,695,NY,Ithaca,0,False,False,False,Gas
1,99f6d460d6506fe889f469e865173c598028eff6,2021-02-17 18:35:00 UTC,auto,hold,695,747,697,NY,Painted Post,25,False,False,False,Gas
2,ee010cfeb5a56af0ac0bd8494efe2b73761eb2c8,2021-02-23 18:00:00 UTC,auto,hold,702,755,705,NY,Williamsville,90,False,False,False,Gas
4,ee5ed483db25bbe5b14506dd2932c01de49d1210,2021-02-06 12:05:00 UTC,heat,hold,675,683,683,NY,Tonawanda town,90,False,False,False,Gas
6,5cd5d57471ac4c60151493e06e6e55eaf96dd099,2021-02-19 15:35:00 UTC,auto,hold,690,745,695,NY,New York,0,False,False,True,Electric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001437,3ff6fb3564432eba06e3fead30ffe1354a919a11,2021-02-02 12:45:00 UTC,heat,hold,799,761,800,NY,Arverne,0,False,False,False,Gas
1001438,3ff6fb3564432eba06e3fead30ffe1354a919a11,2021-02-01 17:25:00 UTC,heat,hold,794,761,800,NY,Arverne,0,False,False,False,Gas
1001439,3ff6fb3564432eba06e3fead30ffe1354a919a11,2021-02-11 17:20:00 UTC,heat,hold,776,761,780,NY,Arverne,0,False,False,False,Gas
1001440,3ff6fb3564432eba06e3fead30ffe1354a919a11,2021-02-02 19:40:00 UTC,heat,hold,796,761,800,NY,Arverne,0,False,False,False,Gas


In [61]:
# Tag every row with its source year and month (both stored as strings);
# the aggregation step groups on these two columns.
# NOTE(review): the January cells label their month "Jan" (capitalized) while this
# uses lowercase "feb" — confirm which casing downstream consumers expect.
feb_2021 = feb_2021.assign(Year="2021", Month="feb")


In [62]:
# Suffix the three temperature columns with "_ave" so that, once averaged
# per group, their names mark them as aggregates.
feb_2021 = feb_2021.rename(
    columns={
        column: column + "_ave"
        for column in ("Temperature_ctrl", "TemperatureExpectedCool", "TemperatureExpectedHeat")
    }
)

In [63]:
# Average the numeric columns for every
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True skips the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type): pandas >= 2.0 raises a TypeError on them otherwise,
# while older versions dropped them silently.
feb_2021_ave = feb_2021.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# feb_2021_ave

In [64]:
# Export CSV file.
# index=True writes the group keys (the index of feb_2021_ave) as leading columns.
# NOTE(review): to_csv does not create missing folders — "data/day/NY/feb/" must already exist.

feb_2021_ave.to_csv("data/day/NY/feb/feb_2021_ave.csv", header=True, index=True)

---

### Combine month CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [65]:
# List the CSV files in the February output folder.
# os.listdir returns names in arbitrary order, so sort them to keep the
# row order of the combined frame reproducible across runs and machines.
files = sorted(f for f in os.listdir("data/day/NY/feb/") if f.endswith(".csv"))

# files

In [66]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call;
# concatenating inside the loop would re-copy the accumulated frame each time.
# Each file's own row numbers are kept as the index (no ignore_index).
NY_feb = (
    pd.concat([pd.read_csv("data/day/NY/feb/" + file) for file in files])
    if files
    else pd.DataFrame()  # no input files: same empty frame the loop version produced
)

NY_feb

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,00996606dc340bb60b08b0704486388082dc5b1f,feb,2017,heat,auto,Buffalo,681.175905,684.681631,684.658268,110.0,False,False,False
1,00996606dc340bb60b08b0704486388082dc5b1f,feb,2017,heat,hold,Buffalo,689.649007,699.940397,699.940397,110.0,False,False,False
2,01451ba531cf9fec46a95a9c67269b0907a6a11b,feb,2017,heat,auto,Irvington,682.307692,691.307692,686.076923,35.0,False,False,False
3,01451ba531cf9fec46a95a9c67269b0907a6a11b,feb,2017,heat,hold,Irvington,698.442029,698.695652,698.695652,35.0,False,False,False
4,01c8d408d81af596e2f83117506bc83b17b16cf8,feb,2017,heat,auto,New York,675.512821,739.076923,660.615385,9.0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1272,ff7af27aaa6950c6ba52b7a7585516304d996fa9,feb,2021,heat,hold,Greenfield,677.837037,680.000000,680.000000,40.0,False,False,False
1273,ff848700b24cba3114384af05b08ea5ed914966c,feb,2021,heat,hold,SYRACUSE,716.053659,719.229268,719.229268,65.0,True,False,False
1274,ffac8c736a7d77c718747fc8eec9a847103d077f,feb,2021,heat,hold,New York,725.561834,722.797441,722.797441,45.0,False,False,False
1275,ffb2e05ac5e004ce171be4cfdd44498227d90db8,feb,2021,heat,hold,New City,723.624158,725.992003,725.992003,45.0,False,False,False


In [67]:
# Save the combined month table without the row index
NY_feb.to_csv("Scraper_Output/State_Month_Day/NY/NY_feb.csv", index=False)

---

## March

---

## April

---

## May

---

## June

### 2017 June Day

In [68]:
# Load the NY "day" CSV for June 2017 into a DataFrame
jun_2017 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2017-jun-day-NY.csv")

# jun_2017

In [69]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jun_2017[expected_cols].ge(850).any(axis=1)
    | jun_2017[expected_cols].le(600).any(axis=1)
)
jun_2017.drop(index=jun_2017.index[out_of_range], inplace=True)

jun_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,f1f0df39403259e64b88315ee21bfe28f8f41dc1,2017-06-21 16:05:00 UTC,cool,hold,677,670,670,NY,East Amherst,105,False,False,False,Gas
1,c3c51ef0c06be59a80bcddfe073ed3860017873d,2017-06-12 17:40:00 UTC,cool,auto,764,770,630,NY,Bellmore,60,False,False,False,Gas
2,435f83dfd3be3f74b91edcc4210b9b64d93e5069,2017-06-22 11:25:00 UTC,cool,hold,712,730,730,NY,Lima,5,False,False,False,Gas
3,3fba3c2c7187e9037a1f2300662db5893509568a,2017-06-28 15:55:00 UTC,cool,hold,695,840,760,NY,Nyack,35,True,False,False,Gas
4,1f903a0029f4fd4561f565921570492157b1723f,2017-06-07 17:55:00 UTC,heat,hold,675,644,644,NY,Callicoon,25,False,True,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548069,fcaf75919fa3657a332dc6ef4b815f40d99a954b,2017-06-24 14:00:00 UTC,auto,hold,717,730,650,NY,Sea Cliff,86,False,False,False,Gas
548070,fcaf75919fa3657a332dc6ef4b815f40d99a954b,2017-06-20 12:25:00 UTC,auto,hold,716,720,650,NY,Sea Cliff,86,False,False,False,Gas
548071,fcaf75919fa3657a332dc6ef4b815f40d99a954b,2017-06-05 19:35:00 UTC,auto,auto,690,770,690,NY,Sea Cliff,86,False,False,False,Gas
548072,fcaf75919fa3657a332dc6ef4b815f40d99a954b,2017-06-07 17:45:00 UTC,auto,auto,688,770,690,NY,Sea Cliff,86,False,False,False,Gas


In [70]:
# Tag every row with its year and month (both stored as strings)
jun_2017 = jun_2017.assign(Year="2017", Month="jun")

In [71]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jun_2017 = jun_2017.rename(columns=ave_names)

In [72]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jun_2017_ave = jun_2017.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jun_2017_ave

In [73]:
# Write the monthly averages to CSV (header and index are written by default)
jun_2017_ave.to_csv("data/day/NY/jun/jun_2017_ave.csv")

### 2018 June Day

In [74]:
# Load the NY "day" CSV for June 2018 into a DataFrame
jun_2018 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2018-jun-day-NY.csv")

# jun_2018

In [75]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jun_2018[expected_cols].ge(850).any(axis=1)
    | jun_2018[expected_cols].le(600).any(axis=1)
)
jun_2018.drop(index=jun_2018.index[out_of_range], inplace=True)

jun_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
2,bece9ddd3432fa9044bf13d5ab743e368eb4782f,2018-06-13 17:15:00 UTC,cool,hold,679,685,685,NY,New York,10,False,False,True,Electric
3,2113285a5c59c3bf0bdc96a2959bb9c8245e1ebb,2018-06-19 13:15:00 UTC,cool,hold,702,702,702,NY,Lancaster,60,False,False,False,Gas
5,18dc3da6f47966966b6b8014053acc400e4ee6c5,2018-06-23 12:15:00 UTC,cool,hold,651,648,648,NY,Newburgh,10,True,False,False,Gas
7,1dcfaae8bfe47dc928d0722207b66e3c949c5a1d,2018-06-18 15:20:00 UTC,cool,hold,687,685,685,NY,Oriskany,5,False,False,False,Gas
8,10c6ac3bfd2c0439980b95696f2e83ecdf31b57d,2018-06-19 12:20:00 UTC,auto,hold,734,746,696,NY,Webster,20,True,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124707,04360333b73c128c1a29640bd4e5a7028984c23a,2018-06-29 19:05:00 UTC,cool,hold,760,760,760,NY,Honeoye Falls,60,False,False,False,Gas
1124708,b76063cf48a4262235cc540e6205b6d3a7cd1faa,2018-06-20 16:25:00 UTC,cool,auto,758,760,760,NY,Garden City,0,False,False,False,Gas
1124709,ac3a1b1742358954abc5df54871a5d01cae45c51,2018-06-27 13:30:00 UTC,cool,hold,732,760,760,NY,Malverne,70,False,False,False,Gas
1124710,7f0bf7fe4899657092bc504448d16d53fba8a426,2018-06-16 13:35:00 UTC,cool,hold,768,800,760,NY,Hicksville,67,False,False,False,Gas


In [76]:
# Tag every row with its year and month (both stored as strings)
jun_2018 = jun_2018.assign(Year="2018", Month="jun")

In [77]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jun_2018 = jun_2018.rename(columns=ave_names)

In [78]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jun_2018_ave = jun_2018.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jun_2018_ave

In [79]:
# Write the monthly averages to CSV (header and index are written by default)
jun_2018_ave.to_csv("data/day/NY/jun/jun_2018_ave.csv")

### 2019 June Day

In [80]:
# Load the NY "day" CSV for June 2019 into a DataFrame
jun_2019 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2019-jun-day-NY.csv")

# jun_2019

In [81]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jun_2019[expected_cols].ge(850).any(axis=1)
    | jun_2019[expected_cols].le(600).any(axis=1)
)
jun_2019.drop(index=jun_2019.index[out_of_range], inplace=True)

jun_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
4,87f53e52444bc97bcca5dd52dedc3b7b3b38d2a4,2019-06-26 15:35:00 UTC,cool,hold,725,725,725,NY,New York,0,True,False,False,Gas
5,ad41e5437e5c108e41318763eada3a9ee045b6da,2019-06-08 12:55:00 UTC,cool,hold,719,735,735,NY,Pleasantville,60,False,False,False,Gas
6,ad41e5437e5c108e41318763eada3a9ee045b6da,2019-06-08 10:50:00 UTC,cool,hold,717,735,735,NY,Pleasantville,60,False,False,False,Gas
7,14848bcd916ba50ea4f18e5276d223308e3e6ec7,2019-06-01 17:50:00 UTC,auto,hold,737,783,733,NY,Bayport,8,False,False,False,Gas
9,9152101e2a2824ee4f38e0d2e2a4d5b90f787842,2019-06-08 10:15:00 UTC,cool,hold,680,662,662,NY,Poughkeepsie,19,True,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1639535,f11eaf5bea6f736c812e33b1aa30dfc359a328db,2019-06-25 13:30:00 UTC,cool,auto,733,760,760,NY,Southeast,10,False,False,False,Gas
1639536,e94fb9eb380b724ce9bacb508fc2be80e4177cf5,2019-06-05 13:10:00 UTC,cool,hold,733,760,760,NY,Pearl River,110,False,False,False,Gas
1639537,865cdaa68a82dad8a0e20b8c5fa5e55bd1cd8aef,2019-06-25 16:30:00 UTC,cool,hold,751,760,760,NY,Getzville,39,False,False,False,Gas
1639538,1ccea1c76351aa558b22b9bfb0a11dc130d0e96a,2019-06-21 09:15:00 UTC,cool,hold,727,760,760,NY,Penfield,0,False,False,False,Gas


In [82]:
# Tag every row with its year and month (both stored as strings)
jun_2019 = jun_2019.assign(Year="2019", Month="jun")

In [83]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jun_2019 = jun_2019.rename(columns=ave_names)

In [84]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jun_2019_ave = jun_2019.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jun_2019_ave

In [85]:
# Write the monthly averages to CSV (header and index are written by default)
jun_2019_ave.to_csv("data/day/NY/jun/jun_2019_ave.csv")

### 2020 June Day

In [86]:
# Load the NY "day" CSV for June 2020 into a DataFrame
jun_2020 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2020-jun-day-NY.csv")

# jun_2020

In [87]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jun_2020[expected_cols].ge(850).any(axis=1)
    | jun_2020[expected_cols].le(600).any(axis=1)
)
jun_2020.drop(index=jun_2020.index[out_of_range], inplace=True)

jun_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
1,95d4a4ad6cbe89f736204b5d9586026c85f789ec,2020-06-04 16:45:00 UTC,cool,hold,738,737,667,NY,Massapequa,60,False,False,False,Gas
2,2cf050dddd05fa8a90076aaafb5081569ed7f68d,2020-06-21 08:35:00 UTC,cool,hold,688,673,673,NY,Holtsville,60,True,False,False,Gas
6,94a4930a2a0f74162fc46a899ebc4ccb3c89101b,2020-06-18 10:55:00 UTC,cool,hold,683,685,685,NY,Huntington,60,True,False,False,Gas
9,5532b9ff297f78d9e990533a1af63783b9bf35f1,2020-06-26 14:45:00 UTC,cool,hold,770,780,751,NY,Middletown,0,False,False,False,Gas
10,8df4bbc1e4c14743c23461728086bf4216b4eb69,2020-06-20 14:10:00 UTC,auto,hold,711,736,656,NY,Webster,30,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1583811,0e4c18e0c4451bdff06a4600155f530e216d4a2c,2020-06-24 14:20:00 UTC,cool,auto,739,760,760,NY,Amherst,35,False,False,False,Gas
1583812,47b85732faffdd4f2835d503e5ed320ae2560a39,2020-06-20 17:40:00 UTC,cool,hold,745,760,760,NY,New York,30,False,False,False,Gas
1583813,8d906a9116f463bc31a3cbeab64b8415b8f8aefa,2020-06-30 12:00:00 UTC,cool,auto,743,770,760,NY,Sayville,57,False,False,False,Gas
1583814,d213a200b264fd505b1bb5b7f0b1b5fdf14a161e,2020-06-15 12:30:00 UTC,cool,hold,751,760,760,NY,New York,0,False,False,False,Gas


In [88]:
# Tag every row with its year and month (both stored as strings)
jun_2020 = jun_2020.assign(Year="2020", Month="jun")

In [89]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jun_2020 = jun_2020.rename(columns=ave_names)

In [90]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jun_2020_ave = jun_2020.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jun_2020_ave

In [91]:
# Write the monthly averages to CSV (header and index are written by default)
jun_2020_ave.to_csv("data/day/NY/jun/jun_2020_ave.csv")

### 2021 June Day

In [92]:
# Load the NY "day" CSV for June 2021 into a DataFrame
jun_2021 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2021-jun-day-NY.csv")

# jun_2021

In [93]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jun_2021[expected_cols].ge(850).any(axis=1)
    | jun_2021[expected_cols].le(600).any(axis=1)
)
jun_2021.drop(index=jun_2021.index[out_of_range], inplace=True)

jun_2021

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,247fb81848926a2e4ff4b72404271889f04b5660,2021-06-17 17:35:00 UTC,cool,hold,731,727,727,NY,Brooklyn,95,False,False,False,Gas
1,0867b97a523dbc48687877f6ae73fc24bb1e110b,2021-06-04 19:55:00 UTC,auto,hold,729,732,672,NY,Hauppauge,50,False,False,False,Gas
2,f5ba6b990604979c6c6060809c585bbbbdc3e92e,2021-06-11 13:45:00 UTC,heat,hold,746,650,610,NY,Long Beach,80,False,False,False,Gas
3,06878ebc6479aa003be1e7868bfa8f6272448662,2021-06-04 18:20:00 UTC,heat,hold,752,709,709,NY,East Hills,0,False,False,False,Gas
4,ea0b115c38bf0ddd18c38cb7a2c7af8d0c21758e,2021-06-13 17:45:00 UTC,cool,hold,712,712,712,NY,Brooklyn,9,True,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940781,1d2d4f82f2bda26ce1ccd70ee5e3a44bfc27fc9a,2021-06-12 13:40:00 UTC,cool,hold,707,760,760,NY,Ballston,0,False,False,False,Gas
940782,fa90a1dfbd9f83019c8e902bac0087e232b08c0e,2021-06-02 19:50:00 UTC,cool,hold,763,760,760,NY,HIGHLAND MILLS,15,False,False,False,Gas
940783,11bf217eb3f0a66acdef6ed1491bef18088c5c15,2021-06-15 18:05:00 UTC,cool,hold,754,760,760,NY,New York,56,False,False,True,Electric
940784,a17e2189508ea979c86bc112ec60b459d8e02982,2021-06-19 13:05:00 UTC,cool,hold,741,760,760,NY,Clifton Park,0,False,False,False,Gas


In [94]:
# Tag every row with its year and month (both stored as strings)
jun_2021 = jun_2021.assign(Year="2021", Month="jun")

In [95]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jun_2021 = jun_2021.rename(columns=ave_names)

In [96]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jun_2021_ave = jun_2021.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jun_2021_ave

In [97]:
# Write the monthly averages to CSV (header and index are written by default)
jun_2021_ave.to_csv("data/day/NY/jun/jun_2021_ave.csv")

---

### Combine month CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [98]:
# Collect the CSV file names in the month folder. os.listdir returns entries
# in arbitrary order, so sort them to keep the combined row order reproducible.
files = sorted(f for f in os.listdir("data/day/NY/jun/") if f.endswith(".csv"))

# files

In [99]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every file first, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on each iteration.
jun_frames = [pd.read_csv("data/day/NY/jun/" + file) for file in files]

# pd.concat rejects an empty list, so fall back to an empty frame when the
# folder holds no CSVs. Each file's own row index is kept as-is.
NY_jun = pd.concat(jun_frames) if jun_frames else pd.DataFrame()

NY_jun

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,00ae124983451237520586ba85955c62dce37df5,jun,2017,cool,hold,Massapequa,734.595069,734.811116,734.811116,60.0,False,False,False
1,0121a65bb6b5c42b127417008cd2f09a34da5f75,jun,2017,cool,auto,Malverne,754.425926,769.162037,666.763889,70.0,False,False,False
2,0121a65bb6b5c42b127417008cd2f09a34da5f75,jun,2017,cool,hold,Malverne,741.485222,742.336207,741.858374,70.0,False,False,False
3,01451ba531cf9fec46a95a9c67269b0907a6a11b,jun,2017,auto,auto,Irvington,753.937500,760.000000,700.000000,35.0,False,False,False
4,01451ba531cf9fec46a95a9c67269b0907a6a11b,jun,2017,auto,hold,Irvington,756.756098,774.146341,700.000000,35.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,ff29a4f110b99c56bc7bb98688f62a60dc2d6382,jun,2021,cool,hold,Oyster Bay,734.713217,728.241895,728.241895,60.0,False,False,False
1306,ff7af27aaa6950c6ba52b7a7585516304d996fa9,jun,2021,cool,hold,Greenfield,709.322344,712.897436,712.893773,40.0,False,False,False
1307,ffac8c736a7d77c718747fc8eec9a847103d077f,jun,2021,auto,hold,New York,734.613119,738.661312,652.597055,45.0,False,False,False
1308,ffb2e05ac5e004ce171be4cfdd44498227d90db8,jun,2021,cool,hold,New City,743.166667,740.758961,740.738351,45.0,False,False,False


In [100]:
# Save the combined month table without the row index
NY_jun.to_csv("Scraper_Output/State_Month_Day/NY/NY_jun.csv", index=False)

---

## July

### 2017 July Day

In [101]:
# Load the NY "day" CSV for July 2017 into a DataFrame
jul_2017 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2017-jul-day-NY.csv")

# jul_2017

In [102]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jul_2017[expected_cols].ge(850).any(axis=1)
    | jul_2017[expected_cols].le(600).any(axis=1)
)
jul_2017.drop(index=jul_2017.index[out_of_range], inplace=True)

jul_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,10197f66ec5e779163a9b553a309ffbf1aa73e1b,2017-07-31 19:35:00 UTC,cool,hold,776,770,770,NY,East Rockaway,66,False,False,False,Gas
1,fd1ca385f111ab9f310cf27ae54f7fcce605f052,2017-07-23 16:40:00 UTC,cool,hold,719,720,720,NY,Brewster,15,False,False,False,Gas
2,a270e58ceb8086e80b16152f9830c590d70c1e0a,2017-07-12 19:40:00 UTC,heat,auto,730,650,610,NY,Broad Channel,17,False,False,False,Gas
3,45a42076122ec5f8b0a5494f702a75d34a3e2644,2017-07-08 14:40:00 UTC,cool,hold,702,710,710,NY,Wesley Hills,0,False,False,False,Gas
4,5218e50a35ea593bc2daa7fe311c774ff1fb2367,2017-07-28 19:35:00 UTC,cool,hold,732,730,730,NY,Wheatfield,30,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682395,b49a831bee29105aba0745704e50e97da861503a,2017-07-29 13:55:00 UTC,cool,auto,724,720,730,NY,Great Neck Estates,60,False,False,False,Gas
682396,7bd05db0f3d159fefccee7e4361fe8b0aae3598c,2017-07-19 17:35:00 UTC,cool,hold,746,740,740,NY,Great Neck Estates,50,False,False,False,Gas
682397,7bd05db0f3d159fefccee7e4361fe8b0aae3598c,2017-07-20 13:10:00 UTC,cool,hold,738,740,740,NY,Great Neck Estates,50,False,False,False,Gas
682398,b49a831bee29105aba0745704e50e97da861503a,2017-07-02 10:55:00 UTC,cool,auto,752,750,730,NY,Great Neck Estates,60,False,False,False,Gas


In [103]:
# Tag every row with its year and month (both stored as strings)
jul_2017 = jul_2017.assign(Year="2017", Month="jul")

In [104]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jul_2017 = jul_2017.rename(columns=ave_names)

In [105]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jul_2017_ave = jul_2017.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jul_2017_ave

In [106]:
# Write the monthly averages to CSV (header and index are written by default)
jul_2017_ave.to_csv("data/day/NY/jul/jul_2017_ave.csv")

### 2018 July Day

In [107]:
# Load the NY "day" CSV for July 2018 into a DataFrame
jul_2018 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2018-jul-day-NY.csv")

# jul_2018

In [108]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jul_2018[expected_cols].ge(850).any(axis=1)
    | jul_2018[expected_cols].le(600).any(axis=1)
)
jul_2018.drop(index=jul_2018.index[out_of_range], inplace=True)

jul_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,f9a949bb81e7ccb8053533eb589766ae5180766d,2018-07-01 17:10:00 UTC,cool,auto,727,688,688,NY,Irvington,35,False,False,False,Gas
1,18b6a0ec3bf47c6ed08336c43b71d125c0c3fb4b,2018-07-06 11:15:00 UTC,cool,hold,685,677,677,NY,Canandaigua,17,False,False,False,Gas
2,a8dc7c48d2b690d67c45f66e5af9a9318ea3ca14,2018-07-10 11:40:00 UTC,cool,hold,721,715,715,NY,Brooklyn,85,False,False,False,Gas
5,d145b2566bdc72a94d22814ed35889453bd24763,2018-07-11 14:20:00 UTC,cool,auto,741,743,698,NY,New York,0,False,False,False,Gas
6,09db88ceba925a9147d5aacaff26854472cd4c04,2018-07-08 13:55:00 UTC,auto,hold,723,735,665,NY,East Meadow,60,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397089,c8a0bff5e17cbe827e0edc4f1521cd4a26f34f6f,2018-07-21 19:35:00 UTC,cool,hold,777,760,760,NY,Henrietta,50,False,False,False,Gas
1397090,f390bab7051f2f279cea1420ece35bd1442dddce,2018-07-02 19:25:00 UTC,cool,auto,767,760,760,NY,Webster,5,False,False,False,Gas
1397091,fde075ccd6113780617c0313fb78960ca3c8a5d2,2018-07-12 11:40:00 UTC,cool,hold,753,760,760,NY,OZONE PARK,100,False,False,False,Gas
1397092,a55fa987fb6462abcecaec87f4c110ff4728b2ab,2018-07-22 14:45:00 UTC,cool,hold,749,760,760,NY,Pelham Manor,90,False,False,False,Gas


In [109]:
# Tag every row with its year and month (both stored as strings)
jul_2018 = jul_2018.assign(Year="2018", Month="jul")

In [110]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jul_2018 = jul_2018.rename(columns=ave_names)

In [111]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jul_2018_ave = jul_2018.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jul_2018_ave

In [112]:
# Write the monthly averages to CSV (header and index are written by default)
jul_2018_ave.to_csv("data/day/NY/jul/jul_2018_ave.csv")

### 2019 July Day

In [113]:
# Load the NY "day" CSV for July 2019 into a DataFrame
jul_2019 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2019-jul-day-NY.csv")

# jul_2019

In [114]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jul_2019[expected_cols].ge(850).any(axis=1)
    | jul_2019[expected_cols].le(600).any(axis=1)
)
jul_2019.drop(index=jul_2019.index[out_of_range], inplace=True)

jul_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,91202bb9f83672d79a85216a718c73d74c62914e,2019-07-28 11:40:00 UTC,cool,hold,711,705,705,NY,White Plains,117,False,False,False,Gas
1,21dfe7a4e7ad33a5540d358a6104eb842668fa68,2019-07-20 16:15:00 UTC,cool,hold,744,745,745,NY,Poughkeepsie,80,True,False,False,Gas
2,94f6a9aa76a9eeaed333f81dee9c406ad394c5c0,2019-07-15 15:10:00 UTC,auto,hold,705,697,647,NY,Hamburg,45,False,False,False,Gas
3,1cdb58aa5759908ea913bff159d4b361522d7df3,2019-07-12 15:45:00 UTC,cool,hold,781,800,800,NY,Kings Park,9,True,False,False,Gas
4,311a480765a6e7c175b453eba0cb689675d10029,2019-07-15 18:05:00 UTC,cool,hold,779,800,800,NY,Nanuet,39,True,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1929770,7031ee85b607cce0615ce4079f1a9e8191667470,2019-07-01 15:25:00 UTC,cool,auto,747,760,760,NY,Bronx,110,False,False,False,Gas
1929771,1ccea1c76351aa558b22b9bfb0a11dc130d0e96a,2019-07-05 19:55:00 UTC,cool,hold,741,760,760,NY,Penfield,0,False,False,False,Gas
1929772,1ccea1c76351aa558b22b9bfb0a11dc130d0e96a,2019-07-30 11:50:00 UTC,cool,auto,739,760,760,NY,Penfield,0,False,False,False,Gas
1929773,e0ce49e4690f26814e1821e1b271d47b410256cb,2019-07-07 15:55:00 UTC,cool,hold,759,760,760,NY,Saint James,5,False,False,False,Gas


In [115]:
# Tag every row with its year and month (both stored as strings)
jul_2019 = jul_2019.assign(Year="2019", Month="jul")

In [116]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jul_2019 = jul_2019.rename(columns=ave_names)

In [117]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jul_2019_ave = jul_2019.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jul_2019_ave

In [118]:
# Write the monthly averages to CSV (header and index are written by default)
jul_2019_ave.to_csv("data/day/NY/jul/jul_2019_ave.csv")

### 2020 July Day

In [119]:
# Load the NY "day" CSV for July 2020 into a DataFrame
jul_2020 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2020-jul-day-NY.csv")

# jul_2020

In [120]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jul_2020[expected_cols].ge(850).any(axis=1)
    | jul_2020[expected_cols].le(600).any(axis=1)
)
jul_2020.drop(index=jul_2020.index[out_of_range], inplace=True)

jul_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
2,22dc42fe18708d3449e0b8c18b90cb9373c576e8,2020-07-06 16:20:00 UTC,cool,hold,770,761,761,NY,Brooklyn,0,False,False,False,Gas
3,18b6a0ec3bf47c6ed08336c43b71d125c0c3fb4b,2020-07-09 17:55:00 UTC,cool,hold,715,707,707,NY,Canandaigua,17,False,False,False,Gas
5,7f044639b7e17b7878a1477f166390d7375396a4,2020-07-23 13:20:00 UTC,cool,hold,698,702,702,NY,Phoenix,10,True,False,False,Gas
6,27f7f9a5b5b39c4ebde0c7afe038775447facdae,2020-07-31 15:25:00 UTC,cool,hold,684,685,685,NY,Chappaqua,40,False,False,False,Gas
7,65bd903c7fb276aa5a8aaf68ee5da94e309e27c6,2020-07-03 15:40:00 UTC,cool,hold,749,747,747,NY,Southampton,10,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1796091,6a9da107d5c8f21074559c8b7e4f19e924eebd06,2020-07-09 16:10:00 UTC,cool,hold,761,760,760,NY,New York,0,False,False,False,Gas
1796092,6d80824e9f06fd013934ceb0f7f8ff92e75e3c80,2020-07-18 18:15:00 UTC,cool,hold,764,760,760,NY,Staten Island,45,False,False,False,Gas
1796093,4ae8065fc307cd07c8ea8e66a0f2ef3e78ecd2e0,2020-07-28 12:30:00 UTC,cool,hold,749,760,760,NY,Coram,0,False,False,False,Gas
1796094,c85d83876a0e043ece8f5a55b461b80ca8d19a49,2020-07-19 15:10:00 UTC,cool,auto,755,750,760,NY,Erwins,20,False,False,False,Gas


In [121]:
# Tag every row with its year and month (both stored as strings)
jul_2020 = jul_2020.assign(Year="2020", Month="jul")

In [122]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jul_2020 = jul_2020.rename(columns=ave_names)

In [123]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jul_2020_ave = jul_2020.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jul_2020_ave

In [124]:
# Write the monthly averages to CSV (header and index are written by default)
jul_2020_ave.to_csv("data/day/NY/jul/jul_2020_ave.csv")

### 2021 July Day

In [125]:
# Load the NY "day" CSV for July 2021 into a DataFrame
jul_2021 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2021-jul-day-NY.csv")

# jul_2021

In [126]:
# Remove predetermined outliers before aggregating: drop every row where
# TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or <= 600.
expected_cols = ['TemperatureExpectedCool', 'TemperatureExpectedHeat']
out_of_range = (
    jul_2021[expected_cols].ge(850).any(axis=1)
    | jul_2021[expected_cols].le(600).any(axis=1)
)
jul_2021.drop(index=jul_2021.index[out_of_range], inplace=True)

jul_2021

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,9101459165a5a476eaaa3a1b1f7a7fda85ec91ad,2021-07-27 11:40:00 UTC,cool,hold,687,687,687,NY,Ballston Spa,30,False,False,False,Gas
2,ea2d0471f9ae2db820f2d542a3b7c1aaf3a1e447,2021-07-28 15:55:00 UTC,cool,hold,717,711,711,NY,Riverhead,30,True,False,True,Electric
3,761f3ad0c640cdc06453908cbec5cce6620104ba,2021-07-23 11:40:00 UTC,cool,hold,676,672,672,NY,Rye,9,True,False,False,Gas
4,18b6a0ec3bf47c6ed08336c43b71d125c0c3fb4b,2021-07-19 15:05:00 UTC,cool,hold,690,697,697,NY,Canandaigua,17,False,False,False,Gas
5,b4140f7bafdc1e9747ac605835458429b602f87d,2021-07-09 13:35:00 UTC,cool,hold,746,752,752,NY,Riverhead,30,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
923963,032cfa5b452b3ca114afd0a7de1345d108386231,2021-07-14 12:30:00 UTC,cool,hold,754,760,760,NY,Lysander,25,False,False,False,Gas
923964,83161fafa8d3f2d3540617e6855fbb7c61539ae9,2021-07-21 17:25:00 UTC,cool,hold,738,760,760,NY,Webster,20,False,False,False,Gas
923965,032cfa5b452b3ca114afd0a7de1345d108386231,2021-07-13 13:25:00 UTC,cool,hold,745,760,760,NY,Lysander,25,False,False,False,Gas
923966,04125e365a8d55d18bf39ed4676c60f1746549bf,2021-07-06 18:40:00 UTC,cool,hold,751,760,760,NY,Schenectady,25,True,False,False,Gas


In [127]:
# Tag every row with its year and month (both stored as strings)
jul_2021 = jul_2021.assign(Year="2021", Month="jul")

In [128]:
# Suffix the three temperature columns with "_ave" to label them as aggregates
ave_names = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jul_2021 = jul_2021.rename(columns=ave_names)

In [129]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) group.
# numeric_only=True keeps the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) out of the mean; pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
jul_2021_ave = jul_2021.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

# jul_2021_ave

In [130]:
# Write the monthly averages to CSV (header and index are written by default)
jul_2021_ave.to_csv("data/day/NY/jul/jul_2021_ave.csv")

---

### Combine month CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [131]:
# Collect the CSV file names in the month folder. os.listdir returns entries
# in arbitrary order, so sort them to keep the combined row order reproducible.
files = sorted(f for f in os.listdir("data/day/NY/jul/") if f.endswith(".csv"))

# files

In [132]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every file first, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on each iteration.
jul_frames = [pd.read_csv("data/day/NY/jul/" + file) for file in files]

# pd.concat rejects an empty list, so fall back to an empty frame when the
# folder holds no CSVs. Each file's own row index is kept as-is.
NY_jul = pd.concat(jul_frames) if jul_frames else pd.DataFrame()

NY_jul

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,00ae124983451237520586ba85955c62dce37df5,jul,2017,cool,hold,Massapequa,730.254247,729.299830,729.299830,60.0,False,False,False
1,0121a65bb6b5c42b127417008cd2f09a34da5f75,jul,2017,auto,hold,Malverne,751.500000,787.500000,770.000000,70.0,False,False,False
2,0121a65bb6b5c42b127417008cd2f09a34da5f75,jul,2017,cool,auto,Malverne,758.343202,770.630482,660.428728,70.0,False,False,False
3,0121a65bb6b5c42b127417008cd2f09a34da5f75,jul,2017,cool,hold,Malverne,745.381694,752.888997,752.629990,70.0,False,False,False
4,01451ba531cf9fec46a95a9c67269b0907a6a11b,jul,2017,auto,auto,Irvington,774.500000,770.375000,700.000000,35.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,ff29a4f110b99c56bc7bb98688f62a60dc2d6382,jul,2021,cool,hold,Oyster Bay,737.017751,730.000000,730.000000,60.0,False,False,False
1037,ff7af27aaa6950c6ba52b7a7585516304d996fa9,jul,2021,cool,hold,Greenfield,692.201780,707.164194,707.161227,40.0,False,False,False
1038,ffac8c736a7d77c718747fc8eec9a847103d077f,jul,2021,auto,hold,New York,742.666456,742.758512,650.000000,45.0,False,False,False
1039,ffb2e05ac5e004ce171be4cfdd44498227d90db8,jul,2021,cool,hold,New City,748.938061,747.356373,745.644524,45.0,False,False,False


In [133]:
# Save the combined July frame without the row index
NY_jul.to_csv(
    path_or_buf="Scraper_Output/State_Month_Day/NY/NY_jul.csv",
    index=False,
    header=True,
)

---

## August

### 2017 August Day

In [134]:
# Load the 2017 August day CSV for NY
aug_2017 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2017-aug-day-NY.csv")

In [135]:
# Remove predetermined outliers before aggregating: discard every row whose
# TemperatureExpectedCool or TemperatureExpectedHeat value is >= 850 or <= 600.
aug_2017.drop(
    index=aug_2017[
        (aug_2017['TemperatureExpectedCool'] >= 850)
        | (aug_2017['TemperatureExpectedHeat'] >= 850)
        | (aug_2017['TemperatureExpectedCool'] <= 600)
        | (aug_2017['TemperatureExpectedHeat'] <= 600)
    ].index,
    inplace=True,
)

aug_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,2646678d22b62b888ff844974c51af14b1da880d,2017-08-11 19:30:00 UTC,cool,auto,747,740,700,NY,Verona,60,False,False,False,Gas
1,dce408fa8af3f40d4bf106938d03a38893a28bcb,2017-08-06 13:30:00 UTC,cool,hold,759,780,780,NY,Mount Vernon,106,False,False,False,Gas
2,9d8ebea68035fb275cf75598542a77881d427fc5,2017-08-23 14:05:00 UTC,cool,hold,731,740,740,NY,Warrensburg,35,False,False,False,Gas
3,654b788ae5da0a4e92f60314508e0cc0d65bf385,2017-08-03 18:05:00 UTC,cool,hold,775,800,790,NY,Bethpage,55,False,False,False,Gas
4,378ce7c187c2056313b45dd68fc1652a7f6adeb7,2017-08-19 15:40:00 UTC,cool,hold,751,760,760,NY,Buffalo,40,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673758,7bd05db0f3d159fefccee7e4361fe8b0aae3598c,2017-08-21 12:30:00 UTC,cool,hold,733,730,730,NY,Great Neck Estates,50,False,False,False,Gas
673759,7bd05db0f3d159fefccee7e4361fe8b0aae3598c,2017-08-03 19:00:00 UTC,cool,auto,732,730,730,NY,Great Neck Estates,50,False,False,False,Gas
673760,7bd05db0f3d159fefccee7e4361fe8b0aae3598c,2017-08-25 12:05:00 UTC,cool,auto,731,740,680,NY,Great Neck Estates,50,False,False,False,Gas
673761,7bd05db0f3d159fefccee7e4361fe8b0aae3598c,2017-08-30 11:05:00 UTC,cool,hold,723,740,740,NY,Great Neck Estates,50,False,False,False,Gas


In [136]:
# Tag every row with its year and month (both stored as strings)
aug_2017 = aug_2017.assign(Year="2017", Month="aug")

In [137]:
# Label the three temperature columns as averages ahead of the aggregation
aug_2017 = aug_2017.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [138]:
# Average the numeric columns per (Identifier, Month, Year, HvacMode,
# CalendarEvent, City) group. numeric_only=True is explicit because the frame
# still holds text columns (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type)
# that have no mean; pandas >= 2.0 raises TypeError on them instead of
# silently dropping them.
aug_2017_ave = aug_2017.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [139]:
# Write the grouped averages to disk; index=True writes the group keys as columns
aug_2017_ave.to_csv(
    path_or_buf="data/day/NY/aug/aug_2017_ave.csv",
    index=True,
    header=True,
)

### 2018 August Day

In [140]:
# Load the 2018 August day CSV for NY
aug_2018 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2018-aug-day-NY.csv")

In [141]:
# Remove predetermined outliers before aggregating: discard every row whose
# TemperatureExpectedCool or TemperatureExpectedHeat value is >= 850 or <= 600.
aug_2018.drop(
    index=aug_2018[
        (aug_2018['TemperatureExpectedCool'] >= 850)
        | (aug_2018['TemperatureExpectedHeat'] >= 850)
        | (aug_2018['TemperatureExpectedCool'] <= 600)
        | (aug_2018['TemperatureExpectedHeat'] <= 600)
    ].index,
    inplace=True,
)

aug_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
1,1d44a7a3ebdfe8d2f1a71535f319b7547022f6cc,2018-08-22 09:50:00 UTC,cool,auto,712,770,610,NY,Carmel,10,False,False,False,Gas
3,e4a293d7ab66d3d54d39e7478ecfd294a75fc98e,2018-08-01 12:05:00 UTC,auto,hold,721,725,675,NY,Rochester,100,False,False,False,Gas
4,4ab78547d3bb23bcd0757a061fb0aa52d3bd1cc6,2018-08-07 19:40:00 UTC,cool,auto,685,690,610,NY,Parish,0,True,False,False,Gas
5,1f903a0029f4fd4561f565921570492157b1723f,2018-08-05 13:30:00 UTC,heat,hold,716,608,608,NY,Callicoon,25,False,True,False,Gas
7,6a3125f03c90193cd2422584d2fa1d6931901dab,2018-08-23 15:15:00 UTC,heat,hold,759,672,672,NY,New York,36,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402958,2fb0a57aa7d428ed5fb032366e290d80ccf7e2bf,2018-08-26 14:10:00 UTC,cool,hold,730,760,760,NY,Pelham Manor,90,False,False,False,Gas
1402959,6a9da107d5c8f21074559c8b7e4f19e924eebd06,2018-08-22 18:50:00 UTC,cool,hold,765,760,760,NY,New York,0,False,False,False,Gas
1402960,a4a4ba18dad758e6a6ab4dca6bd5e0f5ee287eae,2018-08-24 15:50:00 UTC,cool,auto,717,740,760,NY,Staten Island,30,True,False,False,Gas
1402961,9b7a08345ec507fc3f885d6ec1033370c2fd7282,2018-08-01 10:45:00 UTC,cool,auto,758,770,760,NY,Pittsford,50,False,False,False,Gas


In [142]:
# Tag every row with its year and month (both stored as strings)
aug_2018 = aug_2018.assign(Year="2018", Month="aug")

In [143]:
# Label the three temperature columns as averages ahead of the aggregation
aug_2018 = aug_2018.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [144]:
# Average the numeric columns per (Identifier, Month, Year, HvacMode,
# CalendarEvent, City) group. numeric_only=True is explicit because the frame
# still holds text columns (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type)
# that have no mean; pandas >= 2.0 raises TypeError on them instead of
# silently dropping them.
aug_2018_ave = aug_2018.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [145]:
# Write the grouped averages to disk; index=True writes the group keys as columns
aug_2018_ave.to_csv(
    path_or_buf="data/day/NY/aug/aug_2018_ave.csv",
    index=True,
    header=True,
)

### 2019 August Day

In [146]:
# Load the 2019 August day CSV for NY
aug_2019 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2019-aug-day-NY.csv")

In [147]:
# Remove predetermined outliers before aggregating: discard every row whose
# TemperatureExpectedCool or TemperatureExpectedHeat value is >= 850 or <= 600.
aug_2019.drop(
    index=aug_2019[
        (aug_2019['TemperatureExpectedCool'] >= 850)
        | (aug_2019['TemperatureExpectedHeat'] >= 850)
        | (aug_2019['TemperatureExpectedCool'] <= 600)
        | (aug_2019['TemperatureExpectedHeat'] <= 600)
    ].index,
    inplace=True,
)

aug_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
1,292869d03577168770811bd3caa7d597e6dbb4c5,2019-08-04 15:45:00 UTC,cool,hold,749,745,745,NY,Depew,60,False,False,False,Gas
2,d7cb513d5d9bbd592a07ac86efbb042b2e443b20,2019-08-22 15:10:00 UTC,auto,hold,680,685,635,NY,Melville,15,False,False,False,Gas
4,d1dac7489ab745eb142d42541b883858478a0496,2019-08-21 15:05:00 UTC,auto,hold,726,732,672,NY,Somers,0,False,False,False,Gas
6,46f40f1d5ee732f09484d3ba1f2d045b06d91f3c,2019-08-04 11:55:00 UTC,cool,auto,695,710,645,NY,Highland,17,False,False,False,Gas
7,af104258d13f486c749bea5a8a2ce2609a669356,2019-08-13 13:25:00 UTC,cool,hold,736,669,669,NY,East Hills,0,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1805750,cfdeb339f0fc02bf493d525378d8b01363ecf6cf,2019-08-30 16:30:00 UTC,cool,hold,755,760,760,NY,New York,0,False,False,True,Electric
1805751,50584ebeaf60e8fa60b1a72a5ecf1cb0c3d7a400,2019-08-27 12:05:00 UTC,cool,hold,751,760,760,NY,Clarkstown,5,False,False,False,Gas
1805752,2143f21b1ff2c4ef1b4eff958ea93f090e003e50,2019-08-01 17:35:00 UTC,cool,auto,758,760,760,NY,New York,59,False,False,False,Gas
1805753,1d2d4f82f2bda26ce1ccd70ee5e3a44bfc27fc9a,2019-08-30 19:05:00 UTC,cool,hold,742,760,760,NY,Ballston,0,False,False,False,Gas


In [148]:
# Tag every row with its year and month (both stored as strings)
aug_2019 = aug_2019.assign(Year="2019", Month="aug")

In [149]:
# Label the three temperature columns as averages ahead of the aggregation
aug_2019 = aug_2019.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [150]:
# Average the numeric columns per (Identifier, Month, Year, HvacMode,
# CalendarEvent, City) group. numeric_only=True is explicit because the frame
# still holds text columns (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type)
# that have no mean; pandas >= 2.0 raises TypeError on them instead of
# silently dropping them.
aug_2019_ave = aug_2019.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [151]:
# Write the grouped averages to disk; index=True writes the group keys as columns
aug_2019_ave.to_csv(
    path_or_buf="data/day/NY/aug/aug_2019_ave.csv",
    index=True,
    header=True,
)

### 2020 August Day

In [152]:
# Load the 2020 August day CSV for NY
aug_2020 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2020-aug-day-NY.csv")

In [153]:
# Remove predetermined outliers before aggregating: discard every row whose
# TemperatureExpectedCool or TemperatureExpectedHeat value is >= 850 or <= 600.
aug_2020.drop(
    index=aug_2020[
        (aug_2020['TemperatureExpectedCool'] >= 850)
        | (aug_2020['TemperatureExpectedHeat'] >= 850)
        | (aug_2020['TemperatureExpectedCool'] <= 600)
        | (aug_2020['TemperatureExpectedHeat'] <= 600)
    ].index,
    inplace=True,
)

aug_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,39d6d4e1fdc416097485fd25ef70d9902f3ed8dd,2020-08-07 09:40:00 UTC,auto,auto,704,756,696,NY,Middle Grove,25,False,False,False,Gas
1,47f328464300e13e30989120c6f1705b43b77140,2020-08-24 17:05:00 UTC,cool,hold,708,711,711,NY,Bellmore,0,False,False,False,Gas
2,a17e2189508ea979c86bc112ec60b459d8e02982,2020-08-03 11:30:00 UTC,cool,hold,704,698,698,NY,Clifton Park,0,False,False,False,Gas
3,5532b9ff297f78d9e990533a1af63783b9bf35f1,2020-08-13 12:05:00 UTC,cool,hold,764,760,751,NY,Middletown,0,False,False,False,Gas
4,06878ebc6479aa003be1e7868bfa8f6272448662,2020-08-02 14:25:00 UTC,cool,hold,723,719,719,NY,East Hills,0,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1704625,98d4824bb8f285ab32a664e5fd3a3c0199ddf90e,2020-08-10 17:10:00 UTC,cool,auto,753,760,760,NY,Yonkers,70,False,False,False,Gas
1704626,7d46718119bee3dde487e5cc013cb3c6590f1a7b,2020-08-08 13:50:00 UTC,cool,auto,711,760,760,NY,Cornwall,50,False,False,False,Gas
1704627,1ccea1c76351aa558b22b9bfb0a11dc130d0e96a,2020-08-17 17:15:00 UTC,cool,auto,745,760,760,NY,Penfield,0,False,False,False,Gas
1704628,938d7df1ee594a38ed9aa76c28f4fb1bca230c4b,2020-08-18 15:35:00 UTC,cool,auto,763,760,760,NY,Rochester,0,True,False,False,Gas


In [154]:
# Tag every row with its year and month (both stored as strings)
aug_2020 = aug_2020.assign(Year="2020", Month="aug")

In [155]:
# Label the three temperature columns as averages ahead of the aggregation
aug_2020 = aug_2020.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [156]:
# Average the numeric columns per (Identifier, Month, Year, HvacMode,
# CalendarEvent, City) group. numeric_only=True is explicit because the frame
# still holds text columns (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type)
# that have no mean; pandas >= 2.0 raises TypeError on them instead of
# silently dropping them.
aug_2020_ave = aug_2020.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [157]:
# Write the grouped averages to disk; index=True writes the group keys as columns
aug_2020_ave.to_csv(
    path_or_buf="data/day/NY/aug/aug_2020_ave.csv",
    index=True,
    header=True,
)

---

### Combine month CSV Files 
1. Read in the per-year average CSV files for the month
2. Export as combined CSV

In [158]:
# Collect the names of the CSV files in the August output folder
files = list(filter(lambda name: name.endswith(".csv"), os.listdir("data/day/NY/aug/")))

In [159]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every per-year file once and concatenate them in a single call:
# calling pd.concat inside a loop re-copies the accumulated frame on each pass.
# Sorting the names makes the row order independent of os.listdir ordering.
# An empty file list still yields an empty DataFrame.
NY_aug = (
    pd.concat([pd.read_csv("data/day/NY/aug/" + name) for name in sorted(files)])
    if files
    else pd.DataFrame()
)

NY_aug

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,00ae124983451237520586ba85955c62dce37df5,aug,2017,cool,hold,Massapequa,721.751275,720.963173,720.963173,60.0,False,False,False
1,0121a65bb6b5c42b127417008cd2f09a34da5f75,aug,2017,cool,auto,Malverne,776.863585,778.931229,652.712514,70.0,False,False,False
2,0121a65bb6b5c42b127417008cd2f09a34da5f75,aug,2017,cool,hold,Malverne,745.455390,748.322181,748.120198,70.0,False,False,False
3,01abbe67487f0559d85464ca7c072ccd0fdd6a54,aug,2017,cool,hold,New Hartford,735.575758,728.303030,724.636364,55.0,False,False,False
4,01c8d408d81af596e2f83117506bc83b17b16cf8,aug,2017,cool,auto,New York,738.601423,759.927046,755.508897,9.0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1960,ffb2e05ac5e004ce171be4cfdd44498227d90db8,aug,2020,cool,hold,New City,738.119889,734.013860,733.896743,45.0,False,False,False
1961,ffe9e6321b8d90a7700b64229bd5f32098bc42f4,aug,2020,auto,auto,Hyde Park,738.880000,729.680000,670.000000,67.0,False,False,False
1962,ffe9e6321b8d90a7700b64229bd5f32098bc42f4,aug,2020,auto,hold,Hyde Park,706.818182,701.818182,621.818182,67.0,False,False,False
1963,fff8972a84c7d9efff59e9d2eaaa0a027e875d60,aug,2020,auto,auto,Port Washington,771.342960,773.205776,709.927798,20.0,False,False,False


In [160]:
# Save the combined August frame without the row index
NY_aug.to_csv(
    path_or_buf="Scraper_Output/State_Month_Day/NY/NY_aug.csv",
    index=False,
    header=True,
)

---

## September

---

## October

---

## November

---

## December

### 2017 December Day

In [161]:
# Load the 2017 December day CSV for NY
dec_2017 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2017-dec-day-NY.csv")

In [162]:
# Remove predetermined outliers before aggregating: discard every row whose
# TemperatureExpectedCool or TemperatureExpectedHeat value is >= 850 or <= 600.
dec_2017.drop(
    index=dec_2017[
        (dec_2017['TemperatureExpectedCool'] >= 850)
        | (dec_2017['TemperatureExpectedHeat'] >= 850)
        | (dec_2017['TemperatureExpectedCool'] <= 600)
        | (dec_2017['TemperatureExpectedHeat'] <= 600)
    ].index,
    inplace=True,
)

dec_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,7ded0e4a431fb2473a5eb144b737778016efd464,2017-12-27 12:20:00 UTC,heat,auto,661,662,662,NY,New York,5,False,False,False,Gas
1,78a444123ab48e329336bf58ff3f02342e37405e,2017-12-11 15:15:00 UTC,heat,hold,662,665,665,NY,New York,0,False,False,False,Gas
2,e2d43a159b8331302276ff9e8a39257f32081fb0,2017-12-08 18:55:00 UTC,heat,hold,522,656,656,NY,Easthampton,35,False,False,False,Gas
3,04ca00abd1cf2bde048982ac0b049c0c7d1f3797,2017-12-07 15:10:00 UTC,heat,auto,617,657,654,NY,Selkirk,15,False,False,False,Gas
4,4ef03b88b7b5bf8634691733eaa26a6b82f69bf3,2017-12-26 18:55:00 UTC,heat,hold,704,719,719,NY,Greenport,116,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106659,0603f7fe2f684cce51fa90570d7373a08e4e4ca2,2017-12-27 16:00:00 UTC,heat,hold,755,760,760,NY,Garden City Park,48,True,False,False,Gas
1106660,292869d03577168770811bd3caa7d597e6dbb4c5,2017-12-10 15:05:00 UTC,heat,hold,755,760,760,NY,Depew,60,False,False,False,Gas
1106661,3480c3fec8cdc0814006da09280d2938eccee8f2,2017-12-22 09:40:00 UTC,heat,auto,761,760,760,NY,OZONE PARK,100,False,False,False,Gas
1106662,0603f7fe2f684cce51fa90570d7373a08e4e4ca2,2017-12-27 16:45:00 UTC,heat,hold,754,760,760,NY,Garden City Park,48,True,False,False,Gas


In [163]:
# Tag every row with its year and month (both stored as strings)
dec_2017 = dec_2017.assign(Year="2017", Month="dec")

In [164]:
# Label the three temperature columns as averages ahead of the aggregation
dec_2017 = dec_2017.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [165]:
# Average the numeric columns per (Identifier, Month, Year, HvacMode,
# CalendarEvent, City) group. numeric_only=True is explicit because the frame
# still holds text columns (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type)
# that have no mean; pandas >= 2.0 raises TypeError on them instead of
# silently dropping them.
dec_2017_ave = dec_2017.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [166]:
# Write the grouped averages to disk; index=True writes the group keys as columns
dec_2017_ave.to_csv(
    path_or_buf="data/day/NY/dec/dec_2017_ave.csv",
    index=True,
    header=True,
)

### 2018 December Day

In [167]:
# Load the 2018 December day CSV for NY
dec_2018 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2018-dec-day-NY.csv")

In [168]:
# Remove predetermined outliers before aggregating: discard every row whose
# TemperatureExpectedCool or TemperatureExpectedHeat value is >= 850 or <= 600.
dec_2018.drop(
    index=dec_2018[
        (dec_2018['TemperatureExpectedCool'] >= 850)
        | (dec_2018['TemperatureExpectedHeat'] >= 850)
        | (dec_2018['TemperatureExpectedCool'] <= 600)
        | (dec_2018['TemperatureExpectedHeat'] <= 600)
    ].index,
    inplace=True,
)

dec_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
1,92609addf20fcec2b8dadee951a7c5981fa58ecf,2018-12-28 16:45:00 UTC,heat,hold,676,679,679,NY,New York,40,False,False,False,Gas
3,17f611e9f069e7e4a08d9c4e52d7b4669b198e72,2018-12-30 13:20:00 UTC,heat,hold,654,655,655,NY,Larchmont,80,False,False,False,Gas
4,937ef1884f1e5c7eae3e7034a40c37520c8ff548,2018-12-21 19:15:00 UTC,heat,hold,695,698,698,NY,New York,0,False,False,False,Gas
5,7c46f86ec171fb4d34bd2a7fd5892c68717ec1f4,2018-12-08 13:25:00 UTC,auto,auto,712,775,710,NY,Manlius,0,False,False,False,Gas
6,4fde3973dff9e99a3a181cdd941b263a7938caf4,2018-12-18 17:05:00 UTC,auto,hold,668,840,670,NY,Chappaqua,38,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1663267,5992e99509cb8f9b46f8ed25a4f0e1443b771d8f,2018-12-17 17:30:00 UTC,heat,hold,757,760,760,NY,New York,118,False,False,False,Gas
1663268,d65a403f916824b113899be3ba69051864f8b044,2018-12-03 12:45:00 UTC,heat,auto,786,760,760,NY,Huntington Bay,85,False,False,False,Gas
1663269,f84f6429f586b2dd6b08d9e8732859f503280602,2018-12-31 19:10:00 UTC,auto,hold,704,760,670,NY,Babylon,50,False,False,False,Gas
1663270,f84f6429f586b2dd6b08d9e8732859f503280602,2018-12-26 14:50:00 UTC,auto,hold,704,760,700,NY,Babylon,50,False,False,False,Gas


In [169]:
# Tag every row with its year and month (both stored as strings)
dec_2018 = dec_2018.assign(Year="2018", Month="dec")

In [170]:
# Label the three temperature columns as averages ahead of the aggregation
dec_2018 = dec_2018.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [171]:
# Average the numeric columns per (Identifier, Month, Year, HvacMode,
# CalendarEvent, City) group. numeric_only=True is explicit because the frame
# still holds text columns (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type)
# that have no mean; pandas >= 2.0 raises TypeError on them instead of
# silently dropping them.
dec_2018_ave = dec_2018.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [172]:
# Write the grouped averages to disk; index=True writes the group keys as columns
dec_2018_ave.to_csv(
    path_or_buf="data/day/NY/dec/dec_2018_ave.csv",
    index=True,
    header=True,
)

### 2019 December Day

In [173]:
# Load the 2019 December day CSV for NY
dec_2019 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2019-dec-day-NY.csv")

In [174]:
# Remove predetermined outliers before aggregating: discard every row whose
# TemperatureExpectedCool or TemperatureExpectedHeat value is >= 850 or <= 600.
dec_2019.drop(
    index=dec_2019[
        (dec_2019['TemperatureExpectedCool'] >= 850)
        | (dec_2019['TemperatureExpectedHeat'] >= 850)
        | (dec_2019['TemperatureExpectedCool'] <= 600)
        | (dec_2019['TemperatureExpectedHeat'] <= 600)
    ].index,
    inplace=True,
)

dec_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,7ded0e4a431fb2473a5eb144b737778016efd464,2019-12-07 14:20:00 UTC,heat,hold,697,698,698,NY,New York,5,False,False,False,Gas
1,652b48879974f8e28937dc6fa63132aa1c94acf7,2019-12-08 19:10:00 UTC,heat,hold,701,705,705,NY,Huntington,60,False,False,False,Gas
2,fc2554dc055755356a49aca66be600073e65cc42,2019-12-08 18:40:00 UTC,heat,hold,677,701,680,NY,Ronkonkoma,10,True,False,False,Gas
3,c8880d3bf569b5447eea8128d493bdcb3ba5497b,2019-12-05 19:15:00 UTC,heat,hold,648,704,653,NY,Manlius,15,False,False,False,Gas
4,2366159a5da3a846ba06785dcad2434389f75604,2019-12-28 12:05:00 UTC,heat,hold,666,662,662,NY,Wappingers Falls,10,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1861818,cad4a255ea99121256ec2be08cc2c9ca567f6ba2,2019-12-16 15:55:00 UTC,heat,hold,760,760,760,NY,Buffalo,95,False,False,False,Gas
1861819,82f1e853c293e5c2ce4cbe2d363a9d6a2878ec57,2019-12-03 11:15:00 UTC,auto,hold,666,760,670,NY,Manlius,0,False,False,False,Gas
1861820,10c28461873adc35141df26bed3208a8bf5c59e5,2019-12-02 19:40:00 UTC,heat,auto,755,760,760,NY,Merrick,0,False,False,False,Gas
1861821,668d732f50c033bd0592a501120e8e8e46f95408,2019-12-07 12:10:00 UTC,heat,auto,719,760,720,NY,Rochester,0,False,False,False,Gas


In [175]:
# Tag every row with its year and month (both stored as strings)
dec_2019 = dec_2019.assign(Year="2019", Month="dec")

In [176]:
# Label the three temperature columns as averages ahead of the aggregation
dec_2019 = dec_2019.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [177]:
# Average the numeric columns per (Identifier, Month, Year, HvacMode,
# CalendarEvent, City) group. numeric_only=True is explicit because the frame
# still holds text columns (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type)
# that have no mean; pandas >= 2.0 raises TypeError on them instead of
# silently dropping them.
dec_2019_ave = dec_2019.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [178]:
# Write the grouped averages to disk; index=True writes the group keys as columns
dec_2019_ave.to_csv(
    path_or_buf="data/day/NY/dec/dec_2019_ave.csv",
    index=True,
    header=True,
)

### 2020 December Day

In [179]:
# Load the 2020 December day CSV for NY
dec_2020 = pd.read_csv(filepath_or_buffer="../data_large/NY-day/2020-dec-day-NY.csv")

In [180]:
# Remove predetermined outliers before aggregating: discard every row whose
# TemperatureExpectedCool or TemperatureExpectedHeat value is >= 850 or <= 600.
dec_2020.drop(
    index=dec_2020[
        (dec_2020['TemperatureExpectedCool'] >= 850)
        | (dec_2020['TemperatureExpectedHeat'] >= 850)
        | (dec_2020['TemperatureExpectedCool'] <= 600)
        | (dec_2020['TemperatureExpectedHeat'] <= 600)
    ].index,
    inplace=True,
)

dec_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,28af4185c9f7a9f0973b8f3dcab239968019c7e0,2020-12-11 12:20:00 UTC,heat,hold,710,716,716,NY,Buffalo,120,False,False,False,Gas
1,7d1c1110ea36c03f156c9a1414d570688eac5de4,2020-12-20 16:20:00 UTC,heat,hold,691,708,708,NY,New York,90,True,False,True,Electric
2,e7bb30d05ddbec5ff8347f4b5e5b5b8a7640988c,2020-12-21 18:15:00 UTC,auto,hold,708,757,707,NY,Poughkeepsie,10,False,False,False,Gas
4,3749a00867560be8d5a8ba88b04dcb650719db02,2020-12-17 18:50:00 UTC,heat,hold,607,827,827,NY,Farmingvilli,0,False,False,False,Gas
5,a501b4cb07f0a30e48f9dcb98ee17f87ae909317,2020-12-10 16:25:00 UTC,heat,hold,681,685,685,NY,Onondaga,0,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586205,2059b26535651c6116acaf9c92b379c7922cdf74,2020-12-24 15:15:00 UTC,heat,hold,702,760,760,NY,Cowlesville,0,False,False,False,Gas
1586206,f880bde05f8cc5e1d45c535627f29bb8b584f032,2020-12-15 18:25:00 UTC,auto,auto,724,760,730,NY,New York,5,False,False,True,Electric
1586207,b1298ad3d77d1ae4e61e6d3ed78ac1d3368522b5,2020-12-20 17:50:00 UTC,heat,auto,717,760,720,NY,Penfield,15,False,False,False,Gas
1586208,f36b4673f0e869bf94cd430e996d18a6f69f443a,2020-12-12 17:05:00 UTC,heat,hold,730,760,760,NY,Huntington Bay,90,False,False,False,Gas


In [181]:
# Tag every row with its year and month (both stored as strings)
dec_2020 = dec_2020.assign(Year="2020", Month="dec")

In [182]:
# Label the three temperature columns as averages ahead of the aggregation
dec_2020 = dec_2020.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [183]:
# Average the numeric columns per (Identifier, Month, Year, HvacMode,
# CalendarEvent, City) group. numeric_only=True is explicit because the frame
# still holds text columns (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type)
# that have no mean; pandas >= 2.0 raises TypeError on them instead of
# silently dropping them.
dec_2020_ave = dec_2020.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [184]:
# Write the grouped averages to disk; index=True writes the group keys as columns
dec_2020_ave.to_csv(
    path_or_buf="data/day/NY/dec/dec_2020_ave.csv",
    index=True,
    header=True,
)

---

### Combine month CSV Files 
1. Read in the per-year average CSV files for the month
2. Export as combined CSV

In [185]:
# Collect the names of the CSV files in the December output folder
files = list(filter(lambda name: name.endswith(".csv"), os.listdir("data/day/NY/dec/")))

In [186]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every per-year file once and concatenate them in a single call:
# calling pd.concat inside a loop re-copies the accumulated frame on each pass.
# Sorting the names makes the row order independent of os.listdir ordering.
# An empty file list still yields an empty DataFrame.
NY_dec = (
    pd.concat([pd.read_csv("data/day/NY/dec/" + name) for name in sorted(files)])
    if files
    else pd.DataFrame()
)

NY_dec

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,0057073590b36f6c235dab2cc53cc8c001185b2b,dec,2017,heat,auto,Clifton Park,697.823834,692.595855,692.569948,60.0,False,False,False
1,0057073590b36f6c235dab2cc53cc8c001185b2b,dec,2017,heat,hold,Clifton Park,704.686401,708.262720,708.080481,60.0,False,False,False
2,00996606dc340bb60b08b0704486388082dc5b1f,dec,2017,heat,auto,Buffalo,666.903340,670.452652,670.442436,110.0,False,False,False
3,00996606dc340bb60b08b0704486388082dc5b1f,dec,2017,heat,hold,Buffalo,662.327586,666.568966,666.568966,110.0,False,False,False
4,00ae124983451237520586ba85955c62dce37df5,dec,2017,heat,auto,Massapequa,697.666667,700.000000,700.000000,60.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2415,ffac8c736a7d77c718747fc8eec9a847103d077f,dec,2020,auto,hold,New York,739.000000,726.000000,685.000000,45.0,False,False,False
2416,ffac8c736a7d77c718747fc8eec9a847103d077f,dec,2020,heat,auto,New York,727.596022,727.236626,727.236626,45.0,False,False,False
2417,ffac8c736a7d77c718747fc8eec9a847103d077f,dec,2020,heat,hold,New York,728.163586,717.730129,717.730129,45.0,False,False,False
2418,ffb2e05ac5e004ce171be4cfdd44498227d90db8,dec,2020,heat,hold,New City,731.739844,734.781250,734.781250,45.0,False,False,False


In [187]:
# Save the combined December frame without the row index
NY_dec.to_csv(
    path_or_buf="Scraper_Output/State_Month_Day/NY/NY_dec.csv",
    index=False,
    header=True,
)

----

----

---

### Combine state CSV Files 
1. Read in the combined monthly CSV files from the state's folder
2. Export as combined CSV

In [188]:
# Collect the names of the combined monthly CSV files in the NY output folder
files = list(
    filter(
        lambda name: name.endswith(".csv"),
        os.listdir("Scraper_Output/State_Month_Day/NY/"),
    )
)

In [189]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every monthly file once and concatenate them in a single call:
# calling pd.concat inside a loop re-copies the accumulated frame on each pass.
# Sorting the names makes the row order independent of os.listdir ordering.
# An empty file list still yields an empty DataFrame.
NY_all = (
    pd.concat(
        [
            pd.read_csv("Scraper_Output/State_Month_Day/NY/" + name)
            for name in sorted(files)
        ]
    )
    if files
    else pd.DataFrame()
)

NY_all

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,00ae124983451237520586ba85955c62dce37df5,aug,2017,cool,hold,Massapequa,721.751275,720.963173,720.963173,60.0,False,False,False
1,0121a65bb6b5c42b127417008cd2f09a34da5f75,aug,2017,cool,auto,Malverne,776.863585,778.931229,652.712514,70.0,False,False,False
2,0121a65bb6b5c42b127417008cd2f09a34da5f75,aug,2017,cool,hold,Malverne,745.455390,748.322181,748.120198,70.0,False,False,False
3,01abbe67487f0559d85464ca7c072ccd0fdd6a54,aug,2017,cool,hold,New Hartford,735.575758,728.303030,724.636364,55.0,False,False,False
4,01c8d408d81af596e2f83117506bc83b17b16cf8,aug,2017,cool,auto,New York,738.601423,759.927046,755.508897,9.0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8921,ff29a4f110b99c56bc7bb98688f62a60dc2d6382,jun,2021,cool,hold,Oyster Bay,734.713217,728.241895,728.241895,60.0,False,False,False
8922,ff7af27aaa6950c6ba52b7a7585516304d996fa9,jun,2021,cool,hold,Greenfield,709.322344,712.897436,712.893773,40.0,False,False,False
8923,ffac8c736a7d77c718747fc8eec9a847103d077f,jun,2021,auto,hold,New York,734.613119,738.661312,652.597055,45.0,False,False,False
8924,ffb2e05ac5e004ce171be4cfdd44498227d90db8,jun,2021,cool,hold,New City,743.166667,740.758961,740.738351,45.0,False,False,False


In [190]:
# Save the combined all-months NY frame without the row index
NY_all.to_csv(
    path_or_buf="Scraper_Output/State_Month_Day/NY_all_day.csv",
    index=False,
    header=True,
)

In [191]:
# Datacheck to make sure state was selected correctly in BQ sql queries:
# print the distinct ProvinceState values of every loaded month/year frame.
# jan, feb, jun and jul cover 2017-2021; aug and dec cover 2017-2020.
years_by_month = (
    ("jan", range(2017, 2022)),
    ("feb", range(2017, 2022)),
    ("jun", range(2017, 2022)),
    ("jul", range(2017, 2022)),
    ("aug", range(2017, 2021)),
    ("dec", range(2017, 2021)),
)

for month, years in years_by_month:
    for year in years:
        frame_name = f"{month}_{year}"
        # The frames live as notebook-level variables named <month>_<year>
        print(f"Unique {frame_name}: {globals()[frame_name]['ProvinceState'].unique()}")

Unique jan_2017: ['NY']
Unique jan_2018: ['NY']
Unique jan_2019: ['NY']
Unique jan_2020: ['NY']
Unique jan_2021: ['NY']
Unique feb_2017: ['NY']
Unique feb_2018: ['NY']
Unique feb_2019: ['NY']
Unique feb_2020: ['NY']
Unique feb_2021: ['NY']
Unique jun_2017: ['NY']
Unique jun_2018: ['NY']
Unique jun_2019: ['NY']
Unique jun_2020: ['NY']
Unique jun_2021: ['NY']
Unique jul_2017: ['NY']
Unique jul_2018: ['NY']
Unique jul_2019: ['NY']
Unique jul_2020: ['NY']
Unique jul_2021: ['NY']
Unique aug_2017: ['NY']
Unique aug_2018: ['NY']
Unique aug_2019: ['NY']
Unique aug_2020: ['NY']
Unique dec_2017: ['NY']
Unique dec_2018: ['NY']
Unique dec_2019: ['NY']
Unique dec_2020: ['NY']
