# DYD Thermostat Data 

## Preprocess

1. Generated CSV files from queries in BigQuery

2. Data separated into states

3. Aggregated the data in Pandas by month

4. Combine the yearly files (2017–2021)

5. Group by Identifier



In [1]:
# Dependencies
import pandas as pd
import os
import numpy as np
from pathlib import Path
from datetime import datetime

---
## January

### 2017 January Day

In [2]:
# Load the January 2017 "day" CSV for CA (the input for this section).
jan_2017_path = "../data_large/CA-day/2017-jan-day-CA.csv"
jan_2017 = pd.read_csv(jan_2017_path)

In [3]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = jan_2017['TemperatureExpectedCool']
expected_heat = jan_2017['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
jan_2017.drop(jan_2017[outlier_rows].index, inplace=True)

jan_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,c3c911d32ef968b328a23fa926a873e34ae3c8b1,2017-01-04 15:30:00 UTC,heat,hold,695,712,712,CA,Granite Bay,26,False,False,False,Gas
1,11e2b6321be7d24f65aec1d0face0a8e28093e22,2017-01-30 15:05:00 UTC,heat,hold,689,715,715,CA,La Mirada,0,False,False,False,Gas
2,7ab1d598dcae665e7c0d8b1ec7f3aceb2c677b3e,2017-01-25 14:50:00 UTC,auto,auto,648,685,635,CA,San Diego,0,False,False,False,Gas
3,578f376ca60259788b6c3b50cf1b60085cd1a989,2017-01-24 15:55:00 UTC,auto,hold,680,815,680,CA,San Jose,5,False,False,False,Gas
4,a8cf9ec2d96747648fd94ebd4edf7a19b15ec2f4,2017-01-04 07:10:00 UTC,auto,auto,737,785,735,CA,????,56,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045362,a567b82369a68c588c9b098c520f2fa14fa7ae91,2017-01-08 18:00:00 UTC,auto,hold,710,765,715,CA,Milpitas,60,False,False,False,Gas
1045363,6edf812205e7a38ce079d46fefa9299bcf012b33,2017-01-23 15:30:00 UTC,auto,hold,706,765,705,CA,Encinitas,0,False,False,False,Gas
1045364,aef1fdf8f8a4ada110ed8fa3a34eb371a731da1e,2017-01-26 17:20:00 UTC,auto,hold,681,765,685,CA,Wildomar,30,True,False,False,Gas
1045365,07e1d4139d075831aeb06ec4afaf45915254402e,2017-01-28 14:45:00 UTC,auto,auto,711,765,715,CA,Lemoore,5,False,False,False,Gas


In [4]:
# Label every row with its year and month; both become grouping keys below.
jan_2017 = jan_2017.assign(Year="2017", Month="Jan")

In [5]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jan_2017 = jan_2017.rename(columns=ave_labels)

In [6]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
jan_2017_ave = jan_2017.groupby(group_keys).mean(numeric_only=True)

jan_2017_ave

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
Identifier,Month,Year,HvacMode,CalendarEvent,City,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
002aab87db1d9e8ce470f80f23c97c59de5f4dcd,Jan,2017,heat,auto,San Marcos,688.352941,700.277311,685.243697,15.0,False,False,False
002aab87db1d9e8ce470f80f23c97c59de5f4dcd,Jan,2017,heat,hold,San Marcos,700.931973,700.000000,679.455782,15.0,False,False,False
0050d013d4928f706c90379b28f9ab257178f1a8,Jan,2017,heat,auto,Dublin,669.000000,682.692308,676.153846,5.0,False,False,False
0050d013d4928f706c90379b28f9ab257178f1a8,Jan,2017,heat,hold,Dublin,650.807692,653.865385,639.250000,5.0,False,False,False
007f930abce4596b2214cd79a09714b3d482e17d,Jan,2017,heat,hold,Escondido,727.000000,796.000000,730.000000,0.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
ff7fa20dc4a160fba797e8a11b8b47ccc8494686,Jan,2017,heat,hold,San Diego,691.163158,665.352632,647.900000,55.0,False,False,False
ffe0ca2d4420d9d31d0fd8fa1b971cf43f393141,Jan,2017,heat,auto,Chico,679.679191,651.667630,651.667630,0.0,False,False,False
ffe0ca2d4420d9d31d0fd8fa1b971cf43f393141,Jan,2017,heat,hold,Chico,682.129797,667.002257,666.945824,0.0,False,False,False
fff157010dde22d7786354c4f9c5a2ea80365f5c,Jan,2017,heat,auto,Ladera Ranch,726.733656,723.428571,723.428571,0.0,False,False,False


In [7]:
# Save the January 2017 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
jan_2017_ave_csv = "data/day/CA/jan/jan_2017_ave.csv"
jan_2017_ave.to_csv(jan_2017_ave_csv, header=True, index=True)

### 2018 January Day

In [8]:
# Load the January 2018 "day" CSV for CA (the input for this section).
jan_2018_path = "../data_large/CA-day/2018-jan-day-CA.csv"
jan_2018 = pd.read_csv(jan_2018_path)

In [9]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = jan_2018['TemperatureExpectedCool']
expected_heat = jan_2018['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
jan_2018.drop(jan_2018[outlier_rows].index, inplace=True)

jan_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,4b15e373242ca1f170dbe5cbc6c987302fbe4a0f,2018-01-08 18:50:00 UTC,auto,hold,685,766,686,CA,Madera,10,False,False,False,Gas
1,f16e1a6c0dd8ab904ed439d4c7024d406054792d,2018-01-23 16:00:00 UTC,auto,hold,739,815,745,CA,Simi Valley,60,False,False,False,Gas
2,ed708891bbdb5bcabc8a0f3c384a05e151d20614,2018-01-21 15:35:00 UTC,heat,hold,664,665,665,CA,Huntington Beach,37,False,False,False,Gas
3,4eea7f9b8141b0dea251c535c290fb07e823bd33,2018-01-18 15:40:00 UTC,heat,hold,681,684,684,CA,Laguna Niguel,0,False,False,False,Gas
4,3e4ff830f6d2121599881b594a9bba483c1ce7d8,2018-01-29 15:15:00 UTC,auto,auto,711,685,655,CA,San Diego,90,True,False,True,Electric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284255,570fc567bcd25890bde663bdaa17fc40eb7fe81e,2018-01-09 14:55:00 UTC,auto,hold,716,765,715,CA,San Diego,5,False,False,False,Gas
2284256,f33dc2a8f4114bfe597d361a068151059bad45cf,2018-01-09 16:20:00 UTC,auto,hold,717,765,715,CA,Los Angeles,49,True,False,False,Gas
2284257,570fc567bcd25890bde663bdaa17fc40eb7fe81e,2018-01-07 14:20:00 UTC,auto,auto,706,765,705,CA,San Diego,5,False,False,False,Gas
2284258,570fc567bcd25890bde663bdaa17fc40eb7fe81e,2018-01-26 17:55:00 UTC,auto,hold,715,765,715,CA,San Diego,5,False,False,False,Gas


In [10]:
# Label every row with its year and month; both become grouping keys below.
jan_2018 = jan_2018.assign(Year="2018", Month="Jan")


In [11]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jan_2018 = jan_2018.rename(columns=ave_labels)

In [12]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
jan_2018_ave = jan_2018.groupby(group_keys).mean(numeric_only=True)

In [13]:
# Save the January 2018 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
jan_2018_ave_csv = "data/day/CA/jan/jan_2018_ave.csv"
jan_2018_ave.to_csv(jan_2018_ave_csv, header=True, index=True)

### 2019 January Day

In [14]:
# Load the January 2019 "day" CSV for CA (the input for this section).
jan_2019_path = "../data_large/CA-day/2019-jan-day-CA.csv"
jan_2019 = pd.read_csv(jan_2019_path)

In [15]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = jan_2019['TemperatureExpectedCool']
expected_heat = jan_2019['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
jan_2019.drop(jan_2019[outlier_rows].index, inplace=True)

jan_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,98fee782c5bda6669a41d076d951c16fc5420caa,2019-01-20 16:55:00 UTC,auto,hold,717,795,720,CA,Rancho Mirage,30,False,False,False,Gas
1,a3081fce8b5cd9b06c7a4cff3a2ca3f56b305d16,2019-01-31 15:00:00 UTC,auto,hold,727,775,725,CA,San Diego,0,False,False,False,Gas
2,7f879600f0cff30ae1965dbe9a18d8638ace6eaa,2019-01-16 07:25:00 UTC,auto,hold,683,729,642,CA,Menlo Park,10,False,False,False,Gas
3,17e94d1deb59ffd446194e73f91965fa5010b847,2019-01-11 15:55:00 UTC,auto,hold,653,836,611,CA,Santa Ana,60,False,False,False,Gas
5,07ed43bb8770fc3d1cf1f7dbb1398a870be984a9,2019-01-13 13:10:00 UTC,heat,hold,709,719,710,CA,Tracy,5,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3605222,07a9f6fe101330e2f5d9e3ce7edd8866f9c1f088,2019-01-06 08:35:00 UTC,auto,auto,676,760,650,CA,Castro Valley,55,True,False,False,Gas
3605223,cef32685bc5887e99d38bb01c29d80f416d9d34c,2019-01-26 09:20:00 UTC,auto,auto,693,760,700,CA,San Diego,35,False,False,True,Electric
3605224,b010485db5726043a7ebe50f43912e7f6e01c1f9,2019-01-29 17:20:00 UTC,heat,hold,756,760,760,CA,Suisun City,25,False,False,False,Gas
3605225,979cf9fb6816f7071e2dae929d521ab4787e3409,2019-01-05 17:00:00 UTC,auto,auto,686,760,690,CA,Mission Viejo,15,False,False,False,Gas


In [16]:
# Label every row with its year and month; both become grouping keys below.
jan_2019 = jan_2019.assign(Year="2019", Month="Jan")


In [17]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jan_2019 = jan_2019.rename(columns=ave_labels)

In [18]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
jan_2019_ave = jan_2019.groupby(group_keys).mean(numeric_only=True)

In [19]:
# Save the January 2019 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
jan_2019_ave_csv = "data/day/CA/jan/jan_2019_ave.csv"
jan_2019_ave.to_csv(jan_2019_ave_csv, header=True, index=True)

### 2020 January Day

In [20]:
# Load the January 2020 "day" CSV for CA (the input for this section).
jan_2020_path = "../data_large/CA-day/2020-jan-day-CA.csv"
jan_2020 = pd.read_csv(jan_2020_path)

In [21]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = jan_2020['TemperatureExpectedCool']
expected_heat = jan_2020['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
jan_2020.drop(jan_2020[outlier_rows].index, inplace=True)

jan_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,02b6640c479be7ca15b050180d24ee4220b5b512,2020-01-11 16:10:00 UTC,heat,auto,685,690,690,CA,Canyon Country,10,False,False,False,Gas
2,82594cabb821b78a80b33e6019314f23ddeb8988,2020-01-23 16:50:00 UTC,heat,hold,663,660,660,CA,San Diego,20,False,False,False,Gas
3,c2d64a0ecdcc913a76b9c59f3541ccdd668bf7c2,2020-01-17 18:45:00 UTC,heat,auto,719,710,720,CA,Berkeley,107,False,False,False,Gas
4,cdcb5b779271ffc2ed951e8ff86f60453edcdd1e,2020-01-01 17:40:00 UTC,heat,auto,707,710,710,CA,Daly City,0,False,False,False,Gas
5,68842419dc8f809136fe38505991692e253103f0,2020-01-08 16:40:00 UTC,heat,hold,708,710,710,CA,Los Angeles,0,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3882980,e54028e54532f5a5e098af9813ced16cb8712e28,2020-01-23 15:00:00 UTC,heat,hold,721,725,725,CA,Irvine,9,True,False,False,Gas
3882981,ef3f4cf01293b56f8d5b95caae527e5f54ac8ce0,2020-01-01 18:50:00 UTC,auto,hold,634,739,639,CA,San Jose,50,False,False,False,Gas
3882982,7ef3b42571f9cbcfee6bb679415f2436f30fc363,2020-01-30 18:45:00 UTC,auto,auto,644,720,650,CA,Fresno,60,False,False,False,Gas
3882983,792b6719b1f24d6a08674d674716e8f334af71f0,2020-01-02 07:05:00 UTC,heat,hold,691,690,690,CA,Canyon Lake,30,False,False,True,Electric


In [22]:
# Label every row with its year and month; both become grouping keys below.
jan_2020 = jan_2020.assign(Year="2020", Month="Jan")


In [23]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jan_2020 = jan_2020.rename(columns=ave_labels)

In [24]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
jan_2020_ave = jan_2020.groupby(group_keys).mean(numeric_only=True)

In [25]:
# Save the January 2020 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
jan_2020_ave_csv = "data/day/CA/jan/jan_2020_ave.csv"
jan_2020_ave.to_csv(jan_2020_ave_csv, header=True, index=True)

### 2021 January Day

In [26]:
# Load the January 2021 "day" CSV for CA (the input for this section).
jan_2021_path = "../data_large/CA-day/2021-jan-day-CA.csv"
jan_2021 = pd.read_csv(jan_2021_path)

In [27]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = jan_2021['TemperatureExpectedCool']
expected_heat = jan_2021['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
jan_2021.drop(jan_2021[outlier_rows].index, inplace=True)

jan_2021

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,cbfd9f933bf4333c81101bfc3432d933f1498818,2021-01-26 19:10:00 UTC,heat,hold,632,735,735,CA,Selma,19,False,False,False,Gas
1,fde2c5d805393795a40fc530cf4e8cc0980de7b1,2021-01-27 18:50:00 UTC,auto,hold,662,775,645,CA,Jurupa Valley,0,True,False,False,Gas
2,155a0954ffe00d885e978a54e2b29b049fc8e9d5,2021-01-16 17:30:00 UTC,auto,hold,716,849,620,CA,San Jose,0,False,False,False,Gas
3,20230efb3214466f8c5fcad3f6d243274b8b3358,2021-01-30 17:15:00 UTC,auto,hold,655,735,660,CA,Stanton,49,False,False,False,Gas
4,36497337558d1292730dd5db54219722a51e86e2,2021-01-17 12:35:00 UTC,heat,hold,734,685,685,CA,Ventura,35,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2239390,009e91eaed297a685d53a6a82b63ab2123750dc8,2021-01-13 19:30:00 UTC,auto,hold,663,760,660,CA,Anaheim,0,False,False,False,Gas
2239391,4c6567cbab076f7596085660b8a6694c026a7d38,2021-01-13 11:00:00 UTC,heat,hold,665,760,670,CA,san diego,45,False,False,False,Gas
2239392,d3ead6cff4181cb6eae29d470ac9f1cd72bfef58,2021-01-15 19:55:00 UTC,auto,hold,679,760,680,CA,Antioch,58,False,False,False,Gas
2239393,2dfbd1dd5acd0ae6057abc7a173fd4d30ff8366f,2021-01-10 15:30:00 UTC,auto,hold,710,760,710,CA,Simi Valley,15,False,False,False,Gas


In [28]:
# Label every row with its year and month; both become grouping keys below.
jan_2021 = jan_2021.assign(Year="2021", Month="Jan")


In [29]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
jan_2021 = jan_2021.rename(columns=ave_labels)

In [30]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
jan_2021_ave = jan_2021.groupby(group_keys).mean(numeric_only=True)

In [31]:
# Save the January 2021 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
jan_2021_ave_csv = "data/day/CA/jan/jan_2021_ave.csv"
jan_2021_ave.to_csv(jan_2021_ave_csv, header=True, index=True)

---

### Combine month CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [32]:
# List the per-year January CSV exports. os.listdir returns names in
# arbitrary order, so sort them to keep the row order of the combined file
# reproducible between runs and machines.
files = sorted(f for f in os.listdir("data/day/CA/jan/") if f.endswith(".csv"))

In [33]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every yearly January file, then stack them with ONE concat call;
# concatenating inside the loop re-copies the accumulated rows on each pass.
# Each file keeps its own row index, as before.
monthly_frames = [pd.read_csv("data/day/CA/jan/" + file) for file in files]

# pd.concat raises ValueError on an empty list, so fall back to the empty
# frame the loop version produced when no files were found.
CA_jan = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

CA_jan

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,Jan,2017,heat,auto,San Marcos,688.352941,700.277311,685.243697,15.0,False,False,False
1,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,Jan,2017,heat,hold,San Marcos,700.931973,700.000000,679.455782,15.0,False,False,False
2,0050d013d4928f706c90379b28f9ab257178f1a8,Jan,2017,heat,auto,Dublin,669.000000,682.692308,676.153846,5.0,False,False,False
3,0050d013d4928f706c90379b28f9ab257178f1a8,Jan,2017,heat,hold,Dublin,650.807692,653.865385,639.250000,5.0,False,False,False
4,007f930abce4596b2214cd79a09714b3d482e17d,Jan,2017,heat,hold,Escondido,727.000000,796.000000,730.000000,0.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,ffab10c348bdc935f2b26c06b99e75d237f5cead,Jan,2021,heat,hold,Visalia,678.816794,680.000000,680.000000,97.0,False,False,False
4597,ffc5939f7005156793fd751db9de2b35320a97e6,Jan,2021,auto,hold,Los Angeles,672.084677,790.000000,670.000000,59.0,False,False,False
4598,ffd99f8a17ac8f96739ceba7c0feca750b886605,Jan,2021,heat,hold,San Diego,679.697479,681.742297,681.675070,40.0,False,False,False
4599,ffdb74abccd27a50a47d411800653c4b0e702b49,Jan,2021,heat,hold,San Diego,671.068702,669.870229,656.393130,9.0,True,False,False


In [34]:
# Write the combined January table; index=False leaves the row index out.
ca_jan_csv = "Scraper_Output/State_Month_Day/CA/CA_jan.csv"
CA_jan.to_csv(ca_jan_csv, header=True, index=False)

---

## February

### 2017 February Day

In [35]:
# Load the February 2017 "day" CSV for CA (the input for this section).
feb_2017_path = "../data_large/CA-day/2017-feb-day-CA.csv"
feb_2017 = pd.read_csv(feb_2017_path)

In [36]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = feb_2017['TemperatureExpectedCool']
expected_heat = feb_2017['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
feb_2017.drop(feb_2017[outlier_rows].index, inplace=True)

feb_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,45254089f32bbd9355ddb45ca91a4940a655a361,2017-02-23 18:20:00 UTC,auto,auto,666,715,665,CA,Folsom,15,False,False,False,Gas
1,c8309655a8242cf7ae0091329ae2e7b31f05c80b,2017-02-25 16:05:00 UTC,auto,auto,634,705,655,CA,Del Mar,35,False,False,True,Electric
2,6b84763d23ae963ad3963c77ce2bea62ca4a8aae,2017-02-01 09:15:00 UTC,heat,hold,765,752,752,CA,Azusa,0,True,False,True,Electric
3,b5d54e8251d790a84bd551fa3478ae49c4964da3,2017-02-05 17:45:00 UTC,auto,hold,694,840,690,CA,Carlsbad,15,False,False,False,Gas
4,d07ad1bc2038ab86abf5e29dda0c89c4f0e6f681,2017-02-05 18:00:00 UTC,heat,hold,682,684,684,CA,Anaheim,0,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902737,0722d8c9c144b0040b5b3e2382cf41a73fd0630e,2017-02-22 12:30:00 UTC,heat,hold,766,760,760,CA,Glendale,45,False,False,False,Gas
902738,e77a4f05b42341eaa2824239d18fed4849e4ebe7,2017-02-06 17:25:00 UTC,auto,hold,720,760,690,CA,Aliso Viejo,0,False,False,False,Gas
902739,243532ea22e627e7b3eb8149403e4b755c443a6f,2017-02-26 17:05:00 UTC,auto,hold,675,760,680,CA,Rancho Cucamonga,46,False,False,False,Gas
902740,260911cc7cb77afc05843901f4370cd5711cab7b,2017-02-22 16:20:00 UTC,heat,auto,716,760,680,CA,Huntington Beach,20,False,False,False,Gas


In [37]:
# Label every row with its year and month; both become grouping keys below.
# NOTE(review): the January cells write title-case "Jan" while this label is
# lowercase "feb" -- confirm which form downstream consumers expect.
feb_2017 = feb_2017.assign(Year="2017", Month="feb")

In [38]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
feb_2017 = feb_2017.rename(columns=ave_labels)

In [39]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
feb_2017_ave = feb_2017.groupby(group_keys).mean(numeric_only=True)

In [40]:
# Save the February 2017 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
feb_2017_ave_csv = "data/day/CA/feb/feb_2017_ave.csv"
feb_2017_ave.to_csv(feb_2017_ave_csv, header=True, index=True)

### 2018 February Day

In [41]:
# Load the February 2018 "day" CSV for CA (the input for this section).
feb_2018_path = "../data_large/CA-day/2018-feb-day-CA.csv"
feb_2018 = pd.read_csv(feb_2018_path)

In [42]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = feb_2018['TemperatureExpectedCool']
expected_heat = feb_2018['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
feb_2018.drop(feb_2018[outlier_rows].index, inplace=True)

feb_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,3ae161afd0cae357589532af4b1c92154e5fc367,2018-02-27 16:45:00 UTC,heat,hold,723,705,705,CA,san jose,40,False,False,False,Gas
1,be83902be5186eb6db59c8469b8650da4d85fe3e,2018-02-06 14:35:00 UTC,heat,hold,721,689,689,CA,San Diego,0,False,False,False,Gas
2,73acfdfc0fc3598d661c949e2f5656ec0d90d953,2018-02-26 16:25:00 UTC,heat,hold,684,685,685,CA,Dana Point,0,False,False,False,Gas
5,4b15e373242ca1f170dbe5cbc6c987302fbe4a0f,2018-02-17 15:05:00 UTC,auto,hold,691,766,696,CA,Madera,10,False,False,False,Gas
6,3c3eb47c6c9edb00e10a183812e09757c6be4ef5,2018-02-01 17:55:00 UTC,auto,hold,735,785,735,CA,Oakley,19,True,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121603,64dca3d3cc71ca93f42af1565021bec53a52afbd,2018-02-21 07:00:00 UTC,auto,hold,712,765,715,CA,Yuba City,45,False,False,False,Gas
2121604,daccf2379e81b00365e277c53232cd92872d0ffc,2018-02-06 11:45:00 UTC,auto,hold,721,765,715,CA,Yorba Linda,35,False,False,False,Gas
2121605,daccf2379e81b00365e277c53232cd92872d0ffc,2018-02-06 09:20:00 UTC,auto,hold,730,765,715,CA,Yorba Linda,35,False,False,False,Gas
2121606,ce5b8c6cad13dab2df13fe5dac0dfd8e7247fd54,2018-02-07 19:10:00 UTC,cool,hold,727,765,765,CA,Indio,0,False,False,False,Gas


In [43]:
# Label every row with its year and month; both become grouping keys below.
# NOTE(review): the January cells write title-case "Jan" while this label is
# lowercase "feb" -- confirm which form downstream consumers expect.
feb_2018 = feb_2018.assign(Year="2018", Month="feb")


In [44]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
feb_2018 = feb_2018.rename(columns=ave_labels)

In [45]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
feb_2018_ave = feb_2018.groupby(group_keys).mean(numeric_only=True)

In [46]:
# Save the February 2018 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
feb_2018_ave_csv = "data/day/CA/feb/feb_2018_ave.csv"
feb_2018_ave.to_csv(feb_2018_ave_csv, header=True, index=True)

### 2019 February Day

In [47]:
# Load the February 2019 "day" CSV for CA (the input for this section).
feb_2019_path = "../data_large/CA-day/2019-feb-day-CA.csv"
feb_2019 = pd.read_csv(feb_2019_path)

In [48]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = feb_2019['TemperatureExpectedCool']
expected_heat = feb_2019['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
feb_2019.drop(feb_2019[outlier_rows].index, inplace=True)

feb_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
1,160a0ddccca14d4e33c84a573c21bbb8734f007a,2019-02-26 18:15:00 UTC,heat,hold,694,695,695,CA,Citrus Heights,60,False,False,False,Gas
2,28fbb1b28f80b0a653b828b7e493630da46bdaf5,2019-02-25 07:20:00 UTC,heat,hold,733,695,695,CA,Sierra Madre,50,False,False,False,Gas
3,42f02aace9cf00e2fe17bede69b29f0ca2fe9604,2019-02-21 17:45:00 UTC,auto,hold,722,784,719,CA,San Diego,5,False,False,False,Gas
4,3633e86ecf6a2db403c0aeabc30b730e88e4f0ba,2019-02-01 15:50:00 UTC,auto,hold,722,815,725,CA,San Jose,20,False,False,False,Gas
5,0da28b13780f05f670ff1869a7507d45191e2fad,2019-02-03 15:10:00 UTC,heat,hold,698,698,698,CA,Palm Springs,19,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2482755,0fa6ce44d6c7ea62ea3197cb2c67566a98511582,2019-02-08 07:05:00 UTC,auto,hold,662,760,630,CA,Bakersfield,0,True,False,False,Gas
2482756,533b68fa31ecf2adcc66474460d4934c9680e1fe,2019-02-20 19:30:00 UTC,auto,hold,689,760,690,CA,Walnut Creek,10,False,False,False,Gas
2482757,9b295b5343ee3481373afe387e8129bd56979085,2019-02-15 14:30:00 UTC,auto,auto,693,760,690,CA,Roseville,20,False,False,False,Gas
2482758,c3a241bbb12a59ad2c01aad898a4ea87366b8248,2019-02-06 15:50:00 UTC,auto,auto,707,760,710,CA,Fresno,5,False,False,False,Gas


In [49]:
# Label every row with its year and month; both become grouping keys below.
# NOTE(review): the January cells write title-case "Jan" while this label is
# lowercase "feb" -- confirm which form downstream consumers expect.
feb_2019 = feb_2019.assign(Year="2019", Month="feb")


In [50]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
feb_2019 = feb_2019.rename(columns=ave_labels)

In [51]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
feb_2019_ave = feb_2019.groupby(group_keys).mean(numeric_only=True)

In [52]:
# Save the February 2019 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
feb_2019_ave_csv = "data/day/CA/feb/feb_2019_ave.csv"
feb_2019_ave.to_csv(feb_2019_ave_csv, header=True, index=True)

### 2020 February Day

In [53]:
# Load the February 2020 "day" CSV for CA (the input for this section).
feb_2020_path = "../data_large/CA-day/2020-feb-day-CA.csv"
feb_2020 = pd.read_csv(feb_2020_path)

In [54]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = feb_2020['TemperatureExpectedCool']
expected_heat = feb_2020['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
feb_2020.drop(feb_2020[outlier_rows].index, inplace=True)

feb_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,40c69f41f2dce00f163c56d397d17de07c6c5589,2020-02-04 19:05:00 UTC,heat,hold,648,675,675,CA,Mira Loma,18,False,False,False,Gas
1,b03219b29e9755769f31877c800a8eefde4b7a23,2020-02-11 19:15:00 UTC,heat,hold,675,671,671,CA,Santa Clarita,59,False,False,False,Gas
2,32a372cead12f97847c601326c7522895fcf62a8,2020-02-03 19:15:00 UTC,heat,auto,653,620,620,CA,Whittier,10,False,False,False,Gas
3,87e11722f5b5fd3f5eb62c047f1e01eb34c00cf4,2020-02-13 10:20:00 UTC,heat,auto,667,652,630,CA,Antioch,39,True,False,False,Gas
4,d3bcbe5aef85ad089843b9030c3f275126a13daa,2020-02-15 19:30:00 UTC,heat,hold,704,698,620,CA,Indio,9,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3222972,5b337f5b594d1b29a5fe39320d19f816214543cf,2020-02-15 18:40:00 UTC,auto,hold,687,765,695,CA,Santa Monica,45,False,False,False,Gas
3222973,8074dfd13ba7baad9c02f0ad48e3019ce2380d47,2020-02-18 17:40:00 UTC,auto,hold,713,765,725,CA,Oak View,50,False,False,False,Gas
3222974,3977ab46b62e64fa8cf51a958b1e77ceb29c3b43,2020-02-17 07:05:00 UTC,auto,hold,740,765,715,CA,Riverside,0,True,False,False,Gas
3222975,3977ab46b62e64fa8cf51a958b1e77ceb29c3b43,2020-02-13 19:35:00 UTC,auto,hold,726,765,715,CA,Riverside,0,True,False,False,Gas


In [55]:
# Label every row with its year and month; both become grouping keys below.
# NOTE(review): the January cells write title-case "Jan" while this label is
# lowercase "feb" -- confirm which form downstream consumers expect.
feb_2020 = feb_2020.assign(Year="2020", Month="feb")


In [56]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
feb_2020 = feb_2020.rename(columns=ave_labels)

In [57]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
feb_2020_ave = feb_2020.groupby(group_keys).mean(numeric_only=True)

In [58]:
# Save the February 2020 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
feb_2020_ave_csv = "data/day/CA/feb/feb_2020_ave.csv"
feb_2020_ave.to_csv(feb_2020_ave_csv, header=True, index=True)

### 2021 February Day

In [59]:
# Load the February 2021 "day" CSV for CA (the input for this section).
feb_2021_path = "../data_large/CA-day/2021-feb-day-CA.csv"
feb_2021 = pd.read_csv(feb_2021_path)

In [60]:
# Remove predetermined outliers before aggregating.
# A row is removed when EITHER TemperatureExpectedCool or
# TemperatureExpectedHeat is >= 850 or <= 600.
expected_cool = feb_2021['TemperatureExpectedCool']
expected_heat = feb_2021['TemperatureExpectedHeat']
outlier_rows = (
    (expected_cool >= 850) | (expected_heat >= 850)
    | (expected_cool <= 600) | (expected_heat <= 600)
)

# One in-place drop instead of four; the same rows are removed.
feb_2021.drop(feb_2021[outlier_rows].index, inplace=True)

feb_2021

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,c7bb5f1ad7f06bcd4128b1f1440a20e3ddd69d32,2021-02-10 16:45:00 UTC,heat,hold,602,655,655,CA,Los Angeles,69,False,False,False,Gas
1,718ab69fa1856550d05b5fc62be9325c812e7360,2021-02-02 16:45:00 UTC,heat,hold,675,671,671,CA,Danville,0,False,False,False,Gas
3,4563cb42796c16b0174161f9c1f6109a3017478a,2021-02-03 10:30:00 UTC,heat,hold,674,661,660,CA,Palm Springs,65,False,False,False,Gas
4,8dba3d9427cbeb3b4d1c63fb09c71b6dae280179,2021-02-14 18:25:00 UTC,heat,hold,711,709,709,CA,San Diego,5,False,False,False,Gas
6,69743e7e9076114f6a99bea87449fd5c26429415,2021-02-06 18:40:00 UTC,heat,hold,691,698,698,CA,Bakersfield,10,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1858147,bd88475e73281a80d01984c689fe3903d5316955,2021-02-18 19:45:00 UTC,auto,hold,731,760,720,CA,Culver City,99,False,False,False,Gas
1858148,750cb653a32d152fba154d11430118b76cdb35d2,2021-02-18 16:05:00 UTC,auto,hold,677,760,680,CA,Woodland Hills,50,False,False,False,Gas
1858149,6a6f03dff9a7d63aee9ea3666f807d1030ba51ad,2021-02-11 17:15:00 UTC,auto,hold,669,760,650,CA,Lodi,20,False,False,False,Gas
1858150,bd88475e73281a80d01984c689fe3903d5316955,2021-02-10 18:20:00 UTC,auto,hold,719,760,720,CA,Culver City,99,False,False,False,Gas


In [61]:
# Label every row with its year and month; both become grouping keys below.
# NOTE(review): the January cells write title-case "Jan" while this label is
# lowercase "feb" -- confirm which form downstream consumers expect.
feb_2021 = feb_2021.assign(Year="2021", Month="feb")


In [62]:
# Suffix the three temperature columns with "_ave" so the aggregated output
# is labelled as averages.
# NOTE: after this rename the outlier-filter cell above cannot be re-run
# without reloading the CSV (it looks up the un-suffixed column names).
ave_labels = {
    "Temperature_ctrl": "Temperature_ctrl_ave",
    "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
    "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
}
feb_2021 = feb_2021.rename(columns=ave_labels)

In [63]:
# Average the numeric columns for each
# (Identifier, Month, Year, HvacMode, CalendarEvent, City) combination.
# numeric_only=True is explicit because the frame still holds text columns
# (date_time, ProvinceState, Auxilliary_Heat_Fuel_Type); pandas >= 2.0 raises
# TypeError on them instead of silently dropping them.
group_keys = ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
feb_2021_ave = feb_2021.groupby(group_keys).mean(numeric_only=True)

In [64]:
# Save the February 2021 averages. index=True writes the groupby keys
# (the row index) as the leading columns of the CSV.
feb_2021_ave_csv = "data/day/CA/feb/feb_2021_ave.csv"
feb_2021_ave.to_csv(feb_2021_ave_csv, header=True, index=True)

---

### Combine month CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [65]:
# List the per-year February CSV exports. os.listdir returns names in
# arbitrary order, so sort them to keep the row order of the combined file
# reproducible between runs and machines.
files = sorted(f for f in os.listdir("data/day/CA/feb/") if f.endswith(".csv"))

In [66]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every yearly February file, then stack them with ONE concat call;
# concatenating inside the loop re-copies the accumulated rows on each pass.
# Each file keeps its own row index, as before.
monthly_frames = [pd.read_csv("data/day/CA/feb/" + file) for file in files]

# pd.concat raises ValueError on an empty list, so fall back to the empty
# frame the loop version produced when no files were found.
CA_feb = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

CA_feb

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,feb,2017,heat,auto,San Marcos,689.040000,700.120000,690.180000,15.0,False,False,False
1,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,feb,2017,heat,hold,San Marcos,684.183333,700.000000,683.000000,15.0,False,False,False
2,0050d013d4928f706c90379b28f9ab257178f1a8,feb,2017,heat,auto,Dublin,679.857143,683.214286,679.357143,5.0,False,False,False
3,0050d013d4928f706c90379b28f9ab257178f1a8,feb,2017,heat,hold,Dublin,670.042254,668.042254,663.450704,5.0,False,False,False
4,00c938e5bb1d705eaabfe9e555bc0e4f5f3c9c57,feb,2017,heat,auto,Agoura Hills,622.611111,738.333333,738.333333,0.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3969,ffc5939f7005156793fd751db9de2b35320a97e6,feb,2021,auto,hold,Los Angeles,675.577381,790.000000,670.000000,59.0,False,False,False
3970,ffd99f8a17ac8f96739ceba7c0feca750b886605,feb,2021,auto,hold,San Diego,681.575758,790.000000,673.636364,40.0,False,False,False
3971,ffd99f8a17ac8f96739ceba7c0feca750b886605,feb,2021,heat,hold,San Diego,680.241935,682.233871,681.362903,40.0,False,False,False
3972,ffdb74abccd27a50a47d411800653c4b0e702b49,feb,2021,heat,hold,San Diego,696.980392,695.176471,695.176471,9.0,True,False,False


In [67]:
# Write the combined February file. index=False leaves the row index out of
# the CSV. The folder is created first because to_csv does not create
# missing directories.
os.makedirs("Scraper_Output/State_Month_Day/CA", exist_ok=True)
CA_feb.to_csv("Scraper_Output/State_Month_Day/CA/CA_feb.csv", header=True, index=False)

---

## March

---

## April

---

## May

---

## June

### 2017 June Day

In [68]:
# Load the raw 2017 June "day" extract for CA into a DataFrame.
jun_2017 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2017-jun-day-CA.csv",
)

# jun_2017

In [69]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jun_2017.drop(
    index=jun_2017.index[
        (jun_2017["TemperatureExpectedCool"] >= 850)
        | (jun_2017["TemperatureExpectedHeat"] >= 850)
        | (jun_2017["TemperatureExpectedCool"] <= 600)
        | (jun_2017["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jun_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,6cbb4f2c2f1ee09da98801ca1e190e5fe399ae49,2017-06-07 13:35:00 UTC,cool,hold,745,775,775,CA,Brea,40,True,False,False,Gas
2,4ef217faf4a785d27650bb00617f018859fd6233,2017-06-06 14:00:00 UTC,heat,hold,721,666,666,CA,San Diego,0,False,False,False,Gas
3,1c46866cbf0d919bc410f9c2161bfbcd8c338521,2017-06-22 19:40:00 UTC,auto,auto,772,779,644,CA,Vista,5,False,False,False,Gas
5,8bb0ad17fe417345b8e01b84154af36e86a481cf,2017-06-27 16:05:00 UTC,cool,auto,740,765,715,CA,Los Angeles,76,False,False,False,Gas
6,293eaf4c38fa6902779da71625e8a3ac0bde94cc,2017-06-23 14:50:00 UTC,auto,hold,699,715,665,CA,Fontana,18,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1322926,b066849e2ee360dfa31219167c2c5a6ee3f8cad3,2017-06-03 15:20:00 UTC,cool,hold,741,760,760,CA,Agoura Hills,0,False,False,False,Gas
1322927,52a4d072aabe0c30b2c7fe1d131de3ff9b15de8c,2017-06-17 13:40:00 UTC,cool,hold,744,760,760,CA,Saugus,10,False,False,False,Gas
1322928,29bd6ab21be13bd3db1fcd6a2446b37824f2f315,2017-06-13 14:30:00 UTC,heat,hold,756,760,760,CA,San Diego,10,False,False,False,Gas
1322929,e3c3e9929bfb40317b5121721366f0ad54cb1079,2017-06-17 19:05:00 UTC,cool,auto,785,780,760,CA,Sacramento,30,False,False,True,Electric


In [70]:
# Stamp every row with its year and month labels (both stored as strings).
jun_2017["Year"], jun_2017["Month"] = "2017", "jun"

In [71]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jun_2017 = jun_2017.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [72]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jun_2017_ave = jun_2017.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jun_2017_ave

In [73]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jun", exist_ok=True)
jun_2017_ave.to_csv("data/day/CA/jun/jun_2017_ave.csv", header=True, index=True)

### 2018 June Day

In [74]:
# Load the raw 2018 June "day" extract for CA into a DataFrame.
jun_2018 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2018-jun-day-CA.csv",
)

# jun_2018

In [75]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jun_2018.drop(
    index=jun_2018.index[
        (jun_2018["TemperatureExpectedCool"] >= 850)
        | (jun_2018["TemperatureExpectedHeat"] >= 850)
        | (jun_2018["TemperatureExpectedCool"] <= 600)
        | (jun_2018["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jun_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,a5d98ddf3136bff745ceebb39321d02531759839,2018-06-16 17:10:00 UTC,cool,hold,706,735,735,CA,Thousand Oaks,0,False,False,True,Electric
1,67124ec4ea776e29023a7dd0a4d109780b115ab7,2018-06-04 17:05:00 UTC,auto,hold,746,741,658,CA,Roseville,15,False,False,False,Gas
4,1208970bba37ef271ec94f96f46f1b0382932dc7,2018-06-12 18:45:00 UTC,cool,hold,741,735,735,CA,Palm Springs,10,False,False,True,Electric
5,9fda4eb2a0c987623325430f8f966a62c5ff4eda,2018-06-28 15:35:00 UTC,auto,hold,744,782,706,CA,Lake Balboa,0,False,False,False,Gas
7,89e3c6f7e628421bc121d744e80cb1b56d945e1e,2018-06-04 09:45:00 UTC,cool,hold,739,745,745,CA,Walnut Creek,5,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2438044,fcef37fd6a41a2b037247607e22c71c60a59d51d,2018-06-21 17:45:00 UTC,cool,hold,755,760,760,CA,San Diego,99,True,False,True,Electric
2438045,f37004bb80247e8b11a73cc7aceb2f4139bcb0bd,2018-06-11 07:15:00 UTC,cool,hold,753,760,760,CA,Heber,70,True,False,True,Electric
2438046,7ddeac4dc0b344a5da8d97af0cd141e1a07c9f08,2018-06-28 19:55:00 UTC,cool,auto,760,760,760,CA,Mission Viejo,50,False,False,False,Gas
2438047,e75e089250f4053dc430c0f648205afd30b4825b,2018-06-05 14:55:00 UTC,cool,hold,742,760,760,CA,Antelope,0,False,False,False,Gas


In [76]:
# Stamp every row with its year and month labels (both stored as strings).
jun_2018["Year"], jun_2018["Month"] = "2018", "jun"

In [77]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jun_2018 = jun_2018.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [78]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jun_2018_ave = jun_2018.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jun_2018_ave

In [79]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jun", exist_ok=True)
jun_2018_ave.to_csv("data/day/CA/jun/jun_2018_ave.csv", header=True, index=True)

### 2019 June Day

In [80]:
# Load the raw 2019 June "day" extract for CA into a DataFrame.
jun_2019 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2019-jun-day-CA.csv",
)

# jun_2019

In [81]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jun_2019.drop(
    index=jun_2019.index[
        (jun_2019["TemperatureExpectedCool"] >= 850)
        | (jun_2019["TemperatureExpectedHeat"] >= 850)
        | (jun_2019["TemperatureExpectedCool"] <= 600)
        | (jun_2019["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jun_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,a4ad92ebba0ed6bbf2f1190fb01c90344a08e567,2019-06-15 17:25:00 UTC,heat,hold,700,698,698,CA,Los Angeles,0,False,False,False,Gas
1,2e015752baeb7e61661d698ea8133c7bd107ebce,2019-06-07 19:25:00 UTC,cool,auto,747,760,693,CA,Stevenson Ranch,20,False,False,False,Gas
3,88b77b9476ebae4afbd2b9caf2a76c4ad0f66302,2019-06-25 09:15:00 UTC,cool,hold,703,730,729,CA,Los Angeles,70,False,False,False,Gas
4,14270cce4ce35da09e7310364f6daabd1e086b01,2019-06-03 15:05:00 UTC,cool,hold,730,755,755,CA,Chico,0,True,False,False,Gas
5,88b77b9476ebae4afbd2b9caf2a76c4ad0f66302,2019-06-09 11:35:00 UTC,cool,hold,715,730,729,CA,Los Angeles,70,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3263953,f143deaf6b89afb52482d7752e61b71685dd13be,2019-06-29 18:15:00 UTC,cool,hold,762,760,760,CA,Lemoore,20,False,False,False,Gas
3263954,7a22f937c654747269defa1501c2096963f829e5,2019-06-18 15:35:00 UTC,cool,hold,739,760,760,CA,Rocklin,0,False,False,False,Gas
3263955,1291e67e78515d80816b957e0de5e0d46bb74c3e,2019-06-24 13:50:00 UTC,cool,hold,700,760,760,CA,Anaheim,35,False,False,False,Gas
3263956,cff6c988de403bff7bc66d893c4d038b003ffa92,2019-06-25 16:30:00 UTC,cool,hold,730,760,760,CA,Roseville,10,False,False,False,Gas


In [82]:
# Stamp every row with its year and month labels (both stored as strings).
jun_2019["Year"], jun_2019["Month"] = "2019", "jun"

In [83]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jun_2019 = jun_2019.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [84]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jun_2019_ave = jun_2019.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jun_2019_ave

In [85]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jun", exist_ok=True)
jun_2019_ave.to_csv("data/day/CA/jun/jun_2019_ave.csv", header=True, index=True)

### 2020 June Day

In [86]:
# Load the raw 2020 June "day" extract for CA into a DataFrame.
jun_2020 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2020-jun-day-CA.csv",
)

# jun_2020

In [87]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jun_2020.drop(
    index=jun_2020.index[
        (jun_2020["TemperatureExpectedCool"] >= 850)
        | (jun_2020["TemperatureExpectedHeat"] >= 850)
        | (jun_2020["TemperatureExpectedCool"] <= 600)
        | (jun_2020["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jun_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,e8250afa64383b39aa42c3d781e822eda42a4845,2020-06-01 19:20:00 UTC,cool,auto,758,753,753,CA,Long Beach,20,False,False,True,Electric
1,f6bd1f5e48aa66006d158b79b46984bc8dec897d,2020-06-04 15:05:00 UTC,auto,hold,756,785,675,CA,Rancho San Diego,35,True,False,False,Gas
2,637117fd1ebaf2cb3f63be9f93d3b37ac0e23199,2020-06-27 18:15:00 UTC,cool,hold,786,790,789,CA,Anderson,80,False,False,False,Gas
3,8bcc8b08511efc2c5f2b5b439ff115815b6bed78,2020-06-07 13:45:00 UTC,cool,hold,708,741,741,CA,Rancho Cucamonga,0,False,False,False,Gas
4,35248f8b458b09809b494ae65ba833d26fb2e9c8,2020-06-20 19:45:00 UTC,auto,hold,705,705,655,CA,San Jose,65,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3224431,13929914f8a4ba75963f8c3889a5ab59ac63e13e,2020-06-20 15:15:00 UTC,cool,auto,764,760,760,CA,Menlo Park,80,False,False,False,Gas
3224432,e58db4275b4233ac211ed7abf76a8a3b5d78766b,2020-06-10 18:55:00 UTC,cool,auto,778,780,760,CA,Yuba City,25,False,False,False,Gas
3224433,4ebc9479e2c040cb47b76a422e85b594ca7500b5,2020-06-20 18:05:00 UTC,cool,auto,754,760,760,CA,Hayward,20,False,False,False,Gas
3224434,2026d26d7b77becbcb042838059b1f11d2e77557,2020-06-18 18:10:00 UTC,cool,hold,757,760,760,CA,Sacramento,45,False,False,False,Gas


In [88]:
# Stamp every row with its year and month labels (both stored as strings).
jun_2020["Year"], jun_2020["Month"] = "2020", "jun"

In [89]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jun_2020 = jun_2020.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [90]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jun_2020_ave = jun_2020.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jun_2020_ave

In [91]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jun", exist_ok=True)
jun_2020_ave.to_csv("data/day/CA/jun/jun_2020_ave.csv", header=True, index=True)

### 2021 June Day

In [92]:
# Load the raw 2021 June "day" extract for CA into a DataFrame.
jun_2021 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2021-jun-day-CA.csv",
)

# jun_2021

In [93]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jun_2021.drop(
    index=jun_2021.index[
        (jun_2021["TemperatureExpectedCool"] >= 850)
        | (jun_2021["TemperatureExpectedHeat"] >= 850)
        | (jun_2021["TemperatureExpectedCool"] <= 600)
        | (jun_2021["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jun_2021

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,9b3495c8be0fca2bab65c60fc41f054b3008eb4a,2021-06-17 13:50:00 UTC,auto,hold,705,705,655,CA,Dublin,10,True,False,False,Gas
1,2b095a65ec9140c90d2af9e64d263d90254445dd,2021-06-28 18:50:00 UTC,cool,hold,782,775,775,CA,Fontana,15,False,False,False,Gas
2,b8e7cec636f757c438fd25dba2519fa34ee08507,2021-06-06 19:35:00 UTC,cool,hold,710,711,711,CA,West Covina,60,True,False,False,Gas
3,0e53844e005bc3dedaad437853465a621cd48913,2021-06-13 14:40:00 UTC,cool,hold,718,716,716,CA,Livermore,15,False,False,False,Gas
4,b1def17aff6f0ffd474c8d9cd7dd2e37a056552c,2021-06-10 16:20:00 UTC,auto,hold,706,766,686,CA,San Marcos,10,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855608,b8158ac89ff4e7b75de01d45959288468be3abfc,2021-06-01 18:50:00 UTC,cool,hold,762,760,760,CA,Folsom,10,False,False,False,Gas
1855609,08640f9673c772015306ef472ec7ae9229644af2,2021-06-26 18:30:00 UTC,cool,hold,759,760,760,CA,Colton,0,False,False,False,Gas
1855610,f47392e1ec4f1856847edfb736a390b300c6c8c8,2021-06-11 17:30:00 UTC,cool,hold,664,760,760,CA,Hidden Valley Lake,18,False,False,False,Gas
1855611,ac3027ac5140047cc87d0acc12f028922342043d,2021-06-12 18:45:00 UTC,cool,hold,761,760,760,CA,Lancaster,0,True,False,False,Gas


In [94]:
# Stamp every row with its year and month labels (both stored as strings).
jun_2021["Year"], jun_2021["Month"] = "2021", "jun"

In [95]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jun_2021 = jun_2021.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [96]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jun_2021_ave = jun_2021.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jun_2021_ave

In [97]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jun", exist_ok=True)
jun_2021_ave.to_csv("data/day/CA/jun/jun_2021_ave.csv", header=True, index=True)

---

### Combine month CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [98]:
# Collect the CSV file names in the June folder. os.listdir returns
# entries in arbitrary order, so the names are sorted to keep the combined
# output reproducible between runs and machines.
files = sorted(
    name for name in os.listdir("data/day/CA/jun/") if name.endswith(".csv")
)

# files

In [99]:
# Adapted from https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read each CSV listed in `files` and combine them with a single concat call;
# concatenating inside the loop re-copies the accumulated rows on every pass.
# Files are visited in sorted name order so the combined row order does not
# depend on the order in which the file names were listed.
CA_jun_parts = []
for file in sorted(files):
    df = pd.read_csv("data/day/CA/jun/" + file)
    CA_jun_parts.append(df)

# pd.concat rejects an empty list, so fall back to an empty frame when the
# folder holds no CSV files.
CA_jun = pd.concat(CA_jun_parts) if CA_jun_parts else pd.DataFrame()

CA_jun

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,0029564391031ecb03549e6389960db4eed71126,jun,2017,auto,hold,San Gabriel,738.944444,742.777778,677.592593,90.0,False,False,False
1,0050d013d4928f706c90379b28f9ab257178f1a8,jun,2017,cool,auto,Dublin,728.240093,818.386946,620.079254,5.0,False,False,False
2,0050d013d4928f706c90379b28f9ab257178f1a8,jun,2017,cool,hold,Dublin,765.800000,772.040000,765.120000,5.0,False,False,False
3,007f930abce4596b2214cd79a09714b3d482e17d,jun,2017,cool,auto,Escondido,725.000000,719.750000,630.000000,0.0,False,False,False
4,00c938e5bb1d705eaabfe9e555bc0e4f5f3c9c57,jun,2017,cool,auto,Agoura Hills,751.033333,756.000000,746.000000,0.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3864,ff573f53253c7fcae2af9ffaae465435f5e773ca,jun,2021,cool,hold,Los Angeles,744.945736,745.000000,745.000000,60.0,False,False,False
3865,ff7fa20dc4a160fba797e8a11b8b47ccc8494686,jun,2021,cool,hold,San Diego,770.000000,705.000000,720.000000,55.0,False,False,False
3866,ffab10c348bdc935f2b26c06b99e75d237f5cead,jun,2021,auto,hold,Visalia,790.095440,805.426892,651.884344,97.0,False,False,False
3867,ffd99f8a17ac8f96739ceba7c0feca750b886605,jun,2021,auto,hold,San Diego,742.539216,743.565359,668.496732,40.0,False,False,False


In [100]:
# Write the combined June file. index=False leaves the row index out of
# the CSV. The folder is created first because to_csv does not create
# missing directories.
os.makedirs("Scraper_Output/State_Month_Day/CA", exist_ok=True)
CA_jun.to_csv("Scraper_Output/State_Month_Day/CA/CA_jun.csv", header=True, index=False)

---

## July

### 2017 July Day

In [101]:
# Load the raw 2017 July "day" extract for CA into a DataFrame.
jul_2017 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2017-jul-day-CA.csv",
)

# jul_2017

In [102]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jul_2017.drop(
    index=jul_2017.index[
        (jul_2017["TemperatureExpectedCool"] >= 850)
        | (jul_2017["TemperatureExpectedHeat"] >= 850)
        | (jul_2017["TemperatureExpectedCool"] <= 600)
        | (jul_2017["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jul_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
1,a46b8fedcf16efcb76ff3e0ffa26d8b1db13cdbd,2017-07-28 18:55:00 UTC,cool,hold,775,784,784,CA,Riverside,10,False,False,False,Gas
2,eced049c3ce124651ad4ceb706b09b037c9125f7,2017-07-15 18:10:00 UTC,cool,hold,727,745,745,CA,San Jose,60,True,False,True,Electric
3,52eb435c0d0d7a65c3d13b57188baa80e2858480,2017-07-22 17:15:00 UTC,auto,auto,734,732,642,CA,San JOSE,5,False,False,False,Gas
5,bc846390af3f7211ceb3b5c654931e10e327147a,2017-07-31 15:30:00 UTC,cool,hold,684,728,728,CA,woodland hills,0,False,False,False,Gas
6,adab8f6653f522e2aff41f617f478f80551488d5,2017-07-25 15:25:00 UTC,cool,hold,746,755,755,CA,San Diego,65,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1786336,1291e67e78515d80816b957e0de5e0d46bb74c3e,2017-07-30 15:25:00 UTC,cool,hold,720,760,760,CA,Anaheim,35,False,False,False,Gas
1786337,b91d14a292a599d46358b30b745a96322f2ccde4,2017-07-22 15:40:00 UTC,cool,hold,741,760,760,CA,El Cajon,25,False,False,False,Gas
1786338,ba34f8eae46ce39714bafe16e5f719c48c709410,2017-07-15 10:10:00 UTC,cool,auto,780,780,760,CA,Roseville,10,False,False,False,Gas
1786339,a7f299e9f68704feb6d830b4c6070a62dac6ea0d,2017-07-15 18:00:00 UTC,cool,hold,756,760,760,CA,Coto de Caza,0,False,False,False,Gas


In [103]:
# Stamp every row with its year and month labels (both stored as strings).
jul_2017["Year"], jul_2017["Month"] = "2017", "jul"

In [104]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jul_2017 = jul_2017.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [105]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jul_2017_ave = jul_2017.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jul_2017_ave

In [106]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jul", exist_ok=True)
jul_2017_ave.to_csv("data/day/CA/jul/jul_2017_ave.csv", header=True, index=True)

### 2018 July Day

In [107]:
# Load the raw 2018 July "day" extract for CA into a DataFrame.
jul_2018 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2018-jul-day-CA.csv",
)

# jul_2018

In [108]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jul_2018.drop(
    index=jul_2018.index[
        (jul_2018["TemperatureExpectedCool"] >= 850)
        | (jul_2018["TemperatureExpectedHeat"] >= 850)
        | (jul_2018["TemperatureExpectedCool"] <= 600)
        | (jul_2018["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jul_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,ea4b3943a6c8d5421f9830b5eefff2a34dad4884,2018-07-02 13:35:00 UTC,cool,hold,723,759,759,CA,Sloughhouse,20,True,False,True,Electric
1,1dfa046705af333fa4c351bb8d010ef4d633d280,2018-07-23 17:30:00 UTC,cool,auto,786,820,820,CA,Van Nuys,35,False,False,False,Gas
3,55d9dbd4644b6d863e6b42ad279db4a7886a9e98,2018-07-07 15:25:00 UTC,cool,hold,737,745,745,CA,El Cajon,0,False,False,False,Gas
4,fcc29500abfb066ccb385f45ce911c169cb61924,2018-07-05 18:45:00 UTC,auto,hold,753,750,695,CA,Chino Hills,30,False,False,False,Gas
6,d64ad1a6d81f24e7cc037e14a5fe308efd4f5962,2018-07-15 17:00:00 UTC,cool,auto,801,820,820,CA,Ladera Ranch,0,False,False,True,Electric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3386796,9c2feee7be1bea5a783b6c58f9e91696a33a99bd,2018-07-27 15:35:00 UTC,cool,hold,756,760,760,CA,San Marcos,35,False,False,True,Electric
3386797,8b8f3387fa3e1bcb73be0c86aeb860c24b5c5fae,2018-07-07 16:10:00 UTC,cool,hold,756,760,760,CA,San Diego,0,False,False,False,Gas
3386798,faede8898bb6c8e5b167623674db5cebf437c2ee,2018-07-31 15:50:00 UTC,cool,hold,691,760,760,CA,Nevada City,25,True,True,True,Electric
3386799,2272c9d9cb7dd7999ec7d3697da47d7c8c56c7c7,2018-07-12 17:15:00 UTC,cool,hold,763,760,760,CA,Elk grove,0,False,False,False,Gas


In [109]:
# Stamp every row with its year and month labels (both stored as strings).
jul_2018["Year"], jul_2018["Month"] = "2018", "jul"

In [110]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jul_2018 = jul_2018.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [111]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jul_2018_ave = jul_2018.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jul_2018_ave

In [112]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jul", exist_ok=True)
jul_2018_ave.to_csv("data/day/CA/jul/jul_2018_ave.csv", header=True, index=True)

### 2019 July Day

In [113]:
# Load the raw 2019 July "day" extract for CA into a DataFrame.
jul_2019 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2019-jul-day-CA.csv",
)

# jul_2019

In [114]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jul_2019.drop(
    index=jul_2019.index[
        (jul_2019["TemperatureExpectedCool"] >= 850)
        | (jul_2019["TemperatureExpectedHeat"] >= 850)
        | (jul_2019["TemperatureExpectedCool"] <= 600)
        | (jul_2019["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jul_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,b1334e0074a76b39aa6a0456bac7dfb61c2d1485,2019-07-20 08:40:00 UTC,auto,auto,745,765,690,CA,Ridgecrest,35,False,False,False,Gas
1,671e52105130291641ae005b8c91d0c3837a724c,2019-07-12 17:25:00 UTC,heat,hold,752,741,741,CA,San Diego,10,False,False,False,Gas
2,35248f8b458b09809b494ae65ba833d26fb2e9c8,2019-07-30 19:30:00 UTC,auto,auto,701,700,650,CA,San Jose,65,False,False,False,Gas
3,8df1e8bc55591180719661f0ee0aa8967e48a399,2019-07-05 15:15:00 UTC,auto,hold,721,740,690,CA,Pleasanton,20,False,False,False,Gas
4,a0ae60648ffc1a87fb59b5e4daf873d52155a4a9,2019-07-11 19:05:00 UTC,auto,hold,781,770,720,CA,Sacramento,20,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3875152,fcf0da5e5bdcd800da86c10cd26a269d3c7889a0,2019-07-08 13:40:00 UTC,auto,auto,756,780,700,CA,Murrieta,10,False,False,False,Gas
3875153,9dc69e3a9088d0475fcc95d6ced4ef50b1c6e8c6,2019-07-11 14:45:00 UTC,auto,hold,755,750,610,CA,Cathedral City,50,False,False,False,Gas
3875154,8914ddfaab5576f43b244357fd63a0853e9f41c5,2019-07-24 18:10:00 UTC,cool,hold,795,790,790,CA,Modesto,29,True,False,False,Gas
3875155,92b007ed383b01e2dacdd342acafa3265c93973f,2019-07-12 18:05:00 UTC,cool,hold,725,720,720,CA,Long Beach,60,True,False,False,Gas


In [115]:
# Stamp every row with its year and month labels (both stored as strings).
jul_2019["Year"], jul_2019["Month"] = "2019", "jul"

In [116]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jul_2019 = jul_2019.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [117]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jul_2019_ave = jul_2019.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jul_2019_ave

In [118]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jul", exist_ok=True)
jul_2019_ave.to_csv("data/day/CA/jul/jul_2019_ave.csv", header=True, index=True)

### 2020 July Day

In [119]:
# Load the raw 2020 July "day" extract for CA into a DataFrame.
jul_2020 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2020-jul-day-CA.csv",
)

# jul_2020

In [120]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jul_2020.drop(
    index=jul_2020.index[
        (jul_2020["TemperatureExpectedCool"] >= 850)
        | (jul_2020["TemperatureExpectedHeat"] >= 850)
        | (jul_2020["TemperatureExpectedCool"] <= 600)
        | (jul_2020["TemperatureExpectedHeat"] <= 600)
    ],
    inplace=True,
)

jul_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
1,6ccd6f47a0500057183c2c82d9ecf4c98e2f2794,2020-07-30 17:05:00 UTC,cool,hold,734,739,739,CA,Roseville,0,True,False,False,Gas
3,f47392e1ec4f1856847edfb736a390b300c6c8c8,2020-07-25 17:35:00 UTC,cool,hold,708,800,800,CA,Hidden Valley Lake,18,False,False,False,Gas
4,dbea08bd2c5ad1ed52e7e8045f9a9b3b485a60dd,2020-07-27 15:40:00 UTC,cool,hold,752,790,756,CA,Chino,15,False,False,False,Gas
5,2f6f52a4745268564b849a66968a72c778521d00,2020-07-03 08:45:00 UTC,auto,auto,759,750,693,CA,Gilroy,5,False,False,False,Gas
6,905d31b2c38d52d5b2eb08779b9a6662c798cc6a,2020-07-04 16:55:00 UTC,cool,hold,692,689,689,CA,Mar Vista,70,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3553544,68842419dc8f809136fe38505991692e253103f0,2020-07-14 17:15:00 UTC,cool,hold,758,760,760,CA,Los Angeles,0,False,False,False,Gas
3553545,4618577be57ba60c51bfb10fee4730808bba9f9a,2020-07-28 14:05:00 UTC,cool,auto,700,760,760,CA,Roseville,0,False,False,False,Gas
3553546,67f15ab1141305e471c6aae822fdc2eb0234a954,2020-07-31 08:15:00 UTC,cool,hold,759,760,760,CA,Upland,60,True,False,False,Gas
3553547,a86f82aecbc7df318c6b34d69d9354c631bf5e4d,2020-07-21 13:55:00 UTC,cool,hold,749,760,760,CA,Elk Grove,6,False,False,False,Gas


In [121]:
# Stamp every row with its year and month labels (both stored as strings).
jul_2020["Year"], jul_2020["Month"] = "2020", "jul"

In [122]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jul_2020 = jul_2020.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [123]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jul_2020_ave = jul_2020.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jul_2020_ave

In [124]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jul", exist_ok=True)
jul_2020_ave.to_csv("data/day/CA/jul/jul_2020_ave.csv", header=True, index=True)

### 2021 July Day

In [125]:
# Load the raw 2021 July "day" extract for CA into a DataFrame.
jul_2021 = pd.read_csv(
    filepath_or_buffer="../data_large/CA-day/2021-jul-day-CA.csv",
)

# jul_2021

In [126]:
# Remove predetermined outliers before aggregating: a row is dropped when
# either TemperatureExpectedCool or TemperatureExpectedHeat is >= 850 or
# <= 600. All four conditions are combined into one in-place drop.
jul_2021.drop(
    index=jul_2021.index[
        (jul_2021["TemperatureExpectedCool"] >= 850)
        | (jul_2021["TemperatureExpectedHeat"] >= 850)
        | (jul_2021["TemperatureExpectedHeat"] <= 600)
        | (jul_2021["TemperatureExpectedCool"] <= 600)
    ],
    inplace=True,
)

jul_2021

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
1,778fcba811e1f33ad89e1c5c5b1bb8787bb554cf,2021-07-10 14:05:00 UTC,cool,hold,724,719,719,CA,Goleta,10,False,False,False,Gas
2,89a8ce25f0e734f52af37307a80a9a55da50d9eb,2021-07-08 19:15:00 UTC,cool,hold,753,735,735,CA,Corona,0,True,False,True,Electric
3,132bf99e9c28d3477d8a446c92307053ba149720,2021-07-09 17:05:00 UTC,auto,hold,750,747,697,CA,Monte Sereno,30,False,False,False,Gas
4,38a50e8a7b7a307f079b20372a4757981f830be7,2021-07-27 17:50:00 UTC,auto,hold,764,761,716,CA,San Diego,20,False,False,False,Gas
5,70738886c52a685e8ca6e1a157e28fc5a5eb4170,2021-07-18 18:10:00 UTC,cool,hold,740,761,761,CA,Los Angeles,80,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970420,193fd854ff4842a0b8c15011a385b45ce9e185a6,2021-07-12 08:00:00 UTC,cool,hold,788,790,760,CA,Indian Wells,0,False,False,False,Gas
1970421,2031094b0495a41228872a3a2546605ba1de60aa,2021-07-15 15:10:00 UTC,cool,hold,759,760,760,CA,Palm Springs,35,False,False,True,Electric
1970422,a86f82aecbc7df318c6b34d69d9354c631bf5e4d,2021-07-29 16:00:00 UTC,cool,hold,762,760,760,CA,Elk Grove,6,False,False,False,Gas
1970423,ea6d6dd1b9817a6f71ace911118ec53420958519,2021-07-15 19:40:00 UTC,cool,hold,753,760,760,CA,Bay Point,20,False,False,False,Gas


In [127]:
# Stamp every row with its year and month labels (both stored as strings).
jul_2021["Year"], jul_2021["Month"] = "2021", "jul"

In [128]:
# Give the three temperature columns an "_ave" suffix so the averaged
# output is labelled as an aggregate.
jul_2021 = jul_2021.rename(
    mapper={
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [129]:
# Average the numeric columns for each
# Identifier / Month / Year / HvacMode / CalendarEvent / City group.
# numeric_only=True leaves out the text columns (date_time, ProvinceState,
# Auxilliary_Heat_Fuel_Type) explicitly: pandas >= 2.0 raises a TypeError when
# asked to average them, where older releases dropped them silently.
jul_2021_ave = jul_2021.groupby(
    ["Identifier", "Month", "Year", "HvacMode", "CalendarEvent", "City"]
).mean(numeric_only=True)

# jul_2021_ave

In [130]:
# Export the aggregated frame to CSV. index=True writes the group keys
# (held in the index) as the leading columns of the file.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs("data/day/CA/jul", exist_ok=True)
jul_2021_ave.to_csv("data/day/CA/jul/jul_2021_ave.csv", header=True, index=True)

---

### Combine month CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [131]:
# Collect the CSV file names in the July folder. os.listdir returns
# entries in arbitrary order, so the names are sorted to keep the combined
# output reproducible between runs and machines.
files = sorted(
    name for name in os.listdir("data/day/CA/jul/") if name.endswith(".csv")
)

# files

In [132]:
# Adapted from https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read each CSV listed in `files` and combine them with a single concat call;
# concatenating inside the loop re-copies the accumulated rows on every pass.
# Files are visited in sorted name order so the combined row order does not
# depend on the order in which the file names were listed.
CA_jul_parts = []
for file in sorted(files):
    df = pd.read_csv("data/day/CA/jul/" + file)
    CA_jul_parts.append(df)

# pd.concat rejects an empty list, so fall back to an empty frame when the
# folder holds no CSV files.
CA_jul = pd.concat(CA_jul_parts) if CA_jul_parts else pd.DataFrame()

CA_jul

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,jul,2017,cool,hold,San Marcos,765.906250,767.125000,749.708333,15.0,False,False,False
1,0050d013d4928f706c90379b28f9ab257178f1a8,jul,2017,cool,auto,Dublin,743.383117,820.000000,620.000000,5.0,False,False,False
2,0050d013d4928f706c90379b28f9ab257178f1a8,jul,2017,heat,auto,Dublin,759.368421,820.000000,620.000000,5.0,False,False,False
3,0075eea4247684839bc4386d5fc5d3abd60f8290,jul,2017,auto,hold,Irvine,786.527778,809.777778,759.500000,45.0,False,False,False
4,0075eea4247684839bc4386d5fc5d3abd60f8290,jul,2017,cool,hold,Irvine,768.666667,770.000000,770.000000,45.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3824,ffab10c348bdc935f2b26c06b99e75d237f5cead,jul,2021,auto,hold,Visalia,800.151327,799.575664,650.000000,97.0,False,False,False
3825,ffd99f8a17ac8f96739ceba7c0feca750b886605,jul,2021,auto,hold,San Diego,747.447826,743.400000,670.000000,40.0,False,False,False
3826,ffe0ca2d4420d9d31d0fd8fa1b971cf43f393141,jul,2021,cool,hold,Chico,778.573770,769.508197,769.508197,0.0,False,False,False
3827,fff157010dde22d7786354c4f9c5a2ea80365f5c,jul,2021,auto,hold,Ladera Ranch,744.630000,745.700000,679.580000,0.0,False,False,False


In [133]:
# Save the combined July table; the row index is not written to the file.
CA_jul.to_csv(
    path_or_buf="Scraper_Output/State_Month_Day/CA/CA_jul.csv",
    index=False,
    header=True,
)

---

## August

### 2017 August Day

In [134]:
# Load the raw August 2017 thermostat readings for CA.
aug_2017 = pd.read_csv(filepath_or_buffer="../data_large/CA-day/2017-aug-day-CA.csv")

In [135]:
# Remove predetermined outliers before aggregating.
# A row is dropped (in place) when either setpoint column,
# TemperatureExpectedCool or TemperatureExpectedHeat, is >= 850 or <= 600.
aug_2017.drop(
    index=aug_2017[
        (aug_2017[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] >= 850).any(axis=1)
        | (aug_2017[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] <= 600).any(axis=1)
    ].index,
    inplace=True,
)

aug_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,261deb1f1eff4db073c2836b3ed0a792ff7f4453,2017-08-14 13:35:00 UTC,auto,auto,689,735,685,CA,Moraga,30,False,False,False,Gas
1,2ba40883d164b4fc566a9537bb9f5fdb8c2fd487,2017-08-18 19:15:00 UTC,cool,hold,785,779,779,CA,Arcadia,10,False,False,False,Gas
2,da223bac3b54b2d42a86a071d02ad0ec00eb7524,2017-08-27 16:50:00 UTC,cool,hold,749,745,745,CA,Manteca,5,False,False,False,Gas
3,c3c3850cce3be4ad6a4f186557d505f6f1289787,2017-08-28 16:30:00 UTC,auto,auto,779,815,765,CA,Temecula,5,False,False,False,Gas
4,21915ffac3d348d0c941089fdca2deda7e5dcec1,2017-08-29 14:15:00 UTC,cool,hold,831,848,800,CA,Sun Valley,65,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1826432,0722d8c9c144b0040b5b3e2382cf41a73fd0630e,2017-08-05 07:25:00 UTC,cool,hold,757,760,760,CA,Glendale,45,False,False,False,Gas
1826433,0714ef54555efcd3e2ac9bfb45c944c6773b4e6f,2017-08-26 16:00:00 UTC,cool,auto,758,760,760,CA,Costa Mesa,7,False,False,False,Gas
1826434,16a843e8da734da4977b6b8877509753bb034ed8,2017-08-22 14:10:00 UTC,cool,auto,748,760,760,CA,Cameron Park,15,False,False,False,Gas
1826435,b003bf5de0b968be6d40700ad35adf9c3a7af8a9,2017-08-15 19:55:00 UTC,cool,hold,763,760,760,CA,West Hills,0,False,False,False,Gas


In [136]:
# Tag every row with its year and month (both stored as strings) so they can
# serve as grouping keys in the aggregation.
aug_2017 = aug_2017.assign(Year="2017", Month="aug")

In [137]:
# Suffix the three temperature columns with "_ave" to label them as aggregates.
aug_2017 = aug_2017.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [138]:
# Average the numeric columns per thermostat / month / year / mode / event / city.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# non-numeric columns (date_time, ProvinceState, ...) silently and raises instead.
aug_2017_ave = aug_2017.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [139]:
# Write the August 2017 averages to disk; index=True keeps the group keys
# (the index of the aggregated frame) as leading columns in the file.
aug_2017_ave.to_csv(
    path_or_buf="data/day/CA/aug/aug_2017_ave.csv",
    index=True,
    header=True,
)

### 2018 August Day

In [140]:
# Load the raw August 2018 thermostat readings for CA.
aug_2018 = pd.read_csv(filepath_or_buffer="../data_large/CA-day/2018-aug-day-CA.csv")

In [141]:
# Remove predetermined outliers before aggregating.
# A row is dropped (in place) when either setpoint column,
# TemperatureExpectedCool or TemperatureExpectedHeat, is >= 850 or <= 600.
aug_2018.drop(
    index=aug_2018[
        (aug_2018[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] >= 850).any(axis=1)
        | (aug_2018[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] <= 600).any(axis=1)
    ].index,
    inplace=True,
)

aug_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,b3027031f88149232d9a76428366979ce908a629,2018-08-01 19:25:00 UTC,cool,auto,806,800,800,CA,Covina,70,False,False,False,Gas
6,f6f226ea717f11f60a212f827ee5fe14fd8145a1,2018-08-09 16:40:00 UTC,auto,hold,732,735,635,CA,Rancho Cucamonga,35,True,False,False,Gas
7,f6f675e902155f06ea6592b7bd1d8c3bfd8b59db,2018-08-18 14:10:00 UTC,cool,hold,761,820,820,CA,La Quinta,40,False,False,True,Electric
8,4a568b720dc43993e1cafb72967998a3b553c42e,2018-08-03 08:55:00 UTC,auto,hold,799,800,676,CA,Cathedral City,10,False,False,False,Gas
10,0a6bf062fa565952ca550efddc0a74de9157d4b3,2018-08-09 19:05:00 UTC,cool,hold,711,715,715,CA,Newport Beach,30,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3346197,2af3322340c7d55b330a47332f8260980b98b5e7,2018-08-10 17:40:00 UTC,cool,hold,762,760,760,CA,Sacramento,0,False,False,False,Gas
3346198,6d80077029f0b5e99360765d53d33b160d0c9717,2018-08-31 14:55:00 UTC,cool,hold,756,760,760,CA,Beverly Hills,10,False,False,False,Gas
3346199,941dfff99a98b03c212f1cfef91289425ca7d59f,2018-08-15 18:30:00 UTC,cool,hold,763,760,760,CA,Tracy,10,False,False,False,Gas
3346200,b5d54e8251d790a84bd551fa3478ae49c4964da3,2018-08-09 10:30:00 UTC,cool,auto,757,760,760,CA,Carlsbad,15,False,False,False,Gas


In [142]:
# Tag every row with its year and month (both stored as strings) so they can
# serve as grouping keys in the aggregation.
aug_2018 = aug_2018.assign(Year="2018", Month="aug")

In [143]:
# Suffix the three temperature columns with "_ave" to label them as aggregates.
aug_2018 = aug_2018.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [144]:
# Average the numeric columns per thermostat / month / year / mode / event / city.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# non-numeric columns (date_time, ProvinceState, ...) silently and raises instead.
aug_2018_ave = aug_2018.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [145]:
# Write the August 2018 averages to disk; index=True keeps the group keys
# (the index of the aggregated frame) as leading columns in the file.
aug_2018_ave.to_csv(
    path_or_buf="data/day/CA/aug/aug_2018_ave.csv",
    index=True,
    header=True,
)

### 2019 August Day

In [146]:
# Load the raw August 2019 thermostat readings for CA.
aug_2019 = pd.read_csv(filepath_or_buffer="../data_large/CA-day/2019-aug-day-CA.csv")

In [147]:
# Remove predetermined outliers before aggregating.
# A row is dropped (in place) when either setpoint column,
# TemperatureExpectedCool or TemperatureExpectedHeat, is >= 850 or <= 600.
aug_2019.drop(
    index=aug_2019[
        (aug_2019[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] >= 850).any(axis=1)
        | (aug_2019[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] <= 600).any(axis=1)
    ].index,
    inplace=True,
)

aug_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,67cd0477df7457cfbb97b2eaa8ad7948ce55e7e7,2019-08-22 17:15:00 UTC,cool,auto,741,750,650,CA,Granada Hills,67,False,False,False,Gas
1,158f7bb03429276bb9934ec8f1ca307f200f7c74,2019-08-09 18:05:00 UTC,cool,auto,738,750,750,CA,Tehachapi,0,False,False,False,Gas
2,2cd8693e7f93b24d84ab357287ec2497cebb25f0,2019-08-02 18:25:00 UTC,cool,hold,725,788,788,CA,San Diego,19,False,False,False,Gas
3,5a6d728af54753e4d4f588b111cff297cbc5daab,2019-08-05 13:35:00 UTC,auto,hold,736,730,630,CA,Vacaville,0,False,False,False,Gas
4,a606cd4ea5cb7ca92bc18d3f0d8e68a24701d2b5,2019-08-11 15:20:00 UTC,auto,auto,779,775,725,CA,West Fresno,10,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4057276,417d4a328e25307f60ae380e567a8871bbaae63a,2019-08-24 13:30:00 UTC,cool,auto,749,780,780,CA,Santa Clarita,0,True,False,False,Gas
4057277,60766a3cd755dec0e4e06f68746e97397ac095bb,2019-08-06 16:35:00 UTC,auto,hold,736,740,680,CA,Castro Valley,0,False,False,False,Gas
4057278,46ebd366dacbfd1117396d081782da3089f86b2e,2019-08-09 14:25:00 UTC,cool,hold,747,770,770,CA,Placerville,9,False,False,False,Gas
4057279,a72b0a2f58f17da6396a355e3623c7188e7efe6e,2019-08-17 17:50:00 UTC,cool,hold,795,810,790,CA,Sacramento,20,False,False,False,Gas


In [148]:
# Tag every row with its year and month (both stored as strings) so they can
# serve as grouping keys in the aggregation.
aug_2019 = aug_2019.assign(Year="2019", Month="aug")

In [149]:
# Suffix the three temperature columns with "_ave" to label them as aggregates.
aug_2019 = aug_2019.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [150]:
# Average the numeric columns per thermostat / month / year / mode / event / city.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# non-numeric columns (date_time, ProvinceState, ...) silently and raises instead.
aug_2019_ave = aug_2019.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [151]:
# Write the August 2019 averages to disk; index=True keeps the group keys
# (the index of the aggregated frame) as leading columns in the file.
aug_2019_ave.to_csv(
    path_or_buf="data/day/CA/aug/aug_2019_ave.csv",
    index=True,
    header=True,
)

### 2020 August Day

In [152]:
# Load the raw August 2020 thermostat readings for CA.
aug_2020 = pd.read_csv(filepath_or_buffer="../data_large/CA-day/2020-aug-day-CA.csv")

In [153]:
# Remove predetermined outliers before aggregating.
# A row is dropped (in place) when either setpoint column,
# TemperatureExpectedCool or TemperatureExpectedHeat, is >= 850 or <= 600.
aug_2020.drop(
    index=aug_2020[
        (aug_2020[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] >= 850).any(axis=1)
        | (aug_2020[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] <= 600).any(axis=1)
    ].index,
    inplace=True,
)

aug_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,02eee8c04b08011d285cb3d43fb9f5ece816b5aa,2020-08-15 16:50:00 UTC,cool,auto,763,760,760,CA,Santa Clara,10,True,False,False,Gas
1,924d8b3864bf8c7b90a5f0bb82af869c0a7bb8fc,2020-08-06 13:50:00 UTC,cool,auto,701,710,710,CA,Los Angeles,0,False,False,False,Gas
2,a89451f427fcf4f00508181316fba4de193cee55,2020-08-09 19:10:00 UTC,cool,hold,795,786,786,CA,Rialto,0,False,False,False,Gas
3,9c0e4a3f2f80c476deff3b29443d727ae641c5c5,2020-08-24 12:25:00 UTC,cool,hold,788,800,790,CA,Rancho Cordova,50,False,False,False,Gas
4,e37ae787c4b5c17c8df8fdf8dc3f1ff33b9a7653,2020-08-26 09:10:00 UTC,auto,hold,758,760,670,CA,Murrieta,0,True,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4153194,2b252e214122f7c04c73e6a04426ab31d5d6f4da,2020-08-16 16:20:00 UTC,cool,auto,778,800,685,CA,Antioch,29,True,False,False,Gas
4153195,af884e89317371e399086d52afa00252b535296e,2020-08-25 18:05:00 UTC,auto,hold,747,750,670,CA,Aliso Viejo,20,False,False,False,Gas
4153196,f7de5452501725fcee433b4c15ac4fd8efd15f60,2020-08-01 17:25:00 UTC,auto,hold,774,770,655,CA,Elk Grove,50,False,False,False,Gas
4153197,3ae161afd0cae357589532af4b1c92154e5fc367,2020-08-31 15:45:00 UTC,cool,hold,729,760,760,CA,san jose,40,False,False,False,Gas


In [154]:
# Tag every row with its year and month (both stored as strings) so they can
# serve as grouping keys in the aggregation.
aug_2020 = aug_2020.assign(Year="2020", Month="aug")

In [155]:
# Suffix the three temperature columns with "_ave" to label them as aggregates.
aug_2020 = aug_2020.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [156]:
# Average the numeric columns per thermostat / month / year / mode / event / city.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# non-numeric columns (date_time, ProvinceState, ...) silently and raises instead.
aug_2020_ave = aug_2020.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [157]:
# Write the August 2020 averages to disk; index=True keeps the group keys
# (the index of the aggregated frame) as leading columns in the file.
aug_2020_ave.to_csv(
    path_or_buf="data/day/CA/aug/aug_2020_ave.csv",
    index=True,
    header=True,
)

---

### Combine month CSV Files 
1. Read in the yearly average files from the month folder
2. Export as combined CSV

In [158]:
# List the CSV files in the August folder. os.listdir returns names in arbitrary
# order, so they are sorted to make the combined row order reproducible.
files = sorted(f for f in os.listdir("data/day/CA/aug/") if f.endswith(".csv"))

In [159]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every yearly-average file for August, then concatenate once. Calling
# pd.concat inside the loop re-copies all accumulated rows on each pass and
# starts from an empty frame, which newer pandas versions warn about.
frames = [pd.read_csv("data/day/CA/aug/" + file) for file in files]

# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the loop-based version produced when the folder had no CSV files.
CA_aug = pd.concat(frames) if frames else pd.DataFrame()

CA_aug

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,aug,2017,cool,auto,San Marcos,752.976190,752.535714,680.000000,15.0,False,False,False
1,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,aug,2017,cool,hold,San Marcos,763.400000,770.000000,750.000000,15.0,False,False,False
2,0050d013d4928f706c90379b28f9ab257178f1a8,aug,2017,cool,auto,Dublin,768.066225,818.357616,626.370861,5.0,False,False,False
3,0050d013d4928f706c90379b28f9ab257178f1a8,aug,2017,cool,hold,Dublin,766.625000,770.000000,770.000000,5.0,False,False,False
4,007f930abce4596b2214cd79a09714b3d482e17d,aug,2017,cool,auto,Escondido,765.244681,770.329787,630.000000,0.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,ffe0ca2d4420d9d31d0fd8fa1b971cf43f393141,aug,2020,cool,auto,Chico,744.500000,780.000000,780.000000,0.0,False,False,False
8412,fff157010dde22d7786354c4f9c5a2ea80365f5c,aug,2020,cool,auto,Ladera Ranch,721.857143,711.035714,711.035714,0.0,False,False,False
8413,fff157010dde22d7786354c4f9c5a2ea80365f5c,aug,2020,cool,hold,Ladera Ranch,745.227528,753.174157,753.174157,0.0,False,False,False
8414,fff23bd2396b4238a5158f33d6afbae7abc08f57,aug,2020,cool,hold,Fremont,779.619048,800.000000,790.000000,50.0,False,False,False


In [160]:
# Save the combined August table; the row index is not written to the file.
CA_aug.to_csv(
    path_or_buf="Scraper_Output/State_Month_Day/CA/CA_aug.csv",
    index=False,
    header=True,
)

---

## September

---

## October

---

## November

---

## December

### 2017 December Day

In [161]:
# Load the raw December 2017 thermostat readings for CA.
dec_2017 = pd.read_csv(filepath_or_buffer="../data_large/CA-day/2017-dec-day-CA.csv")

In [162]:
# Remove predetermined outliers before aggregating.
# A row is dropped (in place) when either setpoint column,
# TemperatureExpectedCool or TemperatureExpectedHeat, is >= 850 or <= 600.
dec_2017.drop(
    index=dec_2017[
        (dec_2017[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] >= 850).any(axis=1)
        | (dec_2017[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] <= 600).any(axis=1)
    ].index,
    inplace=True,
)

dec_2017

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,1f935ea229fffb62d5f0f707d5f1c97181d29a15,2017-12-28 18:05:00 UTC,heat,hold,740,716,716,CA,Foothill Ranch,25,False,False,False,Gas
1,bd0351b5a243597d2a8f3349cd3718ba1f5dab70,2017-12-29 15:35:00 UTC,heat,hold,634,610,610,CA,Placerville,25,False,False,True,Electric
2,021f13c968b06a6ab5c6a8d90793b67b2775309d,2017-12-30 15:35:00 UTC,auto,hold,699,795,705,CA,Fresno,40,False,False,False,Gas
3,0177dc89f1763f47b10f0e9e9f04809bf7b5ecf1,2017-12-29 19:20:00 UTC,auto,auto,602,779,650,CA,Elk Grove,20,True,False,True,Electric
4,c79868d7689249189c7eda1d93c5cd2e133d33fb,2017-12-09 15:30:00 UTC,heat,hold,661,671,671,CA,Clayton,47,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2237034,272addbaf3c751c1a2287a81c6dd9290ecf76ced,2017-12-19 18:10:00 UTC,auto,hold,658,765,645,CA,Escondido,50,False,False,False,Gas
2237035,bd6f0d30e1d390b33cbcd07945e5dd79f8f4a039,2017-12-03 19:15:00 UTC,auto,auto,712,765,715,CA,San Jose,0,False,False,False,Gas
2237036,c8662c6332143ec1eedf1bd295faa1729a672406,2017-12-26 18:10:00 UTC,auto,hold,700,765,705,CA,San Ramon,47,False,False,False,Gas
2237037,427c7a5ec646c0c2e156fbdc0c3e116c56aa08fe,2017-12-24 19:55:00 UTC,auto,hold,741,765,715,CA,San Diego,0,False,False,True,Electric


In [163]:
# Tag every row with its year and month (both stored as strings) so they can
# serve as grouping keys in the aggregation.
dec_2017 = dec_2017.assign(Year="2017", Month="dec")

In [164]:
# Suffix the three temperature columns with "_ave" to label them as aggregates.
dec_2017 = dec_2017.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [165]:
# Average the numeric columns per thermostat / month / year / mode / event / city.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# non-numeric columns (date_time, ProvinceState, ...) silently and raises instead.
dec_2017_ave = dec_2017.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [166]:
# Write the December 2017 averages to disk; index=True keeps the group keys
# (the index of the aggregated frame) as leading columns in the file.
dec_2017_ave.to_csv(
    path_or_buf="data/day/CA/dec/dec_2017_ave.csv",
    index=True,
    header=True,
)

### 2018 December Day

In [167]:
# Load the raw December 2018 thermostat readings for CA.
dec_2018 = pd.read_csv(filepath_or_buffer="../data_large/CA-day/2018-dec-day-CA.csv")

In [168]:
# Remove predetermined outliers before aggregating.
# A row is dropped (in place) when either setpoint column,
# TemperatureExpectedCool or TemperatureExpectedHeat, is >= 850 or <= 600.
dec_2018.drop(
    index=dec_2018[
        (dec_2018[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] >= 850).any(axis=1)
        | (dec_2018[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] <= 600).any(axis=1)
    ].index,
    inplace=True,
)

dec_2018

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,547e99b6547579d8db8457843e21e6a18f3e0db8,2018-12-09 15:05:00 UTC,auto,hold,671,755,675,CA,Calabasas,30,False,False,False,Gas
1,2879e3f7a633cc8222b258d49a967cc2b8c65f08,2018-12-10 14:25:00 UTC,auto,hold,662,742,692,CA,Riverside,40,False,False,False,Gas
3,d496b5a06636e02afc85acca35148b02f79b8a19,2018-12-14 18:35:00 UTC,heat,auto,674,724,650,CA,Los Altos,50,True,False,False,Gas
4,84d0ff8f813e470d52e2f9edcc7d29b002d5c27b,2018-12-16 16:20:00 UTC,heat,hold,682,682,682,CA,Dublin,20,True,False,False,Gas
6,14bf1e8fba4efc1e28824c9de76c92a357e9a2b5,2018-12-06 17:30:00 UTC,auto,hold,690,735,685,CA,San Diego,40,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3440060,5a58643fd80355a54e6d6b9ea77de69e4c21d7f5,2018-12-11 18:50:00 UTC,cool,hold,723,760,760,CA,San Diego,10,False,False,False,Gas
3440061,5a3cf94345911475a2b20a6f0dca453152ac26a3,2018-12-10 15:55:00 UTC,auto,hold,683,760,680,CA,sunland,70,False,False,False,Gas
3440062,69743e7e9076114f6a99bea87449fd5c26429415,2018-12-11 17:55:00 UTC,auto,auto,701,760,710,CA,Bakersfield,10,False,False,False,Gas
3440063,173f28162391baf9d98bbda43848ac96a597095b,2018-12-24 17:20:00 UTC,auto,auto,668,760,670,CA,Sacramento,60,True,False,False,Gas


In [169]:
# Tag every row with its year and month (both stored as strings) so they can
# serve as grouping keys in the aggregation.
dec_2018 = dec_2018.assign(Year="2018", Month="dec")

In [170]:
# Suffix the three temperature columns with "_ave" to label them as aggregates.
dec_2018 = dec_2018.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [171]:
# Average the numeric columns per thermostat / month / year / mode / event / city.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# non-numeric columns (date_time, ProvinceState, ...) silently and raises instead.
dec_2018_ave = dec_2018.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [172]:
# Write the December 2018 averages to disk; index=True keeps the group keys
# (the index of the aggregated frame) as leading columns in the file.
dec_2018_ave.to_csv(
    path_or_buf="data/day/CA/dec/dec_2018_ave.csv",
    index=True,
    header=True,
)

### 2019 December Day

In [173]:
# Load the raw December 2019 thermostat readings for CA.
dec_2019 = pd.read_csv(filepath_or_buffer="../data_large/CA-day/2019-dec-day-CA.csv")

In [174]:
# Remove predetermined outliers before aggregating.
# A row is dropped (in place) when either setpoint column,
# TemperatureExpectedCool or TemperatureExpectedHeat, is >= 850 or <= 600.
dec_2019.drop(
    index=dec_2019[
        (dec_2019[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] >= 850).any(axis=1)
        | (dec_2019[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] <= 600).any(axis=1)
    ].index,
    inplace=True,
)

dec_2019

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,63a108637a000c7bb6950e2fc548b676cd290494,2019-12-11 15:55:00 UTC,heat,hold,706,710,710,CA,Newport Coast,20,False,False,False,Gas
1,2075f62872a11d492f3eb4fe99f7e32bbd505f3b,2019-12-02 16:35:00 UTC,auto,hold,668,820,670,CA,Hesperia,35,False,False,False,Gas
2,2fc296638bd04abd3337a3379f491cc70fcf35ea,2019-12-15 18:00:00 UTC,heat,hold,689,650,650,CA,Rancho Cucamonga,29,False,False,False,Gas
4,82f43755f89955288ec0cdd33c62ae5613d02043,2019-12-20 14:10:00 UTC,heat,auto,680,700,700,CA,San Ramon,16,True,False,False,Gas
6,c2d64a0ecdcc913a76b9c59f3541ccdd668bf7c2,2019-12-25 19:20:00 UTC,heat,auto,702,720,720,CA,Berkeley,107,False,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3967868,1aacca02aafee77b6c8c5b45e08eb56e71c2964c,2019-12-18 15:25:00 UTC,auto,hold,703,750,700,CA,Aliso Viejo,6,False,False,False,Gas
3967869,c970f1fe9757c1d483634072d8c352ec13a1e503,2019-12-20 07:55:00 UTC,heat,hold,695,705,705,CA,Los Angeles,0,False,False,False,Gas
3967870,d9b2b5536c7eeee442c05a78ca33d3e899a169fe,2019-12-05 13:10:00 UTC,auto,hold,617,780,620,CA,Fairfield,30,False,False,False,Gas
3967871,94ed5249f65dbb86eedce07230e02d6dfb560465,2019-12-02 19:45:00 UTC,heat,auto,704,710,710,CA,Oakland,90,False,False,False,Gas


In [175]:
# Tag every row with its year and month (both stored as strings) so they can
# serve as grouping keys in the aggregation.
dec_2019 = dec_2019.assign(Year="2019", Month="dec")

In [176]:
# Suffix the three temperature columns with "_ave" to label them as aggregates.
dec_2019 = dec_2019.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [177]:
# Average the numeric columns per thermostat / month / year / mode / event / city.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# non-numeric columns (date_time, ProvinceState, ...) silently and raises instead.
dec_2019_ave = dec_2019.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [178]:
# Write the December 2019 averages to disk; index=True keeps the group keys
# (the index of the aggregated frame) as leading columns in the file.
dec_2019_ave.to_csv(
    path_or_buf="data/day/CA/dec/dec_2019_ave.csv",
    index=True,
    header=True,
)

### 2020 December Day

In [179]:
# Load the raw December 2020 thermostat readings for CA.
dec_2020 = pd.read_csv(filepath_or_buffer="../data_large/CA-day/2020-dec-day-CA.csv")

In [180]:
# Remove predetermined outliers before aggregating.
# A row is dropped (in place) when either setpoint column,
# TemperatureExpectedCool or TemperatureExpectedHeat, is >= 850 or <= 600.
dec_2020.drop(
    index=dec_2020[
        (dec_2020[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] >= 850).any(axis=1)
        | (dec_2020[['TemperatureExpectedCool', 'TemperatureExpectedHeat']] <= 600).any(axis=1)
    ].index,
    inplace=True,
)

dec_2020

Unnamed: 0,Identifier,date_time,HvacMode,CalendarEvent,Temperature_ctrl,TemperatureExpectedCool,TemperatureExpectedHeat,ProvinceState,City,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump,Auxilliary_Heat_Fuel_Type
0,ff8e396bf4ed5de3f60fcc5af3c4a86f2231befc,2020-12-31 14:50:00 UTC,heat,hold,640,687,637,CA,Fresno,25,False,False,False,Gas
1,71def1c3d531989de9cd0cbb96c3a85bd4b85157,2020-12-01 19:25:00 UTC,auto,hold,706,829,709,CA,Cupertino,45,False,False,False,Gas
2,e26e72a49cb2a0aed7f2068a8319e579ec830763,2020-12-03 16:10:00 UTC,auto,hold,684,731,681,CA,Calabasas,40,False,False,False,Gas
3,ba976413a2c0048332821869f1a0780dcbd661fc,2020-12-18 07:00:00 UTC,heat,auto,736,731,731,CA,Alameda,5,False,False,False,Gas
5,9f1f363e3a4d4922038d8876e0d6266c043e606e,2020-12-02 18:10:00 UTC,heat,auto,679,665,660,CA,San Jose,49,True,False,False,Gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3264776,9fa838c06e73e6053f936c683284fa70f0283984,2020-12-10 19:15:00 UTC,auto,hold,705,760,690,CA,Huntington Beach,0,False,False,False,Gas
3264777,8a3c8a4de4f1f3be88c066bd73de717fb3a8a66a,2020-12-16 13:35:00 UTC,heat,hold,752,760,760,CA,Glendale,0,True,False,False,Gas
3264778,951ff4f6bfe0d1474b5aa7318059ac4c9dd6f602,2020-12-20 18:10:00 UTC,heat,hold,757,760,760,CA,Union City,30,True,False,False,Gas
3264779,c700d9664e1b86a83f1e3d2949e3f1d0e774d117,2020-12-25 18:15:00 UTC,heat,hold,696,760,760,CA,Los Angeles CA,120,True,False,False,Gas


In [181]:
# Tag every row with its year and month (both stored as strings) so they can
# serve as grouping keys in the aggregation.
dec_2020 = dec_2020.assign(Year="2020", Month="dec")

In [182]:
# Suffix the three temperature columns with "_ave" to label them as aggregates.
dec_2020 = dec_2020.rename(
    {
        "Temperature_ctrl": "Temperature_ctrl_ave",
        "TemperatureExpectedCool": "TemperatureExpectedCool_ave",
        "TemperatureExpectedHeat": "TemperatureExpectedHeat_ave",
    },
    axis="columns",
)

In [183]:
# Average the numeric columns per thermostat / month / year / mode / event / city.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# non-numeric columns (date_time, ProvinceState, ...) silently and raises instead.
dec_2020_ave = dec_2020.groupby(
    ['Identifier', 'Month', 'Year', 'HvacMode', 'CalendarEvent', 'City']
).mean(numeric_only=True)

In [184]:
# Write the December 2020 averages to disk; index=True keeps the group keys
# (the index of the aggregated frame) as leading columns in the file.
dec_2020_ave.to_csv(
    path_or_buf="data/day/CA/dec/dec_2020_ave.csv",
    index=True,
    header=True,
)

---

### Combine month CSV Files 
1. Read in the yearly average files from the month folder
2. Export as combined CSV

In [185]:
# List the CSV files in the December folder. os.listdir returns names in arbitrary
# order, so they are sorted to make the combined row order reproducible.
files = sorted(f for f in os.listdir("data/day/CA/dec/") if f.endswith(".csv"))

In [186]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every yearly-average file for December, then concatenate once. Calling
# pd.concat inside the loop re-copies all accumulated rows on each pass and
# starts from an empty frame, which newer pandas versions warn about.
frames = [pd.read_csv("data/day/CA/dec/" + file) for file in files]

# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the loop-based version produced when the folder had no CSV files.
CA_dec = pd.concat(frames) if frames else pd.DataFrame()

CA_dec

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,001cb18881c5ace2d453720abb0d8fe710e80a29,dec,2017,heat,hold,San Diego,743.500000,750.000000,740.833333,50.0,False,False,False
1,0029564391031ecb03549e6389960db4eed71126,dec,2017,auto,auto,San Gabriel,673.000000,760.000000,681.285714,90.0,False,False,False
2,0029564391031ecb03549e6389960db4eed71126,dec,2017,auto,hold,San Gabriel,703.111111,760.000000,695.283951,90.0,False,False,False
3,0029564391031ecb03549e6389960db4eed71126,dec,2017,heat,hold,San Gabriel,706.818182,760.000000,710.000000,90.0,False,False,False
4,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,dec,2017,heat,auto,San Marcos,704.126582,704.571730,703.221519,15.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7753,ffdb74abccd27a50a47d411800653c4b0e702b49,dec,2020,heat,hold,San Diego,677.742545,669.472167,669.472167,9.0,True,False,False
7754,ffe0ca2d4420d9d31d0fd8fa1b971cf43f393141,dec,2020,heat,auto,Chico,708.833333,722.166667,716.416667,0.0,False,False,False
7755,ffe0ca2d4420d9d31d0fd8fa1b971cf43f393141,dec,2020,heat,hold,Chico,708.320000,709.120000,705.280000,0.0,False,False,False
7756,fff55efcffca0c40f120b9fd24d8d3b88dcef965,dec,2020,auto,auto,Winchester,676.403909,788.071661,682.537459,10.0,False,False,False


In [187]:
# Save the combined December table; the row index is not written to the file.
CA_dec.to_csv(
    path_or_buf="Scraper_Output/State_Month_Day/CA/CA_dec.csv",
    index=False,
    header=True,
)

----

----

---

### Combine state CSV Files 
1. Read in files in folders for each state
2. Export as combined CSV

In [188]:
# List the per-month CSV files for CA. os.listdir returns names in arbitrary
# order, so they are sorted to make the combined row order reproducible.
files = sorted(
    f for f in os.listdir("Scraper_Output/State_Month_Day/CA/") if f.endswith(".csv")
)

In [189]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every per-month file for CA, then concatenate once. Calling pd.concat
# inside the loop re-copies all accumulated rows on each pass and starts from
# an empty frame, which newer pandas versions warn about.
frames = [pd.read_csv("Scraper_Output/State_Month_Day/CA/" + file) for file in files]

# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the loop-based version produced when the folder had no CSV files.
CA_all = pd.concat(frames) if frames else pd.DataFrame()

CA_all

Unnamed: 0,Identifier,Month,Year,HvacMode,CalendarEvent,City,Temperature_ctrl_ave,TemperatureExpectedCool_ave,TemperatureExpectedHeat_ave,Age_of_Home__years_,allowCompWithAux,Has_Electric,Has_a_Heat_Pump
0,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,aug,2017,cool,auto,San Marcos,752.976190,752.535714,680.000000,15.0,False,False,False
1,002aab87db1d9e8ce470f80f23c97c59de5f4dcd,aug,2017,cool,hold,San Marcos,763.400000,770.000000,750.000000,15.0,False,False,False
2,0050d013d4928f706c90379b28f9ab257178f1a8,aug,2017,cool,auto,Dublin,768.066225,818.357616,626.370861,5.0,False,False,False
3,0050d013d4928f706c90379b28f9ab257178f1a8,aug,2017,cool,hold,Dublin,766.625000,770.000000,770.000000,5.0,False,False,False
4,007f930abce4596b2214cd79a09714b3d482e17d,aug,2017,cool,auto,Escondido,765.244681,770.329787,630.000000,0.0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27284,ff573f53253c7fcae2af9ffaae465435f5e773ca,jun,2021,cool,hold,Los Angeles,744.945736,745.000000,745.000000,60.0,False,False,False
27285,ff7fa20dc4a160fba797e8a11b8b47ccc8494686,jun,2021,cool,hold,San Diego,770.000000,705.000000,720.000000,55.0,False,False,False
27286,ffab10c348bdc935f2b26c06b99e75d237f5cead,jun,2021,auto,hold,Visalia,790.095440,805.426892,651.884344,97.0,False,False,False
27287,ffd99f8a17ac8f96739ceba7c0feca750b886605,jun,2021,auto,hold,San Diego,742.539216,743.565359,668.496732,40.0,False,False,False


In [190]:
# Save the combined all-months table for CA; the row index is not written to the file.
CA_all.to_csv(
    path_or_buf="Scraper_Output/State_Month_Day/CA_all_day.csv",
    index=False,
    header=True,
)

In [191]:
# Data check to make sure the state was selected correctly in the BQ SQL queries:
# print the distinct ProvinceState values found in every monthly frame.
monthly_frames = {
    "jan_2017": jan_2017, "jan_2018": jan_2018, "jan_2019": jan_2019,
    "jan_2020": jan_2020, "jan_2021": jan_2021,
    "feb_2017": feb_2017, "feb_2018": feb_2018, "feb_2019": feb_2019,
    "feb_2020": feb_2020, "feb_2021": feb_2021,
    "jun_2017": jun_2017, "jun_2018": jun_2018, "jun_2019": jun_2019,
    "jun_2020": jun_2020, "jun_2021": jun_2021,
    "jul_2017": jul_2017, "jul_2018": jul_2018, "jul_2019": jul_2019,
    "jul_2020": jul_2020, "jul_2021": jul_2021,
    "aug_2017": aug_2017, "aug_2018": aug_2018, "aug_2019": aug_2019,
    "aug_2020": aug_2020,
    "dec_2017": dec_2017, "dec_2018": dec_2018, "dec_2019": dec_2019,
    "dec_2020": dec_2020,
}

for label, frame in monthly_frames.items():
    print(f"Unique {label}: {frame['ProvinceState'].unique()}")

Unique jan_2017: ['CA']
Unique jan_2018: ['CA']
Unique jan_2019: ['CA']
Unique jan_2020: ['CA']
Unique jan_2021: ['CA']
Unique feb_2017: ['CA']
Unique feb_2018: ['CA']
Unique feb_2019: ['CA']
Unique feb_2020: ['CA']
Unique feb_2021: ['CA']
Unique jun_2017: ['CA']
Unique jun_2018: ['CA']
Unique jun_2019: ['CA']
Unique jun_2020: ['CA']
Unique jun_2021: ['CA']
Unique jul_2017: ['CA']
Unique jul_2018: ['CA']
Unique jul_2019: ['CA']
Unique jul_2020: ['CA']
Unique jul_2021: ['CA']
Unique aug_2017: ['CA']
Unique aug_2018: ['CA']
Unique aug_2019: ['CA']
Unique aug_2020: ['CA']
Unique dec_2017: ['CA']
Unique dec_2018: ['CA']
Unique dec_2019: ['CA']
Unique dec_2020: ['CA']
