# DYD Thermostat Data 

## Preprocess

1. Generated CSV files from queries in BigQuery

2. Data separated into years (2017, 2018, 2019, 2020)

3. Aggregated the data in BigQuery by vintage of home (every 5 years)



In [1]:
# Dependencies
import pandas as pd
import os
import numpy as np
from pathlib import Path
from datetime import datetime

---

## Combine Annual CSV Files
1. Add a date to each merged file: the summer solstice (06/21) for the summer data and the winter date (12/21) for the winter data.
2. Merge the summer and winter files for each year

# 2017 Summer

In [2]:
# Collect the summer 2017 CSV file names. os.listdir returns entries in
# arbitrary order, so sort them to keep the concatenated row order reproducible.
files = sorted(f for f in os.listdir("data/2017/summer/") if f.endswith(".csv"))

# files

In [3]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call
# (concatenating inside the loop re-copies the accumulated frame on every pass).
frames = [pd.read_csv("data/2017/summer/" + name) for name in files]

# pd.concat raises on an empty list; fall back to an empty frame when no files were found
summer_2017 = pd.concat(frames) if frames else pd.DataFrame()

summer_2017

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent
0,751.789216,751.372549,751.372549,d761e9f22aca4be5390b4c28dc5853556549f07b,US,AB,0,Home,hold
1,773.750000,772.500000,760.250000,ff5aa8f5b0f3c532a276504417b65e822c593794,US,AL,0,Home,hold
2,722.166667,721.720588,721.720588,2a2d29f3dc6cfb557b057a1c3d039bd66a640ce3,US,AL,0,Home,hold
3,722.072727,720.927273,718.909091,b57a70fe011f0ca2070f85221ea90fe4952790eb,US,AL,0,Home,hold
4,734.846154,738.846154,738.804734,01d4384fb6e0bc8f9d2d347ac245408fed66e25a,US,AL,0,Home,hold
...,...,...,...,...,...,...,...,...,...
2138,781.010870,780.836957,780.836957,f19f60d5fa93f4c9f89feba311935642285356d6,US,TX,120,Home,hold
2139,790.250000,789.600000,789.600000,f19f60d5fa93f4c9f89feba311935642285356d6,US,TX,120,Home,auto
2140,784.136986,780.068493,709.767123,58b9cb6b3c3f8a56eb691093ea1cac439c4f8898,US,TX,120,Home,hold
2141,763.666667,721.762712,654.203390,99505e83dbcacbb5e8f0b6ec9234530fde442164,US,UT,120,Home,hold


In [4]:
# List the distinct ProvinceState codes present before filtering
summer_2017["ProvinceState"].unique()

array(['AB', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'IA', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'MI', 'MN', 'MO',
       'MS', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK',
       'OR', 'PA', 'RI', 'SC', 'TN', 'TX', 'UT', 'VA', 'WA', 'WI', 'WV',
       'ME', 'SD', 'AK', 'VT', 'ID', 'QC', 'MT', 'PR', 'WY', 'HI'],
      dtype=object)

In [5]:
# Filter out the ProvinceState codes the original author flagged as "weird"
# (presumably Canadian provinces/territories plus VI -- TODO confirm).
# Rows with a missing ProvinceState pass this filter unchanged.
excluded_codes = ["AB", "VI", "ON", "NB", "YT", "QC"]
is_excluded = summer_2017["ProvinceState"].isin(excluded_codes)
summer_2017 = summer_2017[~is_excluded]

In [6]:
# Re-check the distinct ProvinceState codes after the exclusion filter
summer_2017["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'IA',
       'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'MI', 'MN', 'MO', 'MS',
       'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR',
       'PA', 'RI', 'SC', 'TN', 'TX', 'UT', 'VA', 'WA', 'WI', 'WV', 'ME',
       'SD', 'AK', 'VT', 'ID', 'MT', 'PR', 'WY', 'HI'], dtype=object)

In [7]:
# Discard every row that has a missing value in any column
summer_2017 = summer_2017.dropna(axis=0, how="any")

In [8]:
# Stamp every row with the 2017 summer solstice date (stored as an MM/DD/YYYY string)
summer_2017 = summer_2017.assign(Date="06/21/2017")

summer_2017

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
1,773.750000,772.500000,760.250000,ff5aa8f5b0f3c532a276504417b65e822c593794,US,AL,0,Home,hold,06/21/2017
2,722.166667,721.720588,721.720588,2a2d29f3dc6cfb557b057a1c3d039bd66a640ce3,US,AL,0,Home,hold,06/21/2017
3,722.072727,720.927273,718.909091,b57a70fe011f0ca2070f85221ea90fe4952790eb,US,AL,0,Home,hold,06/21/2017
4,734.846154,738.846154,738.804734,01d4384fb6e0bc8f9d2d347ac245408fed66e25a,US,AL,0,Home,hold,06/21/2017
5,774.434783,786.956522,673.043478,ff5aa8f5b0f3c532a276504417b65e822c593794,US,AL,0,Home,auto,06/21/2017
...,...,...,...,...,...,...,...,...,...,...
2138,781.010870,780.836957,780.836957,f19f60d5fa93f4c9f89feba311935642285356d6,US,TX,120,Home,hold,06/21/2017
2139,790.250000,789.600000,789.600000,f19f60d5fa93f4c9f89feba311935642285356d6,US,TX,120,Home,auto,06/21/2017
2140,784.136986,780.068493,709.767123,58b9cb6b3c3f8a56eb691093ea1cac439c4f8898,US,TX,120,Home,hold,06/21/2017
2141,763.666667,721.762712,654.203390,99505e83dbcacbb5e8f0b6ec9234530fde442164,US,UT,120,Home,hold,06/21/2017


# 2017 Winter

In [9]:
# Collect the winter 2017 CSV file names. os.listdir returns entries in
# arbitrary order, so sort them to keep the concatenated row order reproducible.
files = sorted(f for f in os.listdir("data/2017/winter/") if f.endswith(".csv"))

# files

In [10]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call
# (concatenating inside the loop re-copies the accumulated frame on every pass).
frames = [pd.read_csv("data/2017/winter/" + name) for name in files]

# pd.concat raises on an empty list; fall back to an empty frame when no files were found
winter_2017 = pd.concat(frames) if frames else pd.DataFrame()

winter_2017

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent
0,685.666667,699.505747,699.505747,42cf544eb2c5286af12e3146cf4ee1d31faec762,US,AL,0,Home,hold
1,724.500000,719.062500,719.062500,7736970d2c42b1ba08f14dd51e23c5eda7fc74a2,US,AL,0,Home,auto
2,687.670139,681.812500,681.812500,76adfc0b71b53cf860f44694f636ec41667dde79,US,AL,0,Home,hold
3,693.023256,751.511628,693.023256,113ff6024187ee120b0a37d1f32312cc1082f2a9,US,AL,0,Home,hold
4,649.136364,650.454545,600.409091,1ac98223b24bd6122495ad447cfcfe02fa2bba7e,US,AL,0,Home,auto
...,...,...,...,...,...,...,...,...,...
3870,553.000000,650.000000,608.000000,dff152b834051b8c400a572af119b26eb79c03c6,US,VT,120,Home,hold
3871,669.400000,695.400000,695.400000,6a5c72f744242e0fe1b5c89734fb53e692a08655,US,VT,120,Home,hold
3872,679.594595,680.648649,679.513514,840997521015e92d1a4fe9ed4f08b71e96a9f2b9,US,WI,120,Home,hold
3873,657.711111,652.666667,645.000000,4949e2207e8f59c14d94cb0938fa6195ab0c455a,US,WI,120,Home,hold


In [11]:
# List the distinct ProvinceState codes present before filtering
winter_2017["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'IA',
       'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
       'MO', 'MS', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
       'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
       'WA', 'WI', 'WV', 'WY', 'AB', 'VT', 'QC', 'MT', 'AK', 'HI'],
      dtype=object)

In [12]:
# Filter out the ProvinceState codes the original author flagged as "weird"
# (presumably Canadian provinces/territories plus VI -- TODO confirm).
# Rows with a missing ProvinceState pass this filter unchanged.
excluded_codes = ["AB", "VI", "ON", "NB", "YT", "QC"]
is_excluded = winter_2017["ProvinceState"].isin(excluded_codes)
winter_2017 = winter_2017[~is_excluded]

In [13]:
# Re-check the distinct ProvinceState codes after the exclusion filter
winter_2017["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'IA',
       'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
       'MO', 'MS', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
       'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
       'WA', 'WI', 'WV', 'WY', 'VT', 'MT', 'AK', 'HI'], dtype=object)

In [14]:
# Discard every row that has a missing value in any column
winter_2017 = winter_2017.dropna(axis=0, how="any")

In [15]:
# Stamp every row with the 2017 winter date, 12/21 (stored as an MM/DD/YYYY string)
winter_2017 = winter_2017.assign(Date="12/21/2017")

winter_2017

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
0,685.666667,699.505747,699.505747,42cf544eb2c5286af12e3146cf4ee1d31faec762,US,AL,0,Home,hold,12/21/2017
1,724.500000,719.062500,719.062500,7736970d2c42b1ba08f14dd51e23c5eda7fc74a2,US,AL,0,Home,auto,12/21/2017
2,687.670139,681.812500,681.812500,76adfc0b71b53cf860f44694f636ec41667dde79,US,AL,0,Home,hold,12/21/2017
3,693.023256,751.511628,693.023256,113ff6024187ee120b0a37d1f32312cc1082f2a9,US,AL,0,Home,hold,12/21/2017
4,649.136364,650.454545,600.409091,1ac98223b24bd6122495ad447cfcfe02fa2bba7e,US,AL,0,Home,auto,12/21/2017
...,...,...,...,...,...,...,...,...,...,...
3870,553.000000,650.000000,608.000000,dff152b834051b8c400a572af119b26eb79c03c6,US,VT,120,Home,hold,12/21/2017
3871,669.400000,695.400000,695.400000,6a5c72f744242e0fe1b5c89734fb53e692a08655,US,VT,120,Home,hold,12/21/2017
3872,679.594595,680.648649,679.513514,840997521015e92d1a4fe9ed4f08b71e96a9f2b9,US,WI,120,Home,hold,12/21/2017
3873,657.711111,652.666667,645.000000,4949e2207e8f59c14d94cb0938fa6195ab0c455a,US,WI,120,Home,hold,12/21/2017


In [16]:
# Stack the summer rows followed by the winter rows and renumber the index from 0.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
All2017 = pd.concat([summer_2017, winter_2017], ignore_index=True)

In [17]:
# Preview the merged 2017 frame (summer rows first, then winter rows)
All2017

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
0,773.750000,772.500000,760.250000,ff5aa8f5b0f3c532a276504417b65e822c593794,US,AL,0,Home,hold,06/21/2017
1,722.166667,721.720588,721.720588,2a2d29f3dc6cfb557b057a1c3d039bd66a640ce3,US,AL,0,Home,hold,06/21/2017
2,722.072727,720.927273,718.909091,b57a70fe011f0ca2070f85221ea90fe4952790eb,US,AL,0,Home,hold,06/21/2017
3,734.846154,738.846154,738.804734,01d4384fb6e0bc8f9d2d347ac245408fed66e25a,US,AL,0,Home,hold,06/21/2017
4,774.434783,786.956522,673.043478,ff5aa8f5b0f3c532a276504417b65e822c593794,US,AL,0,Home,auto,06/21/2017
...,...,...,...,...,...,...,...,...,...,...
37496,553.000000,650.000000,608.000000,dff152b834051b8c400a572af119b26eb79c03c6,US,VT,120,Home,hold,12/21/2017
37497,669.400000,695.400000,695.400000,6a5c72f744242e0fe1b5c89734fb53e692a08655,US,VT,120,Home,hold,12/21/2017
37498,679.594595,680.648649,679.513514,840997521015e92d1a4fe9ed4f08b71e96a9f2b9,US,WI,120,Home,hold,12/21/2017
37499,657.711111,652.666667,645.000000,4949e2207e8f59c14d94cb0938fa6195ab0c455a,US,WI,120,Home,hold,12/21/2017


In [18]:
# Export the merged 2017 frame to CSV with a header row and without the index column.
# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs("Scraper_Output", exist_ok=True)
All2017.to_csv("Scraper_Output/2017-US.csv", header=True, index=False)

# 2018 Summer

In [19]:
# Collect the summer 2018 CSV file names. os.listdir returns entries in
# arbitrary order, so sort them to keep the concatenated row order reproducible.
files = sorted(f for f in os.listdir("data/2018/summer/") if f.endswith(".csv"))

# files

In [20]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call
# (concatenating inside the loop re-copies the accumulated frame on every pass).
frames = [pd.read_csv("data/2018/summer/" + name) for name in files]

# pd.concat raises on an empty list; fall back to an empty frame when no files were found
summer_2018 = pd.concat(frames) if frames else pd.DataFrame()

summer_2018

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent
0,671.057143,658.438095,658.438095,6e0c846955b44158b1df0c9d8bc3369c2c008b54,US,AL,0,Home,auto
1,723.427536,736.521739,736.521739,e53951da81d6faaf29e0c9b8c132e6d7fe8efe74,US,AL,0,Home,auto
2,727.063725,722.813725,722.813725,90a22524918723670df60f9904073a4836be3417,US,AL,0,Home,hold
3,719.048780,714.065041,733.642276,7736970d2c42b1ba08f14dd51e23c5eda7fc74a2,US,AL,0,Home,auto
4,689.500000,719.375000,659.687500,adaf55267563a067b9ccf8d074d692030caeeead,US,AL,0,Home,hold
...,...,...,...,...,...,...,...,...,...
4357,682.006944,680.402778,679.951389,99505e83dbcacbb5e8f0b6ec9234530fde442164,US,UT,120,Home,hold
4358,707.379747,650.506329,650.101266,87bb045fe5e6ea88a72285a568c78d6d6d3875d8,US,UT,120,Home,hold
4359,713.660000,820.000000,600.000000,463e091043b222b3e88af924abd32a8b3e5042e2,US,VT,120,Home,auto
4360,705.122549,680.000000,680.000000,ef48201165ee7b066fdd68f2927860cd048658f7,US,VT,120,Home,hold


In [21]:
# List the distinct ProvinceState codes present before filtering
summer_2018["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'MI', 'MN',
       'MO', 'MS', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
       'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT',
       'WA', 'WI', 'WV', 'WY', 'AB', 'MT', 'QC', 'AK', 'ME', 'PR'],
      dtype=object)

In [22]:
# Filter out the ProvinceState codes the original author flagged as "weird"
# (presumably Canadian provinces/territories plus VI -- TODO confirm).
# Rows with a missing ProvinceState pass this filter unchanged.
excluded_codes = ["AB", "VI", "ON", "NB", "YT", "QC"]
is_excluded = summer_2018["ProvinceState"].isin(excluded_codes)
summer_2018 = summer_2018[~is_excluded]

In [23]:
# Re-check the distinct ProvinceState codes after the exclusion filter
summer_2018["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'MI', 'MN',
       'MO', 'MS', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
       'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT',
       'WA', 'WI', 'WV', 'WY', 'MT', 'AK', 'ME', 'PR'], dtype=object)

In [24]:
# Discard every row that has a missing value in any column
summer_2018 = summer_2018.dropna(axis=0, how="any")

In [25]:
# Stamp every row with the 2018 summer solstice date (stored as an MM/DD/YYYY string)
summer_2018 = summer_2018.assign(Date="06/21/2018")

summer_2018

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
0,671.057143,658.438095,658.438095,6e0c846955b44158b1df0c9d8bc3369c2c008b54,US,AL,0,Home,auto,06/21/2018
1,723.427536,736.521739,736.521739,e53951da81d6faaf29e0c9b8c132e6d7fe8efe74,US,AL,0,Home,auto,06/21/2018
2,727.063725,722.813725,722.813725,90a22524918723670df60f9904073a4836be3417,US,AL,0,Home,hold,06/21/2018
3,719.048780,714.065041,733.642276,7736970d2c42b1ba08f14dd51e23c5eda7fc74a2,US,AL,0,Home,auto,06/21/2018
4,689.500000,719.375000,659.687500,adaf55267563a067b9ccf8d074d692030caeeead,US,AL,0,Home,hold,06/21/2018
...,...,...,...,...,...,...,...,...,...,...
4357,682.006944,680.402778,679.951389,99505e83dbcacbb5e8f0b6ec9234530fde442164,US,UT,120,Home,hold,06/21/2018
4358,707.379747,650.506329,650.101266,87bb045fe5e6ea88a72285a568c78d6d6d3875d8,US,UT,120,Home,hold,06/21/2018
4359,713.660000,820.000000,600.000000,463e091043b222b3e88af924abd32a8b3e5042e2,US,VT,120,Home,auto,06/21/2018
4360,705.122549,680.000000,680.000000,ef48201165ee7b066fdd68f2927860cd048658f7,US,VT,120,Home,hold,06/21/2018


# 2018 Winter

In [26]:
# Collect the winter 2018 CSV file names. os.listdir returns entries in
# arbitrary order, so sort them to keep the concatenated row order reproducible.
files = sorted(f for f in os.listdir("data/2018/winter/") if f.endswith(".csv"))

# files

In [27]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call
# (concatenating inside the loop re-copies the accumulated frame on every pass).
frames = [pd.read_csv("data/2018/winter/" + name) for name in files]

# pd.concat raises on an empty list; fall back to an empty frame when no files were found
winter_2018 = pd.concat(frames) if frames else pd.DataFrame()

winter_2018

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent
0,712.210526,716.473684,719.000000,6205ac4016e84a3ececb4497ecebaae4562994a2,US,AL,0,Home,hold
1,710.392361,734.461806,684.461806,d134f189ab8e0994c2700d6ee3386bdae27ad070,US,AL,0,Home,hold
2,729.261111,783.833333,731.277778,f7413b91711ef42f10f957bd3da07ac9aa595fb9,US,AL,0,Home,hold
3,688.454545,746.727273,694.000000,5390ca096ac4b13e25fc5f63d3704ff5401d8c04,US,AL,0,Home,hold
4,704.188889,704.177778,704.177778,89164eb8a72a45c0a929cf6413e4cb19b2f411e4,US,AL,0,Home,hold
...,...,...,...,...,...,...,...,...,...
5761,619.444444,650.000000,640.000000,2a41fbabce7e2de5b4604808f90017db24700f6e,US,VT,120,Home,auto
5762,628.408377,664.565445,638.848168,dff152b834051b8c400a572af119b26eb79c03c6,US,VT,120,Home,hold
5763,671.390244,691.634146,689.634146,463e091043b222b3e88af924abd32a8b3e5042e2,US,VT,120,Home,hold
5764,677.193548,680.000000,680.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,hold


In [28]:
# List the distinct ProvinceState codes present before filtering
winter_2018["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI',
       'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV',
       'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT',
       'VA', 'VT', 'WA', 'WI', 'WV', 'AB', 'QC', 'AK', 'WY', 'PR', 'NB',
       nan], dtype=object)

In [29]:
# Filter out the ProvinceState codes the original author flagged as "weird"
# (presumably Canadian provinces/territories plus VI -- TODO confirm).
# Rows with a missing ProvinceState pass this filter unchanged.
excluded_codes = ["AB", "VI", "ON", "NB", "YT", "QC"]
is_excluded = winter_2018["ProvinceState"].isin(excluded_codes)
winter_2018 = winter_2018[~is_excluded]

In [30]:
# Re-check the distinct ProvinceState codes after the exclusion filter
# (missing values are not removed by that filter and may still be listed)
winter_2018["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI',
       'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV',
       'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT',
       'VA', 'VT', 'WA', 'WI', 'WV', 'AK', 'WY', 'PR', nan], dtype=object)

In [31]:
# Discard every row that has a missing value in any column
# (this also removes rows whose ProvinceState is missing)
winter_2018 = winter_2018.dropna(axis=0, how="any")

In [32]:
# Stamp every row with the 2018 winter date, 12/21 (stored as an MM/DD/YYYY string)
winter_2018 = winter_2018.assign(Date="12/21/2018")

winter_2018

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
0,712.210526,716.473684,719.000000,6205ac4016e84a3ececb4497ecebaae4562994a2,US,AL,0,Home,hold,12/21/2018
1,710.392361,734.461806,684.461806,d134f189ab8e0994c2700d6ee3386bdae27ad070,US,AL,0,Home,hold,12/21/2018
2,729.261111,783.833333,731.277778,f7413b91711ef42f10f957bd3da07ac9aa595fb9,US,AL,0,Home,hold,12/21/2018
3,688.454545,746.727273,694.000000,5390ca096ac4b13e25fc5f63d3704ff5401d8c04,US,AL,0,Home,hold,12/21/2018
4,704.188889,704.177778,704.177778,89164eb8a72a45c0a929cf6413e4cb19b2f411e4,US,AL,0,Home,hold,12/21/2018
...,...,...,...,...,...,...,...,...,...,...
5761,619.444444,650.000000,640.000000,2a41fbabce7e2de5b4604808f90017db24700f6e,US,VT,120,Home,auto,12/21/2018
5762,628.408377,664.565445,638.848168,dff152b834051b8c400a572af119b26eb79c03c6,US,VT,120,Home,hold,12/21/2018
5763,671.390244,691.634146,689.634146,463e091043b222b3e88af924abd32a8b3e5042e2,US,VT,120,Home,hold,12/21/2018
5764,677.193548,680.000000,680.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,hold,12/21/2018


In [33]:
# Stack the summer rows followed by the winter rows and renumber the index from 0.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
All2018 = pd.concat([summer_2018, winter_2018], ignore_index=True)

In [34]:
# Preview the merged 2018 frame (summer rows first, then winter rows)
All2018

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
0,671.057143,658.438095,658.438095,6e0c846955b44158b1df0c9d8bc3369c2c008b54,US,AL,0,Home,auto,06/21/2018
1,723.427536,736.521739,736.521739,e53951da81d6faaf29e0c9b8c132e6d7fe8efe74,US,AL,0,Home,auto,06/21/2018
2,727.063725,722.813725,722.813725,90a22524918723670df60f9904073a4836be3417,US,AL,0,Home,hold,06/21/2018
3,719.048780,714.065041,733.642276,7736970d2c42b1ba08f14dd51e23c5eda7fc74a2,US,AL,0,Home,auto,06/21/2018
4,689.500000,719.375000,659.687500,adaf55267563a067b9ccf8d074d692030caeeead,US,AL,0,Home,hold,06/21/2018
...,...,...,...,...,...,...,...,...,...,...
64311,619.444444,650.000000,640.000000,2a41fbabce7e2de5b4604808f90017db24700f6e,US,VT,120,Home,auto,12/21/2018
64312,628.408377,664.565445,638.848168,dff152b834051b8c400a572af119b26eb79c03c6,US,VT,120,Home,hold,12/21/2018
64313,671.390244,691.634146,689.634146,463e091043b222b3e88af924abd32a8b3e5042e2,US,VT,120,Home,hold,12/21/2018
64314,677.193548,680.000000,680.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,hold,12/21/2018


In [35]:
# Export the merged 2018 frame to CSV with a header row and without the index column.
# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs("Scraper_Output", exist_ok=True)
All2018.to_csv("Scraper_Output/2018-US.csv", header=True, index=False)

# 2019 Summer

In [36]:
# Collect the summer 2019 CSV file names. os.listdir returns entries in
# arbitrary order, so sort them to keep the concatenated row order reproducible.
files = sorted(f for f in os.listdir("data/2019/summer/") if f.endswith(".csv"))

# files

In [37]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call
# (concatenating inside the loop re-copies the accumulated frame on every pass).
frames = [pd.read_csv("data/2019/summer/" + name) for name in files]

# pd.concat raises on an empty list; fall back to an empty frame when no files were found
summer_2019 = pd.concat(frames) if frames else pd.DataFrame()

summer_2019

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent
0,725.881579,725.328947,725.328947,1147542975a299d62794f420f6f2c2010c9afe24,US,AL,0,Home,auto
1,783.500000,729.500000,674.500000,59433a90a111551a01b93a41b293f89a44965e39,US,AL,0,Home,auto
2,729.900000,710.000000,647.000000,191a7b7beca9b009418c36a8c490c18c9b360d61,US,AL,0,Home,auto
3,727.924242,723.737374,723.737374,cfa4dd57be76041d249536ace521369e12bf07b5,US,AL,0,Home,auto
4,732.073529,730.323529,699.647059,d8e7d7fafd75dd8ee77ae93b7548a7b2d55fe513,US,AL,0,Home,auto
...,...,...,...,...,...,...,...,...,...
5462,688.500000,750.000000,690.750000,748db8c77fb545f1dacdea126b94bb0fa01e7ba3,US,WA,120,Home,hold
5463,695.370370,669.888889,670.111111,13705951259afff049e8388c52a54f168e64db46,US,WI,120,Home,hold
5464,738.648649,760.064865,689.837838,8c3b32073400f2b25ee858a80a5b49c36ee422c0,US,WI,120,Home,hold
5465,652.224490,650.000000,600.000000,4949e2207e8f59c14d94cb0938fa6195ab0c455a,US,WI,120,Home,auto


In [38]:
# List the distinct ProvinceState codes present before filtering
summer_2019["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI',
       'MN', 'MO', 'MS', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY',
       'OH', 'OK', 'OR', 'PA', 'PR', 'QC', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'MT', 'AB', 'ON', 'AK',
       'NB', nan], dtype=object)

In [39]:
# Filter out the ProvinceState codes the original author flagged as "weird"
# (presumably Canadian provinces/territories plus VI -- TODO confirm).
# Rows with a missing ProvinceState pass this filter unchanged.
excluded_codes = ["AB", "VI", "ON", "NB", "YT", "QC"]
is_excluded = summer_2019["ProvinceState"].isin(excluded_codes)
summer_2019 = summer_2019[~is_excluded]

In [40]:
# Re-check the distinct ProvinceState codes after the exclusion filter
# (missing values are not removed by that filter and may still be listed)
summer_2019["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI',
       'MN', 'MO', 'MS', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY',
       'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT',
       'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'MT', 'AK', nan], dtype=object)

In [41]:
# Discard every row that has a missing value in any column
# (this also removes rows whose ProvinceState is missing)
summer_2019 = summer_2019.dropna(axis=0, how="any")

In [42]:
# Stamp every row with the 2019 summer solstice date (stored as an MM/DD/YYYY string)
summer_2019 = summer_2019.assign(Date="06/21/2019")

summer_2019

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
0,725.881579,725.328947,725.328947,1147542975a299d62794f420f6f2c2010c9afe24,US,AL,0,Home,auto,06/21/2019
1,783.500000,729.500000,674.500000,59433a90a111551a01b93a41b293f89a44965e39,US,AL,0,Home,auto,06/21/2019
2,729.900000,710.000000,647.000000,191a7b7beca9b009418c36a8c490c18c9b360d61,US,AL,0,Home,auto,06/21/2019
3,727.924242,723.737374,723.737374,cfa4dd57be76041d249536ace521369e12bf07b5,US,AL,0,Home,auto,06/21/2019
4,732.073529,730.323529,699.647059,d8e7d7fafd75dd8ee77ae93b7548a7b2d55fe513,US,AL,0,Home,auto,06/21/2019
...,...,...,...,...,...,...,...,...,...,...
5462,688.500000,750.000000,690.750000,748db8c77fb545f1dacdea126b94bb0fa01e7ba3,US,WA,120,Home,hold,06/21/2019
5463,695.370370,669.888889,670.111111,13705951259afff049e8388c52a54f168e64db46,US,WI,120,Home,hold,06/21/2019
5464,738.648649,760.064865,689.837838,8c3b32073400f2b25ee858a80a5b49c36ee422c0,US,WI,120,Home,hold,06/21/2019
5465,652.224490,650.000000,600.000000,4949e2207e8f59c14d94cb0938fa6195ab0c455a,US,WI,120,Home,auto,06/21/2019


# 2019 Winter

In [43]:
# Collect the winter 2019 CSV file names. os.listdir returns entries in
# arbitrary order, so sort them to keep the concatenated row order reproducible.
files = sorted(f for f in os.listdir("data/2019/winter/") if f.endswith(".csv"))

# files

In [44]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call
# (concatenating inside the loop re-copies the accumulated frame on every pass).
frames = [pd.read_csv("data/2019/winter/" + name) for name in files]

# pd.concat raises on an empty list; fall back to an empty frame when no files were found
winter_2019 = pd.concat(frames) if frames else pd.DataFrame()

winter_2019

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent
0,739.049020,740.955882,740.955882,d761e9f22aca4be5390b4c28dc5853556549f07b,US,AB,0,Home,hold
1,695.432432,700.405405,699.837838,fda1f67e8423ed49933c45617ee96aef3bae04ba,US,AK,0,Home,auto
2,686.631579,690.157895,690.157895,0404ace1bcdf88d917bd860a04cd6c66fb51679f,US,AK,0,Home,hold
3,681.500000,850.000000,681.500000,c5a6a4cea737d0070ba4345b465cf1c27f6b5b8a,US,AL,0,Home,auto
4,717.250000,850.000000,710.000000,42ebc5d1a357442caba48b3ef0e0cf67e6bcef55,US,AL,0,Home,auto
...,...,...,...,...,...,...,...,...,...
7034,621.360000,650.000000,622.560000,fe4b4fe6f50d63b942ddfe83cbcfc08d51ef999b,US,VT,120,Home,hold
7035,673.000000,740.000000,710.000000,6d578aaea9293ac052c317e1dedd3f6f62dc49a4,US,WI,120,Home,auto
7036,687.854839,690.000000,690.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,hold
7037,673.441860,680.000000,680.000000,4949e2207e8f59c14d94cb0938fa6195ab0c455a,US,WI,120,Home,auto


In [45]:
# List the distinct ProvinceState codes present before filtering
winter_2019["ProvinceState"].unique()

array(['AB', 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
       'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD',
       'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ',
       'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD',
       'TN', 'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY', 'QC',
       'ON', 'NB', 'YT', nan], dtype=object)

In [46]:
# Filter out the ProvinceState codes the original author flagged as "weird"
# (presumably Canadian provinces/territories plus VI -- TODO confirm).
# Rows with a missing ProvinceState pass this filter unchanged.
excluded_codes = ["AB", "VI", "ON", "NB", "YT", "QC"]
is_excluded = winter_2019["ProvinceState"].isin(excluded_codes)
winter_2019 = winter_2019[~is_excluded]

In [47]:
# Re-check the distinct ProvinceState codes after the exclusion filter
# (missing values are not removed by that filter and may still be listed)
winter_2019["ProvinceState"].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', nan], dtype=object)

In [48]:
# Discard every row that has a missing value in any column
# (this also removes rows whose ProvinceState is missing)
winter_2019 = winter_2019.dropna(axis=0, how="any")

In [49]:
# Stamp every row with the 2019 winter date, 12/21 (stored as an MM/DD/YYYY string)
winter_2019 = winter_2019.assign(Date="12/21/2019")

winter_2019

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
1,695.432432,700.405405,699.837838,fda1f67e8423ed49933c45617ee96aef3bae04ba,US,AK,0,Home,auto,12/21/2019
2,686.631579,690.157895,690.157895,0404ace1bcdf88d917bd860a04cd6c66fb51679f,US,AK,0,Home,hold,12/21/2019
3,681.500000,850.000000,681.500000,c5a6a4cea737d0070ba4345b465cf1c27f6b5b8a,US,AL,0,Home,auto,12/21/2019
4,717.250000,850.000000,710.000000,42ebc5d1a357442caba48b3ef0e0cf67e6bcef55,US,AL,0,Home,auto,12/21/2019
5,640.151515,652.430303,643.600000,a4535ee96485739f83aa86206342d24a3602bfdb,US,AL,0,Home,auto,12/21/2019
...,...,...,...,...,...,...,...,...,...,...
7034,621.360000,650.000000,622.560000,fe4b4fe6f50d63b942ddfe83cbcfc08d51ef999b,US,VT,120,Home,hold,12/21/2019
7035,673.000000,740.000000,710.000000,6d578aaea9293ac052c317e1dedd3f6f62dc49a4,US,WI,120,Home,auto,12/21/2019
7036,687.854839,690.000000,690.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,hold,12/21/2019
7037,673.441860,680.000000,680.000000,4949e2207e8f59c14d94cb0938fa6195ab0c455a,US,WI,120,Home,auto,12/21/2019


In [50]:
# Stack the summer rows followed by the winter rows and renumber the index from 0.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
All2019 = pd.concat([summer_2019, winter_2019], ignore_index=True)

In [51]:
# Preview the merged 2019 frame (summer rows first, then winter rows)
All2019

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
0,725.881579,725.328947,725.328947,1147542975a299d62794f420f6f2c2010c9afe24,US,AL,0,Home,auto,06/21/2019
1,783.500000,729.500000,674.500000,59433a90a111551a01b93a41b293f89a44965e39,US,AL,0,Home,auto,06/21/2019
2,729.900000,710.000000,647.000000,191a7b7beca9b009418c36a8c490c18c9b360d61,US,AL,0,Home,auto,06/21/2019
3,727.924242,723.737374,723.737374,cfa4dd57be76041d249536ace521369e12bf07b5,US,AL,0,Home,auto,06/21/2019
4,732.073529,730.323529,699.647059,d8e7d7fafd75dd8ee77ae93b7548a7b2d55fe513,US,AL,0,Home,auto,06/21/2019
...,...,...,...,...,...,...,...,...,...,...
81666,621.360000,650.000000,622.560000,fe4b4fe6f50d63b942ddfe83cbcfc08d51ef999b,US,VT,120,Home,hold,12/21/2019
81667,673.000000,740.000000,710.000000,6d578aaea9293ac052c317e1dedd3f6f62dc49a4,US,WI,120,Home,auto,12/21/2019
81668,687.854839,690.000000,690.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,hold,12/21/2019
81669,673.441860,680.000000,680.000000,4949e2207e8f59c14d94cb0938fa6195ab0c455a,US,WI,120,Home,auto,12/21/2019


In [52]:
# Export the merged 2019 frame to CSV with a header row and without the index column.
# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs("Scraper_Output", exist_ok=True)
All2019.to_csv("Scraper_Output/2019-US.csv", header=True, index=False)

# 2020 Summer

In [53]:
# Collect the summer 2020 CSV file names. os.listdir returns entries in
# arbitrary order, so sort them to keep the concatenated row order reproducible.
files = sorted(f for f in os.listdir("data/2020/summer/") if f.endswith(".csv"))

# files

In [54]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every CSV named in `files` and stack them with a single concat call
# (concatenating inside the loop re-copies the accumulated frame on every pass).
frames = [pd.read_csv("data/2020/summer/" + name) for name in files]

# pd.concat raises on an empty list; fall back to an empty frame when no files were found
summer_2020 = pd.concat(frames) if frames else pd.DataFrame()

summer_2020

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent
0,738.630137,740.109589,740.109589,d761e9f22aca4be5390b4c28dc5853556549f07b,US,AB,0,Home,auto
1,758.312977,759.954198,759.954198,d761e9f22aca4be5390b4c28dc5853556549f07b,US,AB,0,Home,hold
2,743.500000,739.625000,690.791667,7736970d2c42b1ba08f14dd51e23c5eda7fc74a2,US,AL,0,Home,auto
3,741.857143,737.523810,737.523810,76adfc0b71b53cf860f44694f636ec41667dde79,US,AL,0,Home,auto
4,764.869565,742.739130,684.347826,9ccbfcdbdcd3a68e564b0babfc4d52d1b811f4e1,US,AL,0,Home,auto
...,...,...,...,...,...,...,...,...,...
6693,690.016667,690.000000,640.000000,1ff406ea8c28b72928f3cf6cac96b99a35bac63f,US,WA,120,Home,auto
6694,731.263158,705.631579,654.578947,748db8c77fb545f1dacdea126b94bb0fa01e7ba3,US,WA,120,Home,hold
6695,732.950000,720.650000,720.000000,6d578aaea9293ac052c317e1dedd3f6f62dc49a4,US,WI,120,Home,hold
6696,772.595238,770.357143,718.738095,e01ca0ca0d8f419572cc1f541e47f3e89b61e40a,US,WI,120,Home,hold


In [55]:
# List the distinct ProvinceState codes present before filtering
summer_2020["ProvinceState"].unique()

array(['AB', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'MI', 'MN',
       'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY',
       'OH', 'OK', 'OR', 'PA', 'PR', 'QC', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'ME', 'HI', 'VI', 'AK',
       'NB', 'YT', nan], dtype=object)

In [56]:
# Filter out the ProvinceState codes the original author flagged as "weird"
# (presumably Canadian provinces/territories plus VI -- TODO confirm).
# Rows with a missing ProvinceState pass this filter unchanged.
excluded_codes = ["AB", "VI", "ON", "NB", "YT", "QC"]
is_excluded = summer_2020["ProvinceState"].isin(excluded_codes)
summer_2020 = summer_2020[~is_excluded]

In [57]:
# Re-check the distinct ProvinceState codes after the exclusion filter
# (missing values are not removed by that filter and may still be listed)
summer_2020["ProvinceState"].unique()

array(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'IA',
       'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'MI', 'MN', 'MO',
       'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
       'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
       'VT', 'WA', 'WI', 'WV', 'WY', 'ME', 'HI', 'AK', nan], dtype=object)

In [58]:
# Discard every row that has a missing value in any column

summer_2020 = summer_2020.dropna(axis="index", how="any")

In [59]:
# Tag every summer row with the 2020 summer solstice date

summer_2020 = summer_2020.assign(Date="06/21/2020")

summer_2020

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
2,743.500000,739.625000,690.791667,7736970d2c42b1ba08f14dd51e23c5eda7fc74a2,US,AL,0,Home,auto,06/21/2020
3,741.857143,737.523810,737.523810,76adfc0b71b53cf860f44694f636ec41667dde79,US,AL,0,Home,auto,06/21/2020
4,764.869565,742.739130,684.347826,9ccbfcdbdcd3a68e564b0babfc4d52d1b811f4e1,US,AL,0,Home,auto,06/21/2020
5,802.000000,798.909091,763.000000,059c2332368620c4e331260e53eade3074c31ddd,US,AL,0,Home,auto,06/21/2020
6,764.890173,777.606936,776.653179,a69542da0d49750ffd1bebb47e2a80e61f1b32e7,US,AL,0,Home,auto,06/21/2020
...,...,...,...,...,...,...,...,...,...,...
6693,690.016667,690.000000,640.000000,1ff406ea8c28b72928f3cf6cac96b99a35bac63f,US,WA,120,Home,auto,06/21/2020
6694,731.263158,705.631579,654.578947,748db8c77fb545f1dacdea126b94bb0fa01e7ba3,US,WA,120,Home,hold,06/21/2020
6695,732.950000,720.650000,720.000000,6d578aaea9293ac052c317e1dedd3f6f62dc49a4,US,WI,120,Home,hold,06/21/2020
6696,772.595238,770.357143,718.738095,e01ca0ca0d8f419572cc1f541e47f3e89b61e40a,US,WI,120,Home,hold,06/21/2020


# 2020 Winter

In [60]:
# Collect the names of the CSV files in the 2020 winter directory.
# os.listdir returns entries in arbitrary order, so sort the names to keep the
# row order of the combined dataframe (and the exported CSV) the same on every run.
files = sorted(f for f in os.listdir("data/2020/winter/") if f.endswith(".csv"))

In [61]:
# https://stackoverflow.com/questions/63886787/how-to-create-a-dataframe-from-multiple-csv-files
# Read every 2020 winter CSV named in `files` and stack them into one dataframe.
# The frames are concatenated in a single call instead of growing the result
# inside a loop, which re-copies the accumulated rows on every iteration.
winter_frames = [pd.read_csv("data/2020/winter/" + file) for file in files]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# dataframe when no CSV files were found.
winter_2020 = pd.concat(winter_frames) if winter_frames else pd.DataFrame()

winter_2020

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent
0,722.216216,739.243243,739.243243,a79c42312360f12bf571e48a6dff8c0795a086c1,US,AB,0,Home,hold
1,688.575758,691.818182,690.636364,0404ace1bcdf88d917bd860a04cd6c66fb51679f,US,AK,0,Home,hold
2,645.000000,749.571429,691.285714,f67e6f30dd6870bbdc77462463dc6bf05169b855,US,AL,0,Home,auto
3,676.057143,677.123810,677.123810,6e0c846955b44158b1df0c9d8bc3369c2c008b54,US,AL,0,Home,auto
4,660.533835,752.556391,619.954887,1147542975a299d62794f420f6f2c2010c9afe24,US,AL,0,Home,auto
...,...,...,...,...,...,...,...,...,...
6432,603.270833,650.000000,610.000000,2a41fbabce7e2de5b4604808f90017db24700f6e,US,VT,120,Home,auto
6433,687.761111,740.000000,687.150000,1ff406ea8c28b72928f3cf6cac96b99a35bac63f,US,WA,120,Home,auto
6434,677.553763,680.000000,680.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,auto
6435,693.846154,707.538462,703.076923,75910578db7bd09526fdb3c6b2faf03d04402f83,US,WI,120,Home,hold


In [62]:
# List the distinct ProvinceState codes present in the raw 2020 winter data

winter_2020["ProvinceState"].unique()

array(['AB', 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
       'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD',
       'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ',
       'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD',
       'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'VI', 'QC',
       'NB', 'YT', nan], dtype=object)

In [63]:
# Drop rows whose ProvinceState is one of the excluded codes, using a single mask.
# NOTE(review): AB, ON, NB, YT and QC look like Canadian province/territory
# abbreviations and VI like the U.S. Virgin Islands - confirm that is the reason.
# Rows with a missing ProvinceState are not matched by isin, so they are kept here.

winter_2020 = winter_2020[
    ~winter_2020["ProvinceState"].isin(["AB", "VI", "ON", "NB", "YT", "QC"])
]

In [64]:
# Re-list the distinct ProvinceState codes to check the excluded ones are gone

winter_2020["ProvinceState"].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
       'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', nan], dtype=object)

In [65]:
# Discard every row that has a missing value in any column

winter_2020 = winter_2020.dropna(axis="index", how="any")

In [66]:
# Tag every winter row with the 2020 winter solstice date

winter_2020 = winter_2020.assign(Date="12/21/2020")

winter_2020

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
1,688.575758,691.818182,690.636364,0404ace1bcdf88d917bd860a04cd6c66fb51679f,US,AK,0,Home,hold,12/21/2020
2,645.000000,749.571429,691.285714,f67e6f30dd6870bbdc77462463dc6bf05169b855,US,AL,0,Home,auto,12/21/2020
3,676.057143,677.123810,677.123810,6e0c846955b44158b1df0c9d8bc3369c2c008b54,US,AL,0,Home,auto,12/21/2020
4,660.533835,752.556391,619.954887,1147542975a299d62794f420f6f2c2010c9afe24,US,AL,0,Home,auto,12/21/2020
5,710.612613,710.054054,700.072072,07a3125c24838cfe7e7a21bc65da7672e22f1582,US,AL,0,Home,auto,12/21/2020
...,...,...,...,...,...,...,...,...,...,...
6432,603.270833,650.000000,610.000000,2a41fbabce7e2de5b4604808f90017db24700f6e,US,VT,120,Home,auto,12/21/2020
6433,687.761111,740.000000,687.150000,1ff406ea8c28b72928f3cf6cac96b99a35bac63f,US,WA,120,Home,auto,12/21/2020
6434,677.553763,680.000000,680.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,auto,12/21/2020
6435,693.846154,707.538462,703.076923,75910578db7bd09526fdb3c6b2faf03d04402f83,US,WI,120,Home,hold,12/21/2020


In [67]:
# Stack the 2020 summer and winter rows into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
All2020 = pd.concat([summer_2020, winter_2020], ignore_index=True)

In [68]:
# Preview the combined summer + winter 2020 frame
All2020

Unnamed: 0,AvgTempCtrl,AvgCool,AvgHeat,Identifier,Country,ProvinceState,Age_of_Home__years_,Climate,CalendarEvent,Date
0,743.500000,739.625000,690.791667,7736970d2c42b1ba08f14dd51e23c5eda7fc74a2,US,AL,0,Home,auto,06/21/2020
1,741.857143,737.523810,737.523810,76adfc0b71b53cf860f44694f636ec41667dde79,US,AL,0,Home,auto,06/21/2020
2,764.869565,742.739130,684.347826,9ccbfcdbdcd3a68e564b0babfc4d52d1b811f4e1,US,AL,0,Home,auto,06/21/2020
3,802.000000,798.909091,763.000000,059c2332368620c4e331260e53eade3074c31ddd,US,AL,0,Home,auto,06/21/2020
4,764.890173,777.606936,776.653179,a69542da0d49750ffd1bebb47e2a80e61f1b32e7,US,AL,0,Home,auto,06/21/2020
...,...,...,...,...,...,...,...,...,...,...
84170,603.270833,650.000000,610.000000,2a41fbabce7e2de5b4604808f90017db24700f6e,US,VT,120,Home,auto,12/21/2020
84171,687.761111,740.000000,687.150000,1ff406ea8c28b72928f3cf6cac96b99a35bac63f,US,WA,120,Home,auto,12/21/2020
84172,677.553763,680.000000,680.000000,31fff6bad552eba4c1b4550a33e64174aa430ff4,US,WI,120,Home,auto,12/21/2020
84173,693.846154,707.538462,703.076923,75910578db7bd09526fdb3c6b2faf03d04402f83,US,WI,120,Home,hold,12/21/2020


In [69]:
# Write the combined 2020 frame to CSV with a header row and without the index column

All2020.to_csv(path_or_buf="Scraper_Output/2020-US.csv", index=False, header=True)