In [94]:
# dependencies
from datetime import timedelta

import pandas as pd
from sqlalchemy import create_engine, inspect

In [95]:
# Set up the connection to the local PostgreSQL database.
# NOTE(review): the user name and password are hard-coded in this string;
# consider reading them from an environment variable or an untracked config
# file before sharing the notebook.
connection_string = "postgres:postgres@localhost:5432/solar_weather_db"
engine = create_engine(f'postgresql://{connection_string}')

# Confirm the expected tables exist.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and works on old and new versions.
inspect(engine).get_table_names()

['weather', 'solar_ogilvie', 'solar_minneapolis']

In [96]:
# Read each database table into its own DataFrame (weather first, then the
# Ogilvie and Minneapolis solar tables), using the engine created above.
weather_df, olg_df, mpls_df = (
    pd.read_sql(f'select * from {table_name}', con=engine)
    for table_name in ('weather', 'solar_ogilvie', 'solar_minneapolis')
)

In [97]:
# Give the timestamp column the same name ('date_time') in all three frames.
# rename() returns a new frame, so each name is rebound instead of mutating in place.
olg_df = olg_df.rename(columns={"og_date_time": "date_time"})
weather_df = weather_df.rename(columns={"weather_date_time": "date_time"})
mpls_df = mpls_df.rename(columns={"mpls_date_time": "date_time"})

In [98]:
# Snap Minneapolis timestamps that fall at :45 past the hour forward to the
# next hour mark (the original note says this affects the first 115 rows; the
# mask below catches such rows wherever they appear).

# parse once so the .dt accessor is available even if the column arrived as objects
mpls_stamps = pd.to_datetime(mpls_df["date_time"])

# True for rows stamped 15 minutes before the hour
quarter_to_hour = mpls_stamps.dt.minute == 45

# add 15 minutes where flagged; every other timestamp is kept as-is
mpls_df["date_time"] = mpls_stamps.mask(
    quarter_to_hour, mpls_stamps + timedelta(minutes=15)
)

mpls_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy
0,2017-05-26 15:00:00,445,111,111
1,2017-05-26 16:00:00,2280,570,681
2,2017-05-26 17:00:00,4186,1047,1728
3,2017-05-26 18:00:00,4283,1071,2799
4,2017-05-26 19:00:00,4043,1011,3809


In [99]:
# Convert weather timestamps from UTC to UTC-6 by subtracting a fixed six hours.
# NOTE(review): a constant offset does not follow daylight-saving changes;
# confirm the solar timestamps are also recorded at a fixed UTC-6.
# NOTE: this overwrites the column, so re-running the cell without reloading
# weather_df shifts the timestamps by another six hours.
weather_df['date_time'] = pd.to_datetime(weather_df['date_time']) - timedelta(hours=6)

weather_df.head()

Unnamed: 0,date_time,weather_description,clouds_all,temp_f,rain_1h,snow_1h
0,2017-03-31 18:00:00,scattered clouds,40,56.678,0,0
1,2017-03-31 19:00:00,scattered clouds,40,52.25,0,0
2,2017-03-31 20:00:00,sky is clear,1,47.66,0,0
3,2017-03-31 21:00:00,sky is clear,1,42.206,0,0
4,2017-03-31 22:00:00,sky is clear,1,39.812,0,0


In [100]:
# Collapse the 15-minute Ogilvie readings into hourly totals.
# Assumes olg_df still has its default 0..n-1 index and exactly four consecutive
# rows per hour, with the on-the-hour reading at positions 1, 5, 9, ...
# TODO confirm there are no gaps or duplicates in the 15-minute feed.
# NOTE(review): power_delivered is summed like energy_delivered, which gives four
# times the mean power for the hour; confirm a sum (not a mean) is intended.

# hourly sums of power_delivered and energy_delivered, one entry per hour-mark row
power_hour_summed = []
energy_hour_summed = []
# index positions of rows to drop (timestamps not on the hour mark)
drop_list = []

for i in range(len(olg_df)):

    # check remainder of the row position divided by 4
    if i % 4 == 1:   # if == 1, then it's the row with time on the hour mark

        # the hour-mark row plus the three rows before it; the start is clamped
        # at 0 so the first (incomplete) window cannot wrap to the end of the frame
        window = slice(max(i - 3, 0), i + 1)

        power_hour_summed.append(sum(olg_df.power_delivered.iloc[window]))
        energy_hour_summed.append(sum(olg_df.energy_delivered.iloc[window]))

    else:
        # not on the hour mark: remember the row so it can be dropped later
        drop_list.append(i)


In [101]:
# Reduce olg_df to one row per hour in a single chain:
#   1. discard every row whose timestamp is not on the hour mark,
#   2. overwrite the 15-minute power/energy readings with the hourly sums,
#   3. discard the row labelled 1 (the first hour mark, whose hourly sum is not available),
#   4. renumber the remaining rows from 0.
olg_df = (
    olg_df
    .drop(index=drop_list)
    .assign(
        power_delivered=power_hour_summed,
        energy_delivered=energy_hour_summed,
    )
    .drop(index=1)
    .reset_index(drop=True)
)

olg_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy
0,2019-04-21 13:00:00,57444,14361,22937
1,2019-04-21 14:00:00,74580,18645,41582
2,2019-04-21 15:00:00,72024,18006,59588
3,2019-04-21 16:00:00,63552,15888,75476
4,2019-04-21 17:00:00,45996,11499,86975


In [102]:
# Join the Minneapolis solar readings to the weather observations on the
# shared timestamp, keeping only timestamps present in both frames.
mpls_weather_df = mpls_df.merge(
    weather_df,
    on='date_time',
    how='inner',
    suffixes=['m_', 'w_'],
)

mpls_weather_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy,weather_description,clouds_all,temp_f,rain_1h,snow_1h
0,2017-05-26 15:00:00,445,111,111,sky is clear,1,76.316,0,0
1,2017-05-26 16:00:00,2280,570,681,scattered clouds,40,77.0,0,0
2,2017-05-26 17:00:00,4186,1047,1728,scattered clouds,40,76.874,0,0
3,2017-05-26 18:00:00,4283,1071,2799,sky is clear,1,76.226,0,0
4,2017-05-26 19:00:00,4043,1011,3809,sky is clear,1,74.642,0,0


In [103]:
# Join the hourly Ogilvie solar readings to the weather observations on the
# shared timestamp, keeping only timestamps present in both frames.
olg_weather_df = olg_df.merge(
    weather_df,
    on='date_time',
    how='inner',
    suffixes=['o_', 'w_'],
)

olg_weather_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy,weather_description,clouds_all,temp_f,rain_1h,snow_1h
0,2019-04-21 13:00:00,57444,14361,22937,broken clouds,75,71.42,0,0
1,2019-04-21 14:00:00,74580,18645,41582,sky is clear,1,72.644,0,0
2,2019-04-21 15:00:00,72024,18006,59588,broken clouds,75,72.572,0,0
3,2019-04-21 16:00:00,63552,15888,75476,broken clouds,75,71.456,0,0
4,2019-04-21 17:00:00,45996,11499,86975,sky is clear,1,68.756,0,0


In [104]:
# non-null value count per column of the merged Ogilvie/weather frame
olg_weather_df.count()

date_time              2582
power_delivered        2582
energy_delivered       2582
cumulative_energy      2582
weather_description    2582
clouds_all             2582
temp_f                 2582
rain_1h                2582
snow_1h                2582
dtype: int64

In [105]:
# non-null value count per column of the merged Minneapolis/weather frame
mpls_weather_df.count()

date_time              19859
power_delivered        19859
energy_delivered       19859
cumulative_energy      19859
weather_description    19859
clouds_all             19859
temp_f                 19859
rain_1h                19859
snow_1h                19859
dtype: int64

In [86]:
# Write both merged datasets to CSV under resources/, leaving out the index column.
for merged_frame, csv_path in (
    (mpls_weather_df, 'resources/mpls_solar_weather.csv'),
    (olg_weather_df, 'resources/olg_solar_weather.csv'),
):
    merged_frame.to_csv(csv_path, index=False)