In [175]:
# for processing time
from datetime import timedelta

# import numpy, pandas
import numpy as np
import pandas as pd

# to connect to and read data from postgres SQL db (inspect is used to list tables)
from sqlalchemy import create_engine, inspect

In [176]:
# setup connection to postgres SQL db
# NOTE(review): the user/password are hardcoded in this string; consider reading them
# from the environment before sharing the notebook.
connection_string = "postgres:postgres@localhost:5432/solar_weather_db"
engine = create_engine(f'postgresql://{connection_string}')

# Confirm tables.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0; the
# inspector API is available in both.
inspect(engine).get_table_names()

['weather', 'solar_ogilvie', 'solar_minneapolis']

In [177]:
# pull each table from the db into its own DataFrame (one full-table query per frame)
weather_df, olg_df, mpls_df = (
    pd.read_sql(query, con=engine)
    for query in (
        'select * from weather',
        'select * from solar_ogilvie',
        'select * from solar_minneapolis',
    )
)

In [178]:
# give the timestamp column the same name, 'date_time', in all three frames
olg_df = olg_df.rename(columns={"og_date_time": "date_time"})
weather_df = weather_df.rename(columns={"weather_date_time": "date_time"})
mpls_df = mpls_df.rename(columns={"mpls_date_time": "date_time"})

In [179]:
# adjust times in mpls data to be on the hour (by adding 15 minutes for first 115 rows)

# work on real datetimes (a no-op when the column is already datetime64)
mpls_times = pd.to_datetime(mpls_df["date_time"])

# rows stamped at HH:45 are the ones that need to be pushed forward to the next hour-mark
at_quarter_to = mpls_times.dt.minute == 45

# shift only the flagged rows by 15 minutes; every other timestamp is kept unchanged
mpls_df["date_time"] = mpls_times.mask(at_quarter_to, mpls_times + timedelta(minutes=15))

mpls_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy
0,2017-05-26 15:00:00,445,111,111
1,2017-05-26 16:00:00,2280,570,681
2,2017-05-26 17:00:00,4186,1047,1728
3,2017-05-26 18:00:00,4283,1071,2799
4,2017-05-26 19:00:00,4043,1011,3809


In [180]:
# convert times from weather from UTC 0 to UTC-6
# NOTE(review): a fixed 6-hour offset does not follow daylight saving time -- confirm
# that the solar timestamps are also recorded at a constant UTC-6.
# NOTE: this overwrites 'date_time' in place, so re-running the cell shifts the
# timestamps by another 6 hours.

# subtract 6 hours from every timestamp in one vectorised step
weather_df['date_time'] = pd.to_datetime(weather_df['date_time']) - timedelta(hours=6)

weather_df.head()

Unnamed: 0,date_time,weather_description,clouds_all,temp_f,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h,weather_main
0,2017-03-31 18:00:00,scattered clouds,40,56.678,1017,30,3,10,0.0,0.0,Clouds
1,2017-03-31 19:00:00,scattered clouds,40,52.25,1018,34,2,350,0.0,0.0,Clouds
2,2017-03-31 20:00:00,sky is clear,1,47.66,1019,56,2,36,0.0,0.0,Clear
3,2017-03-31 21:00:00,sky is clear,1,42.206,1019,49,2,36,0.0,0.0,Clear
4,2017-03-31 22:00:00,sky is clear,1,39.812,1020,60,2,36,0.0,0.0,Clear


In [181]:
# Collapse the 15-minute olg readings into hourly values.
# A row is treated as the on-the-hour reading when its position satisfies
# `position % 4 == 1` (assumes an unbroken 15-minute cadence -- TODO confirm against the data).
row_positions = range(len(olg_df))

# positions of the on-the-hour rows to keep ...
hour_positions = [pos for pos in row_positions if pos % 4 == 1]
# ... and of every other row (timestamps not on the hour-mark), to be dropped later
drop_list = [pos for pos in row_positions if pos % 4 != 1]

# For each on-the-hour row, sum it with the three readings before it.
# NOTE: at position 1 the window start is negative, so the positional slice is empty
# and the sum is 0 -- there is no complete hour available for that first row.
power_hour_summed = [
    sum(olg_df.power_delivered.iloc[pos - 3 : pos + 1]) for pos in hour_positions
]
energy_hour_summed = [
    sum(olg_df.energy_delivered.iloc[pos - 3 : pos + 1]) for pos in hour_positions
]


In [182]:
# Build the hourly olg frame in one chain:
#   1. discard the rows whose timestamps are not on the hour-mark
#   2. swap the 15-min power / energy readings for the hourly sums
#   3. discard the row labelled 1 (first remaining row; hourly sum not available)
#   4. renumber the index from 0
olg_df = (
    olg_df
    .drop(index=drop_list)
    .assign(
        power_delivered=power_hour_summed,
        energy_delivered=energy_hour_summed,
    )
    .drop(index=1)
    .reset_index(drop=True)
)

olg_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy
0,2019-04-21 13:00:00,57444,14361,22937
1,2019-04-21 14:00:00,74580,18645,41582
2,2019-04-21 15:00:00,72024,18006,59588
3,2019-04-21 16:00:00,63552,15888,75476
4,2019-04-21 17:00:00,45996,11499,86975


In [183]:
# join the mpls solar readings to the weather rows on 'date_time';
# an inner join keeps only timestamps present in both frames
mpls_weather_df = mpls_df.merge(
    weather_df,
    on='date_time',
    how='inner',
    suffixes=['m_', 'w_'],
)

mpls_weather_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy,weather_description,clouds_all,temp_f,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h,weather_main
0,2017-05-26 15:00:00,445,111,111,sky is clear,1,76.316,1007,41,4,250,0.0,0.0,Clear
1,2017-05-26 16:00:00,2280,570,681,scattered clouds,40,77.0,1007,38,4,250,0.0,0.0,Clouds
2,2017-05-26 17:00:00,4186,1047,1728,scattered clouds,40,76.874,1006,36,5,250,0.0,0.0,Clouds
3,2017-05-26 18:00:00,4283,1071,2799,sky is clear,1,76.226,1007,36,4,280,0.0,0.0,Clear
4,2017-05-26 19:00:00,4043,1011,3809,sky is clear,1,74.642,1008,40,3,270,0.0,0.0,Clear


In [184]:
# join the olg solar readings to the weather rows on 'date_time';
# an inner join keeps only timestamps present in both frames
olg_weather_df = olg_df.merge(
    weather_df,
    on='date_time',
    how='inner',
    suffixes=['o_', 'w_'],
)

olg_weather_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy,weather_description,clouds_all,temp_f,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h,weather_main
0,2019-04-21 13:00:00,57444,14361,22937,broken clouds,75,71.42,1009,37,4,60,0.0,0.0,Clouds
1,2019-04-21 14:00:00,74580,18645,41582,sky is clear,1,72.644,1008,38,2,110,0.0,0.0,Clear
2,2019-04-21 15:00:00,72024,18006,59588,broken clouds,75,72.572,1009,35,4,40,0.0,0.0,Clouds
3,2019-04-21 16:00:00,63552,15888,75476,broken clouds,75,71.456,1009,40,6,40,0.0,0.0,Clouds
4,2019-04-21 17:00:00,45996,11499,86975,sky is clear,1,68.756,1008,33,5,40,0.0,0.0,Clear


In [185]:
def expand_timestamps(df, ts_column):
    '''Split a timestamp column into hour, day-of-year and month components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the timestamp column.
    ts_column : str
        Name of the column of timestamps (anything pandas.to_datetime accepts).

    Returns
    -------
    tuple of (list, list, list)
        hours, days (day of year) and months, one entry per row, in row order.

    The values are read by row position, so the frame's index does not need to be
    a continuous 0..n-1 range.
    '''

    # the .dt accessor works on the whole column at once and ignores index labels
    timestamps = pd.to_datetime(df[ts_column])

    hours = timestamps.dt.hour.tolist()
    days = timestamps.dt.dayofyear.tolist()
    months = timestamps.dt.month.tolist()

    return hours, days, months

In [186]:
# attach hour / day_of_year / month columns to both merged frames
for merged_df in (mpls_weather_df, olg_weather_df):
    hour_vals, day_vals, month_vals = expand_timestamps(merged_df, 'date_time')
    merged_df['hour'] = hour_vals
    merged_df['day_of_year'] = day_vals
    merged_df['month'] = month_vals

In [187]:
def cyclical_encoding(time_data, time_type):
    '''Encode time elements as sine / cosine coordinates on a unit circle.

    Parameters
    ----------
    time_data : list, tuple, numpy.ndarray or pandas.Series
        Numeric time elements (e.g. hours, days of year, months).
    time_type : str
        One of 'hour', 'day_of_year', 'month'; selects the length of one full cycle.

    Returns
    -------
    tuple of (sin_time, cos_time)
        A pandas.Series input gives two Series with the same index; a list, tuple
        or array input gives two numpy arrays.

    Raises
    ------
    KeyError
        If time_type is not one of the supported names.
    '''

    # length of one full cycle for each time type
    # (with 365, day 366 of a leap year lands on the same point as day 1)
    max_time = {
        'hour': 24,
        'month': 12,
        'day_of_year': 365
    }
    period = max_time[time_type]

    # a plain Python sequence cannot be multiplied by a float, so lift it to an array;
    # Series and arrays pass through untouched so a Series keeps its index
    if isinstance(time_data, (list, tuple)):
        time_data = np.asarray(time_data)

    # angle of each element around the unit circle
    angle = 2 * np.pi * time_data / period

    return np.sin(angle), np.cos(angle)

In [188]:
# add sine / cosine columns for day of year, hour and month to the mpls frame
for sin_col, cos_col, source_col in (
    ('sin_day', 'cos_day', 'day_of_year'),
    ('sin_hour', 'cos_hour', 'hour'),
    ('sin_month', 'cos_month', 'month'),
):
    sin_vals, cos_vals = cyclical_encoding(mpls_weather_df[source_col], source_col)
    mpls_weather_df[sin_col] = sin_vals
    mpls_weather_df[cos_col] = cos_vals

mpls_weather_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy,weather_description,clouds_all,temp_f,pressure,humidity,wind_speed,...,weather_main,hour,day_of_year,month,sin_day,cos_day,sin_hour,cos_hour,sin_month,cos_month
0,2017-05-26 15:00:00,445,111,111,sky is clear,1,76.316,1007,41,4,...,Clear,15,146,5,0.587785,-0.809017,-0.707107,-0.7071068,0.5,-0.866025
1,2017-05-26 16:00:00,2280,570,681,scattered clouds,40,77.0,1007,38,4,...,Clouds,16,146,5,0.587785,-0.809017,-0.866025,-0.5,0.5,-0.866025
2,2017-05-26 17:00:00,4186,1047,1728,scattered clouds,40,76.874,1006,36,5,...,Clouds,17,146,5,0.587785,-0.809017,-0.965926,-0.258819,0.5,-0.866025
3,2017-05-26 18:00:00,4283,1071,2799,sky is clear,1,76.226,1007,36,4,...,Clear,18,146,5,0.587785,-0.809017,-1.0,-1.83697e-16,0.5,-0.866025
4,2017-05-26 19:00:00,4043,1011,3809,sky is clear,1,74.642,1008,40,3,...,Clear,19,146,5,0.587785,-0.809017,-0.965926,0.258819,0.5,-0.866025


In [189]:
# add sine / cosine columns for day of year, hour and month to the olg frame
for sin_col, cos_col, source_col in (
    ('sin_day', 'cos_day', 'day_of_year'),
    ('sin_hour', 'cos_hour', 'hour'),
    ('sin_month', 'cos_month', 'month'),
):
    sin_vals, cos_vals = cyclical_encoding(olg_weather_df[source_col], source_col)
    olg_weather_df[sin_col] = sin_vals
    olg_weather_df[cos_col] = cos_vals

olg_weather_df.head()

Unnamed: 0,date_time,power_delivered,energy_delivered,cumulative_energy,weather_description,clouds_all,temp_f,pressure,humidity,wind_speed,...,weather_main,hour,day_of_year,month,sin_day,cos_day,sin_hour,cos_hour,sin_month,cos_month
0,2019-04-21 13:00:00,57444,14361,22937,broken clouds,75,71.42,1009,37,4,...,Clouds,13,111,4,0.942761,-0.333469,-0.258819,-0.965926,0.866025,-0.5
1,2019-04-21 14:00:00,74580,18645,41582,sky is clear,1,72.644,1008,38,2,...,Clear,14,111,4,0.942761,-0.333469,-0.5,-0.866025,0.866025,-0.5
2,2019-04-21 15:00:00,72024,18006,59588,broken clouds,75,72.572,1009,35,4,...,Clouds,15,111,4,0.942761,-0.333469,-0.707107,-0.707107,0.866025,-0.5
3,2019-04-21 16:00:00,63552,15888,75476,broken clouds,75,71.456,1009,40,6,...,Clouds,16,111,4,0.942761,-0.333469,-0.866025,-0.5,0.866025,-0.5
4,2019-04-21 17:00:00,45996,11499,86975,sky is clear,1,68.756,1008,33,5,...,Clear,17,111,4,0.942761,-0.333469,-0.965926,-0.258819,0.866025,-0.5


In [190]:
# write both merged datasets to csv, leaving out the index column
for merged_frame, csv_path in (
    (mpls_weather_df, 'resources/mpls_solar_weather.csv'),
    (olg_weather_df, 'resources/olg_solar_weather.csv'),
):
    merged_frame.to_csv(csv_path, index=False)

In [90]:
# TESTING CODE BELOW:

In [192]:
# columns on which to run ML model
features = [
    'power_delivered',
    'clouds_all',
    'temp_f',
    'pressure',
    'humidity',
    'wind_speed',
    'wind_deg',
    'rain_1h',
    'snow_1h',
    'weather_main',
    'sin_hour',
    'cos_hour',
    'sin_day',
    'cos_day',
]

# independent copy of just those columns from the merged mpls frame
mpls = mpls_weather_df.loc[:, features].copy()
mpls

Unnamed: 0,power_delivered,clouds_all,temp_f,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h,weather_main,sin_hour,cos_hour,sin_day,cos_day
0,445,1,76.316,1007,41,4,250,0.0,0.0,Clear,-7.071068e-01,-7.071068e-01,0.587785,-0.809017
1,2280,40,77.000,1007,38,4,250,0.0,0.0,Clouds,-8.660254e-01,-5.000000e-01,0.587785,-0.809017
2,4186,40,76.874,1006,36,5,250,0.0,0.0,Clouds,-9.659258e-01,-2.588190e-01,0.587785,-0.809017
3,4283,1,76.226,1007,36,4,280,0.0,0.0,Clear,-1.000000e+00,-1.836970e-16,0.587785,-0.809017
4,4043,1,74.642,1008,40,3,270,0.0,0.0,Clear,-9.659258e-01,2.588190e-01,0.587785,-0.809017
5,2917,1,71.492,1008,42,3,260,0.0,0.0,Clear,-8.660254e-01,5.000000e-01,0.587785,-0.809017
6,1216,1,66.992,1008,49,2,270,0.0,0.0,Clear,-7.071068e-01,7.071068e-01,0.587785,-0.809017
7,1863,1,62.960,1009,59,1,280,0.0,0.0,Clear,-5.000000e-01,8.660254e-01,0.587785,-0.809017
8,855,1,61.376,1010,59,3,280,0.0,0.0,Clear,-2.588190e-01,9.659258e-01,0.587785,-0.809017
9,236,1,58.442,1011,59,1,260,0.0,0.0,Clear,0.000000e+00,1.000000e+00,0.573772,-0.819015
