In [11]:
# do not upload this cell!
# Both files sit in the same local folder, so the folder is spelled out only once.
_ORIGINAL_DATA_DIR = '/Users/hkromer/01_Projects/10.SolarAnlage/01.Analytics/solarAnalytics/2020-01-14.ETL/01.Original_data'

# file whose first line is read as the DarkSky API key
API_PATH = f'{_ORIGINAL_DATA_DIR}/DarkSkyAPI.pw'
# file whose first line is read as the whitespace-separated location
LOCATION_PATH = f'{_ORIGINAL_DATA_DIR}/location.pw'

# ETL of DarkSky Data

This project covers the extraction of weather data with Python 3 using the DarkSky API (https://darksky.net/dev). The data is to be used for forecasting photovoltaic energy production. Signing up for DarkSky is free, and so is usage as long as one stays below 1000 API calls per 24 hours. 

The pipeline is laid out as follows: the DarkSky API is called and returns hourly historical data for each requested day. The API responses are stored locally as JSON files and are then processed into one large CSV file containing all the data from the API. Not all of the data provided by DarkSky is necessarily relevant for the final purpose of this dataset, the forecasting of photovoltaic energy production. 

## Import

In [14]:
import datetime
import pandas as pd
import requests
import re
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [86]:
# NOTE: the DarkSkyETL class has to be defined in the kernel before this cell is run.

# first line of the location file: latitude and longitude separated by whitespace
with open(LOCATION_PATH, 'r') as location_file:
    LOCATION = location_file.readline().split()
# lower-case name kept so that code relying on `location` still finds it
location = LOCATION

# first line of the key file: the DarkSky API key
with open(API_PATH, 'r') as api_file:
    DarkSkyAPI = api_file.readline().strip()

# single day used to try out the API call
date_str = '2018-08-11'

# argument order follows DarkSkyETL.__init__(DarkSkyAPI, location, save_path)
ds = DarkSkyETL(DarkSkyAPI, LOCATION, '../01.Original_data/')

a = ds.call_API(date_str, silent=False)

{'latitude': 47.96194, 'longitude': 7.951151, 'timezone': 'Europe/Berlin', 'flags': {'sources': ['cmc', 'gfs', 'icon', 'isd', 'madis'], 'nearest-station': 4.169, 'units': 'si'}, 'offset': 2}


KeyError: 'hourly'

24

In [None]:
# Inclusive range of days to preview.
startDate = '2017-10-01'
endDate = '2019-06-29'

# Parse both ends of the range and measure the span between them.
date_format = '%Y-%m-%d'
d_from_date = datetime.datetime.strptime(startDate, date_format)
d_to_date = datetime.datetime.strptime(endDate, date_format)
delta = d_to_date - d_from_date

# Walk through the range one day at a time; the end date is included.
for i in range(delta.days + 1):

    # mid-day of the i-th day as a plain datetime.datetime
    new_date = d_from_date + datetime.timedelta(days=i, hours=12)

    # UNIX time; the naive datetime is interpreted in the machine's local time zone
    unix_time = int(new_date.timestamp())

    print(unix_time)
    print(new_date)

In [None]:
# Download every day from start_date to end_date (both included): 835 API calls in total.
# Keep the DarkSky limit of 1000 free calls per 24 hours in mind before re-running this cell.
start_date = '2017-10-01'
end_date = '2020-01-13'
save_path = '../01.Original_data/'
silent = False

# keywords make the order explicit: DarkSkyETL.__init__(DarkSkyAPI, location, save_path)
ds = DarkSkyETL(DarkSkyAPI=DarkSkyAPI, location=LOCATION, save_path=save_path)

ds.get_date_range(start_date, end_date, silent)

In [85]:
class DarkSkyETL():
    """
    Requests hourly weather data from the DarkSky API for one location and stores the json
    response of every requested day as a file <save_path>/YYYY-MM-DD.json.
    """

    def __init__(self, DarkSkyAPI, location, save_path):
        """
        INPUT:
            - DarkSkyAPI: STRING, key for the DarkSky API
            - location: sequence with two entries, latitude first and longitude second
            - save_path: STRING, directory where the json files are stored
        """
        # Cells in this notebook create the object as DarkSkyETL(LOCATION, DarkSkyAPI, ...), i.e.
        # with the first two arguments swapped. The key is a string and the location is a
        # sequence, so the swapped order is detected here and put right.
        if isinstance(location, str) and not isinstance(DarkSkyAPI, str):
            DarkSkyAPI, location = location, DarkSkyAPI

        self.API = DarkSkyAPI
        self.LOCATION = location
        self.SAVE_PATH = save_path # where to store output json files. will be stored as save_path/YYYY-MM-DD.json

    def call_API(self, date_str, silent=False):
        """
        Calls the DarkSky API for the location of this object at the specified date, asks for
        hourly data, writes the response to <save_path>/<date_str>.json and returns it.
        INPUT:
            - date_str: STRING, date in the format YYYY-MM-DD
            - silent: BOOLEAN, if False the number of hourly datapoints is printed, or a notice if
              the response contains no hourly data
        OUTPUT:
            The decoded json response of the API
        """
        # option list for API
        option_list = "exclude=currently,minutely,alerts&units=si"

        # convert to a timestamp at mid-day
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d') + datetime.timedelta(hours=12)

        # convert the date to UNIX time (the naive datetime is interpreted in the local time zone)
        unix_time = int(date.timestamp())

        # call API with the key and location stored on this object, not with notebook globals
        latitude = self.LOCATION[0]
        longitude = self.LOCATION[1]
        url = f"https://api.darksky.net/forecast/{self.API}/{latitude},{longitude},{unix_time}?{option_list}"
        # the timeout keeps a stalled connection from blocking a long download loop forever
        response = requests.get(url, timeout=30)
        json_res = response.json()

        # write to file; the with block closes the file
        with open(os.path.join(self.SAVE_PATH, f'{date_str}.json'), 'w') as outfile:
            json.dump(json_res, outfile)

        if not silent:
            # some dates have no data
            if 'hourly' in json_res:
                datapoints = len(json_res['hourly']['data'])
                print(f"Loaded {datapoints} datapoints for {date_str}")
            else:
                print(f"hourly not in keys for {date_str}")

        return json_res

    def get_date_range(self, start_date, end_date, silent=False):
        """
        Calls the API for every day between start_date and end_date (both included) and saves the
        json response of each day at the location save_path.
        INPUT:
            - start_date: STRING, first date in the format YYYY-MM-DD
            - end_date: STRING, last date in the format YYYY-MM-DD
            - silent: BOOLEAN, False, if the number of datapoints on each date should be printed out
        OUTPUT:
            None
        """

        # convert to a time delta
        d_from_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        d_to_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        delta = d_to_date - d_from_date

        # for every day in the date range, get the data
        for i in range(delta.days + 1):
            date = str((d_from_date + datetime.timedelta(days=i)).date())
            print(date)
            self.call_API(date, silent)