## Data Collection Code

In [7]:
import urllib3
import csv
import json
import pandas as pd
import datetime
import requests
requests.packages.urllib3.disable_warnings()

#### getWeather()
Takes set of locations, date and time as input and returns weather info for each point in time.

In [8]:
API_KEY = '6FF9G8N2T2NZCSR2C2WPZ42QF'
endpoint = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history'
desired_features = ['temp','visibility','wspd','cloudcover','precip','sealevelpressure','dew','wgust','humidity']

def getWeather(latitudes,longitudes,datetime):
    query_params = '&contentType=json&unitGroup=uk&aggregateMinutes=15&key={}&startDateTime={}&endDateTime={}&locations={},{}'
    query_params=query_params.format(API_KEY, datetime.isoformat(), datetime.isoformat(),latitudes,longitudes)
    # http = urllib3.PoolManager()
    try:
        r = requests.get(endpoint + "?" + query_params,verify = False)
        weatherInfo = r.json()
        # print(weatherInfo)
    except Exception:
        weatherInfo = []
        print('Problem loading weather data for query params' + query_params +"\n"+ Exception)
    if weatherInfo:
        if 'errorCode' in weatherInfo:
            print("Error")
        else:
            for locationid in weatherInfo['locations']: 
                for value in weatherInfo['locations'][locationid]['values']:
                    result = {key: value[key] for key in desired_features}
            return result

    else:
        print('error')

getWeather(55.61055739940275,-4.015384591727852,datetime.datetime(2019,10,12,12,0,0))


{'columns': {'wdir': {'id': 'wdir', 'name': 'Wind Direction', 'type': 2, 'unit': None}, 'latitude': {'id': 'latitude', 'name': 'Latitude', 'type': 2, 'unit': None}, 'cloudcover': {'id': 'cloudcover', 'name': 'Cloud Cover', 'type': 2, 'unit': '%'}, 'mint': {'id': 'mint', 'name': 'Minimum Temperature', 'type': 2, 'unit': 'degC'}, 'datetime': {'id': 'datetime', 'name': 'Date time', 'type': 3, 'unit': None}, 'precip': {'id': 'precip', 'name': 'Precipitation', 'type': 2, 'unit': 'mm'}, 'solarradiation': {'id': 'solarradiation', 'name': 'Solar Radiation', 'type': 2, 'unit': 'W/m^2'}, 'dew': {'id': 'dew', 'name': 'Dew Point', 'type': 2, 'unit': 'degC'}, 'humidity': {'id': 'humidity', 'name': 'Relative Humidity', 'type': 2, 'unit': '%'}, 'precipcover': {'id': 'precipcover', 'name': 'Precipitation Cover', 'type': 2, 'unit': '%'}, 'longitude': {'id': 'longitude', 'name': 'Longitude', 'type': 2, 'unit': None}, 'info': {'id': 'info', 'name': 'Info', 'type': 1, 'unit': None}, 'temp': {'id': 'temp',

{'temp': 11.3,
 'visibility': 36.5,
 'wspd': 9.5,
 'cloudcover': 55.2,
 'precip': 0.0,
 'sealevelpressure': 1002.2,
 'dew': 8.6,
 'wgust': 6.9,
 'humidity': 83.34}

#### getOutput()
Takes generator ID, date and time as input and returns the actual amount generated

In [None]:
endpoint = 'https://api.bmreports.com/BMRS/B1610/V2'
API_KEY_GEN = '90nkfffj5r9qtbh'

def getOutput(id,date):
    dateStart = date.strftime("%Y-%m-%d")
    dateEnd = (date + datetime.timedelta(minutes=40)).strftime("%Y-%m-%d %H:%M")
    query_params = 'APIKey={}&SettlementDate={}&Period={}&NGCBMUnitID={}&ServiceType=csv'
    query_params=query_params.format(API_KEY_GEN, dateStart,25,id)
    # try:
    response = urllib3.request.urlopen(endpoint + "?" + query_params)
    lines = [l.decode('utf-8') for l in response.readlines()]
    return(lines[-1].split(',')[-1])


#### Building dataset
Loops across locations dataset, for each location, collecting weather & generation data for 5 different dates and times spread across a year with a mix of times of day.
Aggregates all into one dataset, with attributes being weather, and the targets <i>y</i> being (actual generation)/(generation capacity)

In [6]:
dateTimes = [datetime.datetime(2019,10,12,12,0,0)]

locations = pd.read_csv('locations.csv')
dataRows = []


for location in locations.iterrows():
    for time in dateTimes:
        dataPoint = getWeather(location[1]['latitude'],location[1]['longitude'],time)
        # dataPoint['loadFactor'] = getOutput(location[1]['BMUID'],time)/location[1]['capacity']
        dataRows.append(dataPoint)
        pass

# dataset attributes: locationID, temp,visibility,wspd,cloudcover,precip,sealevelpressue,dew,humidity,wgust     | loadfactor 0-1
dataset = pd.DataFrame(dataRows,columns=['BMUID','temp','wspd','wgust','sealevelpressure','visibility','cloudcover','precip','dew','humidity'])

print(dataset)

dataset.to_csv('dataset.csv')






    BMUID  temp  wspd  wgust  sealevelpressue  visibility  cloudcover  precip  \
0     NaN  11.3   9.5    6.9              NaN        36.5        55.2     0.0   
1     NaN  10.5  11.1   12.1              NaN        18.3        80.6     0.0   
2     NaN  11.3   7.7    9.0              NaN        19.4        55.0     0.0   
3     NaN  11.5  13.1    NaN              NaN        26.4        40.7     0.0   
4     NaN  12.2  10.9   12.5              NaN        17.6        39.3     0.0   
..    ...   ...   ...    ...              ...         ...         ...     ...   
94    NaN  11.6  13.1    NaN              NaN        25.6        39.6     0.0   
95    NaN  11.3  13.3    NaN              NaN        28.9        43.6     0.0   
96    NaN  12.4   9.5   14.1              NaN        26.5        56.3     0.0   
97    NaN  11.8  11.9    NaN              NaN        27.6        52.5     0.0   
98    NaN  11.7  13.0    NaN              NaN        24.3        37.8     0.0   

    dew  humidity  
0   8.6