# Exploratory data analysis of DarkSky dataset.

In [56]:
import pandas as pd
import os
import json

In [57]:
# Relative path of the folder where the DarkSky json data is located.
# Keep the trailing slash: file names are appended directly to this string.
DATAFOLDER = '../01.Original_data/DarkSky/'

## Load all the json data into one dataframe

In [58]:
# Daily json files found in the data folder.
# Sorted because os.listdir returns entries in arbitrary order.
files = sorted(f'{DATAFOLDER}{item}' for item in os.listdir(DATAFOLDER) if item.endswith('json'))
n_files = len(files)

# One small frame per file, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call).
daily_frames = []
# Files without a 'daily' key are counted explicitly instead of being inferred
# from the row count, which is only correct if every file yields exactly one row.
n_missing = 0

for file in files:
    # The with-block closes the file; read as UTF-8 instead of the platform default
    with open(file, encoding='utf-8') as f:
        d = json.load(f)
    if 'daily' in d:
        daily_frames.append(pd.DataFrame(d['daily']['data']))
    else:
        n_missing += 1

# Dataframe holding all the DarkSky data.
# sort=True keeps the alphabetical column order seen in the recorded outputs;
# the per-file index is kept, as the row-wise append did.
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(daily_frames, sort=True) if daily_frames else pd.DataFrame()

n_imported = len(df)

print(f"Imported {n_imported} datapoints. {n_missing} had no daily entry.")

Imported 922 datapoints. 7 had no daily entry.


In [59]:
# Only 7 entries are missing; let's see if we can identify them based on the times.
# df.info() summarises the columns of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 922 entries, 0 to 0
Data columns (total 45 columns):
apparentTemperatureHigh        922 non-null float64
apparentTemperatureHighTime    922 non-null int64
apparentTemperatureLow         839 non-null float64
apparentTemperatureLowTime     839 non-null float64
apparentTemperatureMax         917 non-null float64
apparentTemperatureMaxTime     917 non-null float64
apparentTemperatureMin         917 non-null float64
apparentTemperatureMinTime     917 non-null float64
cloudCover                     740 non-null float64
dewPoint                       917 non-null float64
humidity                       917 non-null float64
icon                           784 non-null object
moonPhase                      922 non-null float64
ozone                          607 non-null float64
precipAccumulation             57 non-null float64
precipIntensity                839 non-null float64
precipIntensityMax             839 non-null float64
precipIntensityMa

## Convert times into timestamps

Times returned by DarkSky are UNIX timestamps. This is a problem for direct conversion, because the pandas Series needs to be timezone-aware. We can deal with this by adding the timezone information (UTC) and then converting to the respective timezone (Europe/Zurich) for all the timestamp columns.

In [61]:
# Every column whose name contains "Time" or "time" holds a UNIX timestamp
cols_time = [name for name in df.columns if any(tag in name for tag in ('Time', 'time'))]

# Seconds since epoch -> naive datetime -> tagged as UTC -> shifted to local Zurich time
for col in cols_time:
    df[col] = (
        pd.to_datetime(df[col], unit='s')
        .dt.tz_localize('utc')
        .dt.tz_convert('Europe/Zurich')
    )

# Chronological order, keyed on the `time` column
df = df.sort_values('time')

In [62]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,cloudCover,dewPoint,...,temperatureMinError,temperatureMinTime,time,uvIndex,uvIndexTime,visibility,windBearing,windGust,windGustTime,windSpeed
0,15.92,2017-10-01 16:13:00+02:00,9.71,2017-10-01 23:42:00+02:00,15.92,2017-10-01 16:13:00+02:00,8.87,2017-10-01 08:20:00+02:00,,9.98,...,,2017-10-01 08:20:00+02:00,2017-10-01 00:00:00+02:00,4.0,2017-10-01 13:45:00+02:00,9.186,247.0,4.61,2017-10-01 13:17:00+02:00,0.37
0,17.15,2017-10-02 16:46:00+02:00,12.7,2017-10-03 08:00:00+02:00,17.15,2017-10-02 16:46:00+02:00,9.74,2017-10-02 00:00:00+02:00,0.81,11.77,...,,2017-10-02 00:00:00+02:00,2017-10-02 00:00:00+02:00,3.0,2017-10-02 13:17:00+02:00,9.992,229.0,8.87,2017-10-02 21:57:00+02:00,0.61
0,16.14,2017-10-03 17:00:00+02:00,5.25,2017-10-04 04:29:00+02:00,17.67,2017-10-03 04:39:00+02:00,7.78,2017-10-03 23:00:00+02:00,0.76,11.01,...,,2017-10-03 23:00:00+02:00,2017-10-03 00:00:00+02:00,4.0,2017-10-03 13:00:00+02:00,9.858,269.0,10.01,2017-10-03 05:06:00+02:00,1.0
0,15.47,2017-10-04 17:24:00+02:00,4.75,2017-10-05 03:25:00+02:00,15.47,2017-10-04 17:24:00+02:00,5.25,2017-10-04 04:29:00+02:00,,6.59,...,,2017-10-04 04:29:00+02:00,2017-10-04 00:00:00+02:00,0.0,2017-10-04 00:00:00+02:00,9.991,230.0,4.62,2017-10-04 15:58:00+02:00,0.24
0,19.81,2017-10-05 15:09:00+02:00,5.68,2017-10-06 07:52:00+02:00,19.81,2017-10-05 15:09:00+02:00,4.75,2017-10-05 03:25:00+02:00,,6.29,...,,2017-10-05 03:25:00+02:00,2017-10-05 00:00:00+02:00,1.0,2017-10-05 16:00:00+02:00,9.837,233.0,9.64,2017-10-05 15:03:00+02:00,0.94


In [63]:
# Write the frame to a CSV file in the prepared-data folder
fname = 'data_time_converted.csv'
out_path = f"../02.Prepared_data/DarkSky/{fname}"
df.to_csv(out_path)

## Choose columns to use for analysis

For our goal, the prediction of solar production output, we consider data on temperature, sun, and precipitation relevant. The time information is too coarse: sometimes the maxima are reported during the night hours, or on the next/previous day, which might give a wrong picture. We should drop the time information for now and maybe revisit it at a later stage.

In [100]:
# Column names currently in the frame, as a plain list
list(df.columns)

['apparentTemperatureHigh',
 'apparentTemperatureHighTime',
 'apparentTemperatureLow',
 'apparentTemperatureLowTime',
 'apparentTemperatureMax',
 'apparentTemperatureMaxTime',
 'apparentTemperatureMin',
 'apparentTemperatureMinTime',
 'cloudCover',
 'precipAccumulation',
 'precipIntensity',
 'precipIntensityMax',
 'precipProbability',
 'precipType',
 'sunriseTime',
 'sunsetTime',
 'temperatureHigh',
 'temperatureHighTime',
 'temperatureLow',
 'temperatureLowTime',
 'temperatureMax',
 'temperatureMaxTime',
 'temperatureMin',
 'temperatureMinTime',
 'time',
 'uvIndex']

In [101]:
# Columns kept for the analysis; the per-value "...Time" columns are left out,
# only sunrise/sunset and the daily `time` stamp remain.
cols = [
    # apparent temperature
    'apparentTemperatureHigh',
    'apparentTemperatureLow',
    'apparentTemperatureMax',
    'apparentTemperatureMin',
    # cloud cover and precipitation
    'cloudCover',
    'precipAccumulation',
    'precipIntensity',
    'precipIntensityMax',
    'precipProbability',
    'precipType',
    # sunrise / sunset
    'sunriseTime',
    'sunsetTime',
    # temperature
    'temperatureHigh',
    'temperatureLow',
    'temperatureMax',
    'temperatureMin',
    # day stamp and UV index
    'time',
    'uvIndex',
]

# Restrict the frame to the selected columns, in the order listed above
df = df.loc[:, cols]

`apparentTemperatureMax` is the maximum temperature of that day, while `apparentTemperatureHigh` is the maximum temperature during the **daytime** of that day. The same holds for the minimum. However, if we look at the 'times' of these temperatures, we see that this is inconsistent for the minima: in some cases the time falls on the previous or the next day.

In [103]:
# Show only the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,apparentTemperatureHigh,apparentTemperatureLow,apparentTemperatureMax,apparentTemperatureMin,cloudCover,precipAccumulation,precipIntensity,precipIntensityMax,precipProbability,precipType,sunriseTime,sunsetTime,temperatureHigh,temperatureLow,temperatureMax,temperatureMin,time,uvIndex
0,15.92,9.71,15.92,8.87,,,0.0131,0.1962,0.76,rain,2017-10-01 07:29:00+02:00,2017-10-01 19:09:00+02:00,16.20,9.44,16.20,8.60,2017-10-01 00:00:00+02:00,4.0
0,17.15,12.70,17.15,9.74,0.81,,0.0280,0.2159,0.81,rain,2017-10-02 07:30:00+02:00,2017-10-02 19:07:00+02:00,17.43,12.43,17.43,9.47,2017-10-02 00:00:00+02:00,3.0
0,16.14,5.25,17.67,7.78,0.76,,1.0012,9.2915,0.99,rain,2017-10-03 07:32:00+02:00,2017-10-03 19:05:00+02:00,16.42,4.98,17.89,7.51,2017-10-03 00:00:00+02:00,4.0
0,15.47,4.75,15.47,5.25,,,0.0075,0.1589,0.62,rain,2017-10-04 07:33:00+02:00,2017-10-04 19:03:00+02:00,15.75,4.48,15.75,4.98,2017-10-04 00:00:00+02:00,0.0
0,19.81,5.68,19.81,4.75,,,0.3074,3.2280,0.99,rain,2017-10-05 07:34:00+02:00,2017-10-05 19:01:00+02:00,20.09,5.41,20.09,4.48,2017-10-05 00:00:00+02:00,1.0
0,11.35,5.64,11.35,5.68,0.77,,0.1512,2.0875,0.99,rain,2017-10-06 07:36:00+02:00,2017-10-06 18:59:00+02:00,11.63,5.37,11.63,5.41,2017-10-06 00:00:00+02:00,3.0
0,13.94,8.22,13.94,5.64,,,0.0061,0.1859,0.76,rain,2017-10-07 07:37:00+02:00,2017-10-07 18:57:00+02:00,14.22,7.95,14.22,5.37,2017-10-07 00:00:00+02:00,0.0
0,12.81,8.80,12.81,8.22,0.88,,0.3646,2.1444,0.99,rain,2017-10-08 07:39:00+02:00,2017-10-08 18:55:00+02:00,13.09,8.53,13.09,7.95,2017-10-08 00:00:00+02:00,3.0
0,13.94,5.67,13.94,8.25,0.82,,0.0607,0.3676,0.89,rain,2017-10-09 07:40:00+02:00,2017-10-09 18:53:00+02:00,14.22,5.40,14.22,7.98,2017-10-09 00:00:00+02:00,3.0
0,15.63,8.14,15.63,5.67,0.82,,0.0076,0.0891,0.62,rain,2017-10-10 07:42:00+02:00,2017-10-10 18:51:00+02:00,15.91,7.87,15.91,5.40,2017-10-10 00:00:00+02:00,3.0


In [95]:
# Days on which the daily maximum differs from the daytime high.
# The "...Time" columns are removed by the column selection (`df = df[cols]`) earlier
# in the notebook, so requesting them unconditionally raises a KeyError when the
# notebook is run top to bottom. Only the columns still present are selected;
# run this cell before the column selection to see the time columns as well.
compare_cols = ['time', 'apparentTemperatureMax', 'apparentTemperatureHigh',
                'apparentTemperatureHighTime', 'apparentTemperatureMaxTime']
df.loc[df['apparentTemperatureMax'] != df['apparentTemperatureHigh'],
       [c for c in compare_cols if c in df.columns]]

Unnamed: 0,time,apparentTemperatureMax,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureMaxTime
0,2017-10-03 00:00:00+02:00,17.67,16.14,2017-10-03 17:00:00+02:00,2017-10-03 04:39:00+02:00
0,2017-10-22 00:00:00+02:00,11.47,11.17,2017-10-22 12:08:00+02:00,2017-10-22 03:00:00+02:00
0,2017-10-27 00:00:00+02:00,13.74,13.66,2017-10-27 07:00:00+02:00,2017-10-27 06:32:00+02:00
0,2017-11-05 00:00:00+01:00,12.34,10.79,2017-11-05 08:41:00+01:00,2017-11-05 02:07:00+01:00
0,2017-11-07 00:00:00+01:00,5.04,4.82,2017-11-07 19:00:00+01:00,2017-11-07 23:00:00+01:00
0,2017-11-10 00:00:00+01:00,6.59,5.39,2017-11-10 18:04:00+01:00,2017-11-10 22:07:00+01:00
0,2017-11-11 00:00:00+01:00,9.32,7.33,2017-11-11 18:09:00+01:00,2017-11-11 23:00:00+01:00
0,2017-11-20 00:00:00+01:00,6.91,5.28,2017-11-20 18:00:00+01:00,2017-11-20 22:59:00+01:00
0,2017-11-25 00:00:00+01:00,10.50,6.95,2017-11-25 07:00:00+01:00,2017-11-25 00:01:00+01:00
0,2017-11-28 00:00:00+01:00,5.45,4.09,2017-11-28 11:09:00+01:00,2017-11-28 02:04:00+01:00


In [99]:
# Max/High comparison columns for the five most recent days.
# The "...Time" columns are removed by the column selection (`df = df[cols]`) earlier
# in the notebook, so requesting them unconditionally raises a KeyError when the
# notebook is run top to bottom. Only the columns still present are selected.
compare_cols = ['time', 'apparentTemperatureMax', 'apparentTemperatureHigh',
                'apparentTemperatureHighTime', 'apparentTemperatureMaxTime']
df.loc[:, [c for c in compare_cols if c in df.columns]].iloc[-5:]

Unnamed: 0,time,apparentTemperatureMax,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureMaxTime
0,2020-04-12 00:00:00+02:00,22.79,22.79,2020-04-12 15:47:00+02:00,2020-04-12 15:47:00+02:00
0,2020-04-13 00:00:00+02:00,19.21,19.21,2020-04-13 13:00:00+02:00,2020-04-13 13:00:00+02:00
0,2020-04-14 00:00:00+02:00,13.89,13.89,2020-04-14 15:50:00+02:00,2020-04-14 15:50:00+02:00
0,2020-04-15 00:00:00+02:00,20.09,20.09,2020-04-15 15:54:00+02:00,2020-04-15 15:54:00+02:00
0,2020-04-16 00:00:00+02:00,20.88,20.88,2020-04-16 16:34:00+02:00,2020-04-16 16:34:00+02:00
