# Exploratory data analysis of DarkSky dataset.

In [145]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [146]:
# Folder where the DarkSky json data is located
DATAFOLDER = '../01.Original_data/DarkSky/'

## Load all the json data into one dataframe

In [147]:
# What json daily files are in the folder
files = [f'{DATAFOLDER}{item}' for item in os.listdir(DATAFOLDER) if item.endswith('json')]
n_files = len(files)

# Dataframe to hold all the DarkSky data
df = pd.DataFrame()

# loop through the files and append to pandas dataframe
for file in files:
    # load json file
    with open(f'{file}') as f:
        d = json.load(f)
        f.close()
    # append to df if daily in keys
    if 'daily' in d.keys():
        df = df.append(d['daily']['data'])

n_imported = len(df)

print(f"Imported {n_imported} datapoints. {n_files-n_imported} had no daily entry.")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Imported 922 datapoints. 7 had no daily entry.


In [148]:
# Only 7 missing entries, lets see if we can identify them based on the times
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 922 entries, 0 to 0
Data columns (total 45 columns):
apparentTemperatureHigh        922 non-null float64
apparentTemperatureHighTime    922 non-null int64
apparentTemperatureLow         839 non-null float64
apparentTemperatureLowTime     839 non-null float64
apparentTemperatureMax         917 non-null float64
apparentTemperatureMaxTime     917 non-null float64
apparentTemperatureMin         917 non-null float64
apparentTemperatureMinTime     917 non-null float64
cloudCover                     740 non-null float64
dewPoint                       917 non-null float64
humidity                       917 non-null float64
icon                           784 non-null object
moonPhase                      922 non-null float64
ozone                          607 non-null float64
precipAccumulation             57 non-null float64
precipIntensity                839 non-null float64
precipIntensityMax             839 non-null float64
precipIntensityMa

## Convert times into timestamps

Times returned by DarkSky are in UNIX timestamp. This is a problem for direct conversion, because the pandas sereies needs to be timezone aware. We can deal with this by adding the timezone information and converting to the respective timezone for all the timestamp objects.

In [149]:
# columns with time information
cols_time = [c for c in df.columns if ('Time' in c) or ('time' in c)]

for col in cols_time:
    df[col] = pd.to_datetime(df[col], unit='s').dt.tz_localize('utc').dt.tz_convert('Europe/Zurich')
    
# order the dataframe by time
df = df.sort_values(by='time')

In [150]:
df.head()

Unnamed: 0,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,cloudCover,dewPoint,...,temperatureMinError,temperatureMinTime,time,uvIndex,uvIndexTime,visibility,windBearing,windGust,windGustTime,windSpeed
0,15.92,2017-10-01 16:13:00+02:00,9.71,2017-10-01 23:42:00+02:00,15.92,2017-10-01 16:13:00+02:00,8.87,2017-10-01 08:20:00+02:00,,9.98,...,,2017-10-01 08:20:00+02:00,2017-10-01 00:00:00+02:00,4.0,2017-10-01 13:45:00+02:00,9.186,247.0,4.61,2017-10-01 13:17:00+02:00,0.37
0,17.15,2017-10-02 16:46:00+02:00,12.7,2017-10-03 08:00:00+02:00,17.15,2017-10-02 16:46:00+02:00,9.74,2017-10-02 00:00:00+02:00,0.81,11.77,...,,2017-10-02 00:00:00+02:00,2017-10-02 00:00:00+02:00,3.0,2017-10-02 13:17:00+02:00,9.992,229.0,8.87,2017-10-02 21:57:00+02:00,0.61
0,16.14,2017-10-03 17:00:00+02:00,5.25,2017-10-04 04:29:00+02:00,17.67,2017-10-03 04:39:00+02:00,7.78,2017-10-03 23:00:00+02:00,0.76,11.01,...,,2017-10-03 23:00:00+02:00,2017-10-03 00:00:00+02:00,4.0,2017-10-03 13:00:00+02:00,9.858,269.0,10.01,2017-10-03 05:06:00+02:00,1.0
0,15.47,2017-10-04 17:24:00+02:00,4.75,2017-10-05 03:25:00+02:00,15.47,2017-10-04 17:24:00+02:00,5.25,2017-10-04 04:29:00+02:00,,6.59,...,,2017-10-04 04:29:00+02:00,2017-10-04 00:00:00+02:00,0.0,2017-10-04 00:00:00+02:00,9.991,230.0,4.62,2017-10-04 15:58:00+02:00,0.24
0,19.81,2017-10-05 15:09:00+02:00,5.68,2017-10-06 07:52:00+02:00,19.81,2017-10-05 15:09:00+02:00,4.75,2017-10-05 03:25:00+02:00,,6.29,...,,2017-10-05 03:25:00+02:00,2017-10-05 00:00:00+02:00,1.0,2017-10-05 16:00:00+02:00,9.837,233.0,9.64,2017-10-05 15:03:00+02:00,0.94


In [151]:
# save the data
fname = 'data_time_converted.csv'
df.to_csv(f"../02.Prepared_data/DarkSky/{fname}")

## Choose columns to use for analysis

For our goal, the prediction of solar production output, we would consider data on temperature, sun, and precipitation relevant. The time information is to coarse, sometimes the maxima are reported during the night hours, or on the next/pervious day, which might give a wrong picture. We should drop the time informations for now, maybe re-visit this in a later stage.

In [152]:
df.columns.tolist()

['apparentTemperatureHigh',
 'apparentTemperatureHighTime',
 'apparentTemperatureLow',
 'apparentTemperatureLowTime',
 'apparentTemperatureMax',
 'apparentTemperatureMaxTime',
 'apparentTemperatureMin',
 'apparentTemperatureMinTime',
 'cloudCover',
 'dewPoint',
 'humidity',
 'icon',
 'moonPhase',
 'ozone',
 'precipAccumulation',
 'precipIntensity',
 'precipIntensityMax',
 'precipIntensityMaxError',
 'precipIntensityMaxTime',
 'precipProbability',
 'precipType',
 'pressure',
 'summary',
 'sunriseTime',
 'sunsetTime',
 'temperatureHigh',
 'temperatureHighError',
 'temperatureHighTime',
 'temperatureLow',
 'temperatureLowError',
 'temperatureLowTime',
 'temperatureMax',
 'temperatureMaxError',
 'temperatureMaxTime',
 'temperatureMin',
 'temperatureMinError',
 'temperatureMinTime',
 'time',
 'uvIndex',
 'uvIndexTime',
 'visibility',
 'windBearing',
 'windGust',
 'windGustTime',
 'windSpeed']

In [153]:
cols = ['apparentTemperatureHigh',
 'apparentTemperatureLow',
 'cloudCover',
 'precipIntensityMax',
 'precipProbability',
 'precipType',
 'sunriseTime',
 'sunsetTime',
 'temperatureHigh',
 'time',
 'uvIndex']

df = df[cols] 

### Unit conversions

In [154]:
# inch to cm
inch_to_cm = 2.54
df['precipIntensityMax_cm'] = df['precipIntensityMax'] * inch_to_cm

### Plots

#### Temperature

In [155]:
%matplotlib widget
df = df.reset_index(drop=True)

fig, ax = plt.subplots(figsize=(8,5))

X = df['time']

# apparent 
col = 'apparentTemperatureHigh'
Y = df[col].values
plt.plot(X, Y, label=col)

col = 'temperatureHigh'
Y = df[col].values
plt.plot(X, Y, label=col)


ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y %b'))
plt.xticks(rotation=45)
plt.xlabel('Date')
plt.ylabel('Temperature in degree celcius')
plt.legend()
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

What we see here as a takeaway is that the temperature follows a seasonal trend. The (high) temperatures are low in the winter and high in the summer. It is also rarely below 0 degC. It must be kept in mind that this is not the mean or median temperature, but the daytime high temperature!

#### Precipitation

In [156]:
%matplotlib widget
df = df.reset_index(drop=True)

fig, ax = plt.subplots(figsize=(9,5))

X = df['time']


col = 'precipIntensityMax_cm'
Y = df[col].values
plt.plot(X, Y, label=col)


ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y %b'))
plt.xticks(rotation=45)
plt.xlabel('Date')
plt.ylabel('Precipitation intensity in cm of liquid water per hour')
plt.legend()
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<div class="alert alert-block alert-danger">
First of all we see that there are missing values for the precipitation from April 2018 until July 2018. Also there is an extreme value compared to the rest in June 2019 with more than 70 cm per hour of water. Let us first investigate this outlier .
    </div>

#### Outlier June 2019 precipitation intensity

In [157]:
df.loc[:, ['time', 'precipIntensityMax_cm', 'precipProbability', 'precipType']].loc[df['precipIntensityMax_cm'] == df['precipIntensityMax_cm'].max()]

Unnamed: 0,time,precipIntensityMax_cm,precipProbability,precipType
615,2019-06-15 00:00:00+02:00,71.308722,0.92,rain


There is no exceptional for that day in the historical weather. More confusingly, there was no rain recorded that day. It must be an error and will hence be removed and replaced with the mean of the 10 days around that date.

In [158]:
# replace with np.nan
df.loc[df['time'].dt.date == pd.to_datetime("2019-06-15"), 'precipIntensityMax_cm'] = np.nan

In [159]:
date_min = pd.to_datetime("2019-06-10")
date_max = pd.to_datetime("2019-06-20")
mean = np.mean(df.loc[:, ['precipIntensityMax_cm']].loc[(df['time'].dt.date >= date_min) & (df['time'].dt.date <= date_max)]).values[0]
df.loc[df['time'].dt.date == pd.to_datetime("2019-06-15"), 'precipIntensityMax_cm'] = mean

In [160]:
df.loc[df['time'].dt.date == pd.to_datetime("2019-06-15"), 'precipIntensityMax_cm']

615    2.158187
Name: precipIntensityMax_cm, dtype: float64

In [161]:
%matplotlib widget
df = df.reset_index(drop=True)

fig, ax = plt.subplots(figsize=(9,5))

X = df['time']


col = 'precipIntensityMax_cm'
Y = df[col].values
plt.plot(X, Y, label=col)


ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y %b'))
plt.xticks(rotation=45)
plt.xlabel('Date')
plt.ylabel('Precipitation intensity in cm of liquid water per hour')
plt.legend()
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Let's now focus on the missing values.

In [None]:
# set index as time
df.set_index('time', inplace=True, drop=True)

In [167]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 922 entries, 2017-10-01 00:00:00+02:00 to 2020-04-16 00:00:00+02:00
Data columns (total 11 columns):
apparentTemperatureHigh    922 non-null float64
apparentTemperatureLow     839 non-null float64
cloudCover                 740 non-null float64
precipIntensityMax         839 non-null float64
precipProbability          837 non-null float64
precipType                 780 non-null object
sunriseTime                922 non-null datetime64[ns, Europe/Zurich]
sunsetTime                 922 non-null datetime64[ns, Europe/Zurich]
temperatureHigh            922 non-null float64
uvIndex                    889 non-null float64
precipIntensityMax_cm      839 non-null float64
dtypes: datetime64[ns, Europe/Zurich](2), float64(8), object(1)
memory usage: 126.4+ KB


In [166]:
df['2018-05-01':'2018-08-01'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 87 entries, 2018-05-01 00:00:00+02:00 to 2018-08-01 00:00:00+02:00
Data columns (total 11 columns):
apparentTemperatureHigh    87 non-null float64
apparentTemperatureLow     19 non-null float64
cloudCover                 16 non-null float64
precipIntensityMax         20 non-null float64
precipProbability          19 non-null float64
precipType                 16 non-null object
sunriseTime                87 non-null datetime64[ns, Europe/Zurich]
sunsetTime                 87 non-null datetime64[ns, Europe/Zurich]
temperatureHigh            87 non-null float64
uvIndex                    54 non-null float64
precipIntensityMax_cm      20 non-null float64
dtypes: datetime64[ns, Europe/Zurich](2), float64(8), object(1)
memory usage: 8.2+ KB


In [142]:
df.loc[:, ['precipIntensityMax_cm']].loc[(df['time'].dt.date == pd.to_datetime("2019-06-15"))]

Unnamed: 0,precipIntensityMax_cm
615,2.158187


<div class="alert alert-block alert-info">
    <b>Columns (from the DarkSky API):</b>
</div>

- `apparentTemperatureHigh`: The daytime high apparent (feels-like) temperature.
- `temperatureHigh`: The daytime high temperature.
    - these two temperatures are correlated, but we will have to see if the apparent or normally measured temperature is a better predictor for the solar energy production
- `precipIntensity`: The intensity (in inches of liquid water per hour) of precipitation occurring at the given time. This value is conditional on probability (that is, assuming any precipitation occurs at all).
- `cloudCover`: 
 'precipAccumulation',
 'precipIntensity',
 'precipIntensityMax',
 'precipProbability',
 'precipType',
 'sunriseTime',
 'sunsetTime',
 'temperatureHigh',
 'time',
 'uvIndex'


In [12]:
df.loc[:, ['time', 'apparentTemperatureMax', 'apparentTemperatureHigh', 'apparentTemperatureHighTime', 'apparentTemperatureMaxTime']].iloc[-5:]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,time,apparentTemperatureMax,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureMaxTime
917,2020-04-12 00:00:00+02:00,22.79,22.79,,
918,2020-04-13 00:00:00+02:00,19.21,19.21,,
919,2020-04-14 00:00:00+02:00,13.89,13.89,,
920,2020-04-15 00:00:00+02:00,20.09,20.09,,
921,2020-04-16 00:00:00+02:00,20.88,20.88,,


<div class="alert alert-block alert-info">
<b>Tip:</b> Use blue boxes (alert-info) for tips and notes. 
If it’s a note, you don’t have to include the word “Note”.
</div>