In [1]:
%load_ext autoreload
%autoreload 2

In [84]:
from datetime import datetime
from functools import partial

import polars as pl
import altair as alt
import numpy as np
from numpy.random import multivariate_normal
from scipy.interpolate import CubicSpline

from weather.helpers.epw_read import read_epw
from weather.helpers.weather_data import PALO_ALTO_20
from weather.helpers.filter import filter_df_by_month



In [48]:
df = read_epw(PALO_ALTO_20.path)
month_filter = partial(filter_df_by_month, df, PALO_ALTO_20)
june = month_filter(6).filter(pl.col("datetime").dt.day() != 30)
assert (june["datetime"].dt.date().unique_counts().unique() == 24).all()

In [49]:
june.head()

datetime,Dry Bulb Temperature,Dew Point Temperature,Relative Humidity,Extraterrestrial Horizontal Radiation,Extraterrestrial Direct Normal Radiation,Horizontal Infrared Radiation Intensity,Global Horizontal Radiation,Direct Normal Radiation,Diffuse Horizontal Radiation,Global Horizontal Illuminance,Direct Normal Illuminance,Diffuse Horizontal Illuminance,Zenith Luminance,Wind Direction,Wind Speed,Total Sky Cover,Opaque Sky Cover (used if Horizontal IR Intensity missing),Visibility,Ceiling Height,Present Weather Observation,Precipitable Water,Aerosol Optical Depth,Snow Depth,Days Since Last Snowfall,Albedo,Liquid Precipitation Depth,Liquid Precipitation Quantity
datetime[μs],f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,f64,i64,i64,i64,f64,i64,i64,f64,f64,f64
2020-06-01 00:00:00,16.0,7.8,58,0,0,339,0,0,0,0,0,0,0,350,2.9,5,5,16.1,77777,9,160,0.0,0,88,999.0,0.0,1.0
2020-06-01 01:00:00,15.8,8.5,62,0,0,339,0,0,0,0,0,0,0,350,2.7,5,5,16.1,77777,9,170,0.0,0,88,999.0,0.0,1.0
2020-06-01 02:00:00,15.5,9.2,66,0,0,339,0,0,0,0,0,0,0,350,2.4,5,5,16.1,77777,9,179,0.0,0,88,999.0,0.0,1.0
2020-06-01 03:00:00,15.3,10.0,71,0,0,339,0,0,0,0,0,0,0,350,2.2,5,5,16.1,77777,9,189,0.0,0,88,999.0,0.0,1.0
2020-06-01 04:00:00,15.1,10.7,75,3,192,339,1,0,1,147,0,231,6,350,1.9,5,5,16.1,77777,9,200,0.0,0,88,999.0,0.0,1.0


In [82]:
hours = [0, 6, 12, 18, 23] # TODO apply! 

def create_time_df(df):
    def filter_hour(hour):
        return df.filter(pl.col("datetime").dt.hour() == hour)["Dry Bulb Temperature"]

    return pl.DataFrame().with_columns(
        begin = filter_hour(0),
        am = filter_hour(6),
        noon = filter_hour(12),
        pm = filter_hour(18),
        end = filter_hour(23)
    )

In [53]:
tdf = create_time_df(june).with_row_index("day in month")
tdf.head()

day in month,begin,am,noon,pm,end
u32,f64,f64,f64,f64,f64
0,16.0,16.2,22.2,19.9,18.6
1,18.3,18.2,28.2,25.4,22.9
2,22.4,21.2,29.4,25.4,21.6
3,20.8,18.2,27.9,23.0,19.8
4,19.2,17.1,19.0,16.6,16.1


In [None]:
alt.Chart(tdf).transform_fold(
tdf.columns[1:],
).mark_line().encode(
    x='day in month:Q',
    y='value:Q',
    color=alt.Color('key:N').sort(tdf.columns[1:])
)


In [63]:
alt.Chart(tdf).transform_fold(
tdf.columns[1:],
).mark_bar().encode(
    x='value:Q',
    y='count()',
    row=alt.Row("key:N").sort(tdf.columns[1:])
).properties(
    width=300, height=50
)


In [76]:
tdfo = tdf.drop("day in month")

tdfo.mean().to_numpy().flatten()
tdfo.corr().to_numpy()

array([[1.        , 0.78259065, 0.57257922, 0.56273579, 0.49017695],
       [0.78259065, 1.        , 0.53465627, 0.54844945, 0.50046856],
       [0.57257922, 0.53465627, 1.        , 0.95322279, 0.88801313],
       [0.56273579, 0.54844945, 0.95322279, 1.        , 0.96749648],
       [0.49017695, 0.50046856, 0.88801313, 0.96749648, 1.        ]])

In [98]:
samples = multivariate_normal(tdfo.mean().to_numpy().flatten(), tdfo.corr().to_numpy(), size=10)
samples

array([[18.34126089, 17.90347305, 23.41704726, 19.03601232, 17.44896397],
       [16.1306236 , 16.54716142, 22.34777658, 17.90338084, 16.19329307],
       [18.0998365 , 17.11716936, 25.56716004, 20.76798848, 19.05225536],
       [18.39477289, 17.71353755, 24.87100073, 20.46113913, 18.58425083],
       [16.65235891, 16.0382873 , 22.01273836, 17.9950282 , 16.58760088],
       [15.33222581, 16.26691805, 23.18211163, 18.7893017 , 17.06503278],
       [17.78677733, 17.46985602, 23.88825561, 19.44639213, 17.54110795],
       [16.01367072, 15.34522051, 23.47834345, 19.18324936, 17.69565821],
       [17.39272418, 17.65951666, 24.36859047, 19.94043542, 18.00532467],
       [17.32053975, 15.33893381, 23.97223903, 19.65354724, 17.64984968]])

In [112]:
times = np.arange(0, 24, 1)
fits = [CubicSpline(np.array(hours), sample)(times) for sample in samples]


In [113]:
samples_df = pl.DataFrame(data=fits).insert_column(0, pl.Series("time", times)).unpivot(index="time")
samples_df

time,variable,value
i64,str,f64
0,"""column_0""",18.341261
1,"""column_0""",16.868289
2,"""column_0""",16.098842
3,"""column_0""",15.925279
4,"""column_0""",16.239958
…,…,…
19,"""column_9""",18.56351
20,"""column_9""",17.695276
21,"""column_9""",17.17169
22,"""column_9""",17.1156


In [115]:
alt.Chart(samples_df).mark_line().encode(
    x='time:Q',
    y=alt.Y('value:Q').scale(zero=False),
    color=alt.Color("variable:O")
)