In [28]:
from zipfile import ZipFile
import os
from tensorflow import keras
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from collections.abc import Generator


In [19]:
# Fetch the Jena climate archive and unpack it into a "Data" folder under the
# current working directory. `csv_path` is the directory later cells read from.
uri = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip"
zip_path = keras.utils.get_file(origin=uri, fname="jena_climate_2009_2016.csv.zip")
cwd = os.getcwd()
csv_path = os.path.join(cwd, "Data")
# The context manager closes the archive handle even if extraction raises.
with ZipFile(zip_path) as zip_file:
    zip_file.extractall(path=csv_path)


In [22]:
# Load the extracted CSV; the trailing expression displays (n_rows, n_columns).
df = pd.read_csv(f"{csv_path}/jena_climate_2009_2016.csv")
df.shape

(420551, 15)

In [148]:
# Splitter shared by fetch_data() below: 100 splits, each with a single test
# row and a training window capped at 100 rows.
tscv = TimeSeriesSplit(
    n_splits=100,
    max_train_size=100,
    test_size=1,
)
print(tscv)

def fetch_data() -> Generator[tuple[pd.DataFrame, pd.DataFrame], None, None]:
    """
    Yield a single (features, labels) pair built from the Jena climate CSV.

    Relies on the notebook-level names `csv_path`, `tscv` and `add_lags`.
    For every split produced by `tscv`, the training rows are flattened into
    one feature row by `add_lags`, and the test row (without "Date Time")
    becomes the matching label row.

    Because this is a generator function, calling it only creates the
    generator; the CSV is not read until the first `next()`.

    NOTE: a later cell defines `fetch_data(source, n_observations, num_lags)`,
    which replaces this definition once that cell has been run.
    """
    frame = pd.read_csv(csv_path+"/jena_climate_2009_2016.csv")
    folds = list(tscv.split(frame))
    features = [add_lags(frame.iloc[window]) for window, _ in folds]
    labels = [frame.iloc[target].drop(columns=["Date Time"]) for _, target in folds]
    yield (
        pd.concat(features, axis=0).reset_index(drop=True),
        pd.concat(labels, axis=0).reset_index(drop=True),
    )
gen = fetch_data()

TimeSeriesSplit(gap=0, max_train_size=100, n_splits=100, test_size=1)


In [151]:
def add_lags(df: pd.DataFrame, time_col: str = "Date Time") -> pd.DataFrame:
    """
    Flatten a window of time points into a single wide row of lag features.

    Each row of df must be a separate time point, which will be transformed
    into a lag. A frame of dim -> n_samples x n_columns (not counting
    time_col) becomes a frame of dim -> 1 x (n_columns*n_lags), where
    n_lags == n_samples. Columns are named "<column>_lag_<k>": the first row
    of df is labelled with k == n_lags and the last row with k == 1.

    Parameters
    ----------
    df : window of observations, including the time_col column.
    time_col : name of the timestamp column dropped before flattening.

    Returns
    -------
    A one-row DataFrame with a fresh 0-based index. Every value keeps the
    dtype of the column it came from.

    Raises
    ------
    KeyError : if time_col is not a column of df.
    ValueError : if df has no rows.
    """
    features = df.drop(columns=[time_col])
    n_lags = features.shape[0]
    if n_lags == 0:
        raise ValueError("add_lags requires at least one row to build lag columns")

    # Row-major flattening emits all columns of row 0, then row 1, and so on,
    # so the names are generated in the same nested order.
    names = [
        f"{col}_lag_{n_lags - pos}"
        for pos in range(n_lags)
        for col in features.columns
    ]

    col_dtypes = features.dtypes.tolist()
    # With mixed column dtypes, go through an object array so no value is
    # coerced to a lossy common dtype before being cast back below.
    mixed = len(set(col_dtypes)) > 1
    values = features.to_numpy(dtype=object if mixed else None)
    flat = pd.DataFrame(
        values.reshape(1, n_lags * len(col_dtypes)),
        columns=names,
    )

    # Restore per-column dtypes only when flattening changed them; the common
    # all-float case skips this extra pass.
    wanted = col_dtypes * n_lags
    if flat.dtypes.tolist() != wanted:
        flat = flat.astype(dict(zip(names, wanted)))
    return flat

def fetch_data(source: str, n_observations: int, num_lags: int) -> Generator[tuple[pd.DataFrame, pd.DataFrame], None, None]:
    """
    Yield a single (features, labels) pair built from a time-ordered CSV.

    Parameters
    ----------
    source : location handed to pd.read_csv; the file must contain a
        "Date Time" column.
    n_observations : number of splits requested from TimeSeriesSplit, hence
        the number of rows in each returned frame.
    num_lags : cap on the number of training rows per split
        (max_train_size); each training window is flattened by add_lags.

    Yields
    ------
    Exactly one tuple (features, labels), both re-indexed from 0:
    features has one add_lags row per split, labels has each split's test
    row with "Date Time" removed.
    """
    frame = pd.read_csv(source)
    splitter = TimeSeriesSplit(
        n_splits=n_observations,
        max_train_size=num_lags,
        test_size=1,
    )
    # Handle each split once, keeping its feature row and label row together.
    pairs = [
        (add_lags(frame.iloc[window]), frame.iloc[target].drop(columns=["Date Time"]))
        for window, target in splitter.split(frame)
    ]
    features = pd.concat([lagged for lagged, _ in pairs], axis=0)
    labels = pd.concat([label for _, label in pairs], axis=0)
    yield features.reset_index(drop=True), labels.reset_index(drop=True)

In [155]:
# Request 1000 splits with training windows capped at 10 rows, then pull the
# only (features, labels) pair out of the generator.
gen = fetch_data(
    source=f"{csv_path}/jena_climate_2009_2016.csv",
    n_observations=1000,
    num_lags=10,
)
# `train` holds the lagged feature rows; `test` holds the matching label rows.
train, test = next(gen)

In [156]:
# Display the lagged feature frame unpacked from the generator in the cell above.
train

Unnamed: 0,p (mbar)_lag_10,T (degC)_lag_10,Tpot (K)_lag_10,Tdew (degC)_lag_10,rh (%)_lag_10,VPmax (mbar)_lag_10,VPact (mbar)_lag_10,VPdef (mbar)_lag_10,sh (g/kg)_lag_10,H2OC (mmol/mol)_lag_10,...,rh (%)_lag_1,VPmax (mbar)_lag_1,VPact (mbar)_lag_1,VPdef (mbar)_lag_1,sh (g/kg)_lag_1,H2OC (mmol/mol)_lag_1,rho (g/m**3)_lag_1,wv (m/s)_lag_1,max. wv (m/s)_lag_1,wd (deg)_lag_1
0,996.38,5.94,279.39,1.85,74.9,9.32,6.98,2.34,4.37,7.01,...,75.30,9.52,7.17,2.35,4.49,7.19,1238.86,3.21,6.09,219.1
1,996.44,5.95,279.40,1.86,74.9,9.33,6.99,2.34,4.37,7.01,...,74.90,9.58,7.17,2.40,4.49,7.20,1238.43,3.77,7.17,216.4
2,996.44,5.87,279.31,1.87,75.4,9.27,6.99,2.28,4.38,7.02,...,75.20,9.57,7.20,2.37,4.50,7.22,1238.73,3.03,5.17,221.7
3,996.38,5.83,279.28,1.88,75.7,9.25,7.00,2.25,4.38,7.03,...,75.80,9.53,7.23,2.31,4.52,7.25,1238.77,4.11,7.09,217.9
4,996.37,5.90,279.35,1.93,75.6,9.29,7.02,2.27,4.40,7.05,...,76.70,9.48,7.27,2.21,4.55,7.30,1238.98,3.79,6.97,220.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1001.00,-2.48,270.60,-6.97,71.1,5.09,3.62,1.47,2.25,3.61,...,72.60,4.56,3.31,1.25,2.06,3.31,1292.41,0.56,1.00,202.6
996,1000.81,-2.48,270.62,-6.95,71.2,5.09,3.62,1.47,2.25,3.62,...,73.10,4.52,3.30,1.22,2.06,3.30,1292.98,0.67,1.52,240.0
997,1000.70,-2.59,270.51,-7.04,71.3,5.04,3.60,1.45,2.24,3.59,...,69.71,4.77,3.32,1.44,2.07,3.32,1289.44,1.14,1.92,234.3
998,1000.65,-2.89,270.22,-7.15,72.3,4.93,3.57,1.37,2.22,3.57,...,67.91,4.84,3.28,1.55,2.05,3.28,1288.39,1.08,2.00,215.2


In [87]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Alias the lagged feature frame as X_train (same object, no copy is made).
X_train = train

In [None]:
# Depth-2 random forest with a fixed seed. `train` is the feature matrix X and
# `test` (the label rows from fetch_data) is the target y.
regr = RandomForestRegressor(
    max_depth=2,
    random_state=0,
)
regr.fit(train, test)  # fit(X, y)
