In [1]:
import os
import math
import sys
import importlib

import numpy as np

import pandas as pd

import sklearn
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, RobustScaler, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

from scipy.stats import norm

import keras
from keras import backend as bkend
from keras.datasets import cifar10, mnist
from keras.layers import Dense, BatchNormalization, Dropout, Flatten, convolutional, pooling
from keras import metrics

import tensorflow as tf
from tensorflow.python.client import device_lib

import matplotlib.pyplot as plt

# Load the raw CSV from the current working directory.
data_dir = os.getcwd()
fname = os.path.join(data_dir, "jena_climate_2009_2016.csv")

# The context manager closes the file even if reading raises.
with open(fname) as f:
    data = f.read()

lines = data.split("\n")
header = lines[0].split(",")
# Drop the header and any blank lines: a trailing newline at the end of the
# file would otherwise produce an empty row that cannot be parsed.
lines = [line for line in lines[1:] if line.strip()]

# Parse every column except the first one into a dense float matrix with
# one row per CSV line.
float_data = np.zeros((len(lines), len(header) - 1))
for i, line in enumerate(lines):
    float_data[i, :] = [float(x) for x in line.split(",")[1:]]

# Standardize each column with statistics taken from the first 20000 rows
# only. The std is computed after centering, which does not change its value.
# NOTE(review): the training batch below is drawn from the first 200000 rows;
# confirm that 20000 (rather than 200000) is the intended normalization window.
mean = float_data[:20000].mean(axis=0)
float_data -= mean
std = float_data[:20000].std(axis=0)
float_data /= std
    
def generator(data,
              lookback,
              delay,
              min_index,
              max_index,
              shuffle=False,
              batch_size=25,
              step=6):
    """Build one batch of look-back windows and their delayed targets.

    Despite its name, this function is not a Python generator: it returns a
    single ``(samples, targets)`` tuple, and callers index that tuple directly.

    Args:
        data: 2-D array indexed as ``data[row, column]``.
        lookback: Number of rows before the anchor row covered by a window.
        delay: Offset, in rows, from the anchor row to the target row.
        min_index: Lower bound of the usable row range; the first possible
            anchor row is ``min_index + lookback``.
        max_index: Exclusive upper bound for anchor rows. ``None`` means
            ``len(data) - delay - 1``.
        shuffle: If true, draw ``batch_size`` anchor rows uniformly at random
            (with replacement) from ``[min_index + lookback, max_index)``;
            otherwise take consecutive rows starting at
            ``min_index + lookback``, truncated at ``max_index``.
        batch_size: Number of anchor rows requested.
        step: Stride, in rows, between the samples kept inside a window.

    Returns:
        A tuple ``(samples, targets)``. ``samples`` is a float array of shape
        ``(n_rows, n_steps, data.shape[-1])`` where ``n_steps`` is the number
        of values in ``range(-lookback, 0, step)``. ``targets`` has shape
        ``(n_rows,)`` and holds column 1 of ``data`` at ``row + delay``.
    """
    if max_index is None:
        max_index = len(data) - delay - 1

    first_row = min_index + lookback
    if shuffle:
        rows = np.random.randint(first_row, max_index, size=batch_size)
    else:
        rows = np.arange(first_row, min(first_row + batch_size, max_index))

    # Offsets of the kept samples relative to each anchor row. Sizing the
    # output from these offsets (instead of ``lookback // step``) keeps the
    # shapes consistent when ``lookback`` is not a multiple of ``step``.
    offsets = np.arange(-lookback, 0, step)

    samples = np.zeros((len(rows), len(offsets), data.shape[-1]))
    targets = np.zeros((len(rows),))

    # Gather all windows and targets at once: broadcasting builds a
    # (n_rows, n_steps) matrix of row indices into ``data``.
    samples[:] = data[rows[:, None] + offsets]
    targets[:] = data[rows + delay, 1]

    return samples, targets

# Windowing parameters, expressed in rows. The hour figures follow the
# original arithmetic, which treats each row as a 10-minute measurement.
lookback = 360  # history per sample: 360 * 10 / 60 = 60 hours
step = 6  # stride inside a window: 6 * 10 / 60 = 1 hour between kept rows
delay = 144  # forecast horizon: 144 * 10 / 60 = 24 hours ahead


def _flatten_windows(windows):
    """Reshape a (n, timesteps, features) array to (n, timesteps * features)."""
    n_windows, n_steps, n_features = windows.shape
    return np.reshape(windows, [n_windows, n_steps * n_features])


# One shuffled batch of 1000 windows anchored in rows [0, 200000) for fitting.
train_gen = generator(
    data=float_data,
    lookback=lookback,
    delay=delay,
    min_index=0,
    max_index=200000,
    shuffle=True,
    step=step,
    batch_size=1000,
)

# One ordered batch of up to 1000 windows from row 300001 onwards for scoring.
test_gen = generator(
    data=float_data,
    lookback=lookback,
    delay=delay,
    min_index=300001,
    max_index=None,
    shuffle=False,
    step=step,
    batch_size=1000,
)

# Baseline: a random forest fitted directly on the flattened windows.
pipe_base = Pipeline(steps=[("model", RandomForestRegressor())])

train_samples, train_targets = train_gen
pipe_base = pipe_base.fit(X=_flatten_windows(train_samples), y=train_targets)

test_samples, test_targets = test_gen
baseline_mse = sklearn.metrics.mean_squared_error(
    y_pred=pipe_base.predict(X=_flatten_windows(test_samples)),
    y_true=test_targets,
)
print("The MSE score for the meteorology regression task without autoencoders: %.6f." % baseline_mse)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


The MSE score for the meteorology regression task without autoencoders: 0.133229.


# References

1. Chollet, F. (2018). Deep Learning with Python (Manning).
2. https://medium.com/deep-learning-turkey/google-colab-free-gpu-tutorial-e113627b9f5d