In [1]:
import os
# Restrict this process to the GPU with index 1. This is set in the first cell,
# before TensorFlow/Keras are imported in the next cell.
# NOTE(review): the device index is machine-specific; adjust when running elsewhere.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
from place_groups import place_groups_middle_of_roads, place_groups_junctions

from sklearn.linear_model import LinearRegression
from datetime import timedelta
from keras.callbacks import ModelCheckpoint
import numpy as np
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import pandas as pd
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Conv3D
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.layers import BatchNormalization
from keras.models import Sequential
from keras import regularizers

from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16, 10)

# prevent tensorflow from allocating the entire GPU memory at once
import tensorflow as tf
config = tf.ConfigProto()
# Ask TensorFlow to grow its GPU memory allocation on demand.
config.gpu_options.allow_growth = True
# NOTE(review): `sess` is not referenced again in the cells visible here and is not
# handed to Keras explicitly (e.g. via keras.backend.set_session). Presumably creating
# it first is what makes the option take effect for the process -- TODO confirm.
sess = tf.Session(config=config)

Using TensorFlow backend.


In [3]:
# File-name prefix of the checkpoint files written by LearnerForMeanSpeedOfOneSegment
# (combined there with the dropout factor and the place id).
HDF5_PREFIX = 'only_speed_lstm_with_dropout'

In [4]:
# Directory that holds the model checkpoint files.
HDF5_DIR = os.path.join('.', 'hdf5')
# Number of 5-minute intervals in one day (24 * 60 / 5); not referenced in the cells visible here.
ONE_DAY_LAGS = 288
# Directory with one '<place_id>.csv' file per place.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
DATA_PATH = '/mnt/sdc1/inon/norrecampus/data/by_place_5min'
# Place id used for the single-place run (currently commented out further down).
EXAMPLE_PLACE_ID = 'ChIJZaR1M1hSUkYRxP0WkwYYy_k'

# Speeds above this value are treated as outliers and replaced by interpolation
# (presumably km/h, judging by the 'speed_km_hr' column name -- TODO confirm).
THRESHOLD_HIGH_OUTLIER = 110
# Loss passed to model.compile().
LOSS = 'mse'
# batch_size / epochs / validation_split passed to model.fit().
MINI_BATCH_SIZE = 512
NUM_EPOCHS = 100
VALIDATION_SPLIT = 0.2
# Number of lagged observations used as model input.
NUM_LAGS = 12
# Number of units in the LSTM layer.
LSTM_STATE_SIZE = NUM_LAGS
# Number of units in the final Dense layer.
NUM_OUTS_MEAN_SPEED_REGRESSION = 1
# Timestamps before this date form the training set; the rest form the test set.
SPLIT_DATE = '2015-06-01'

In [5]:
def split_timestamp(ser):
    """Turn a datetime-indexed series into a frame with explicit calendar columns.

    Args:
        ser: pandas Series (or anything ``pd.DataFrame`` accepts) whose index is a
            ``DatetimeIndex``.

    Returns:
        DataFrame with a fresh integer index. The former index becomes a regular
        column (named ``'index'`` when the index had no name), followed by the
        original values and two added columns: ``day_of_week`` (0 = Monday) and
        ``time_of_day`` (``datetime.time`` objects).
    """
    frame = pd.DataFrame(ser)
    timestamps = frame.index
    with_calendar_columns = frame.assign(
        day_of_week=timestamps.dayofweek,
        time_of_day=timestamps.time)
    return with_calendar_columns.reset_index()

In [6]:
class LearnerForMeanSpeedOfOneSegment(object):
    """LSTM speed predictor (plus a linear-regression baseline) for one road segment.

    Pipeline, as implemented by the methods below:

    1. Load the place's CSV and keep the ``speed_km_hr`` series (``_get_df``).
    2. Put the series on a regular 5-minute grid, mask values above
       ``threshold_high_outlier`` and fill all gaps by interpolation
       (``_imputation_for_missing_values_and_outliers``).
    3. Detrend: subtract the mean and divide by the standard deviation of the
       matching (day of week, time of day) slot, both estimated on the training
       period only (``detrend`` / ``_get_detrend_factors``).
    4. Build lagged inputs and split into train/test at ``split_date``
       (``_get_train_and_test_inputs``).
    5. Fit either the LSTM (``create_and_train_model``) or the baseline
       (``baseline_lr``) and score the predictions in km/h, ignoring timestamps
       whose true value was interpolated.

    The class relies on module-level names defined elsewhere in the notebook:
    ``HDF5_DIR``, ``HDF5_PREFIX``, ``DATA_PATH`` and ``split_timestamp``.

    Derived data (imputed frame, detrend factors, train/test inputs) is computed
    once per instance and cached. The cache is not invalidated when attributes
    such as ``df``, ``split_date`` or ``num_lags`` are changed afterwards; create
    a new learner instead.
    """

    # Rows with a start timestamp before this value are dropped when the CSV is loaded.
    FIRST_TIMESTAMP = '2015-01-01'
    # Sampling interval of the series. 'min' is used instead of the alias 'T',
    # which recent pandas versions deprecate.
    SAMPLING_FREQ = '5min'

    def __init__(self,
                 dropout_factor,
                 threshold_high_outlier,
                 loss,
                 place_id,
                 mini_batch_size,
                 num_epochs,
                 validation_split,
                 num_lags,
                 lstm_state_size,
                 num_outs,
                 split_date):
        """Store the hyper-parameters; no data is read and no model is built here.

        Args:
            dropout_factor: rate of the Dropout layer placed after the LSTM.
            threshold_high_outlier: speeds above this value are masked and interpolated.
            loss: loss passed to ``model.compile``.
            place_id: identifier of the place; also the CSV file stem.
            mini_batch_size: ``batch_size`` passed to ``model.fit``.
            num_epochs: ``epochs`` passed to ``model.fit``.
            validation_split: ``validation_split`` passed to ``model.fit``.
            num_lags: number of lagged observations used as input.
            lstm_state_size: number of units of the LSTM layer.
            num_outs: number of units of the final Dense layer.
            split_date: timestamps before it are training data, the rest test data.
        """
        self.df = None      # raw speed series, loaded lazily by _get_df()
        self.model = None   # Keras model, created by _build_model()
        self._derived_cache = {}

        self.dropout_factor = dropout_factor
        self.threshold_high_outlier = threshold_high_outlier
        self.loss = loss
        self.place_id = place_id
        self.mini_batch_size = mini_batch_size
        self.num_epochs = num_epochs
        self.validation_split = validation_split
        self.num_lags = num_lags
        self.lstm_state_size = lstm_state_size
        self.num_outs = num_outs
        self.split_date = split_date
        self.checkpoint_basename = os.path.join(HDF5_DIR, '%s_%.2f_%s.best.hdf5' %
                                                (HDF5_PREFIX, self.dropout_factor, self.place_id))

    def _memoized(self, key, compute):
        """Return the cached value stored under ``key``, computing it on first use."""
        # setdefault on __dict__ keeps this working for instances created without __init__.
        cache = self.__dict__.setdefault('_derived_cache', {})
        if key not in cache:
            cache[key] = compute()
        return cache[key]

    def compute_error_statistics(self, errors_df):
        """Summarise prediction errors into a one-row frame indexed by the place id.

        Args:
            errors_df: frame with columns ``speed_km_hr_true``,
                ``speed_km_hr_predicted`` and ``error``.

        Returns:
            DataFrame with one row and the columns ``corr``, ``mae``, ``mape``,
            ``mse``, ``msne``, ``rae``, ``rmse``, ``rmsne`` and ``r2``. ``mape``,
            ``msne`` and ``rmsne`` are relative to the true speed (fractions, not
            percentages) and become infinite when a true speed is 0. ``r2`` is
            clipped at 0 from below.
        """
        true_speeds = errors_df.speed_km_hr_true
        abs_errors = errors_df.error.abs()
        relative_errors = abs_errors / true_speeds
        squared_errors = abs_errors ** 2
        deviations_from_mean = true_speeds - np.mean(true_speeds)
        return pd.DataFrame({
            'corr': np.corrcoef(errors_df.speed_km_hr_predicted, true_speeds)[0, 1],
            'mae': np.mean(abs_errors),
            'mape': np.mean(relative_errors),
            'mse': np.mean(squared_errors),
            'msne': np.mean(relative_errors ** 2),
            'rae': np.sum(abs_errors) / np.sum(np.abs(deviations_from_mean)),
            'rmse': np.sqrt(np.mean(squared_errors)),
            'rmsne': np.sqrt(np.mean(relative_errors ** 2)),
            'r2': max(0, 1 - np.sum(squared_errors) / np.sum(deviations_from_mean ** 2))
        }, index=[self.place_id])

    def baseline_lr(self):
        """Fit a no-intercept linear regression on the lagged inputs and score it on the test set.

        Returns:
            One-row error-statistics frame (see ``compute_error_statistics``).
        """
        X_train_normalized, Y_train_normalized, X_test_normalized, _ = self._get_train_and_test_inputs()
        trained_lr = LinearRegression(fit_intercept=False).fit(X_train_normalized.values, Y_train_normalized.values)
        lr_predictions_normalized = trained_lr.predict(X_test_normalized.values)
        errors_df = self._individual_errors_without_interpolated_values(lr_predictions_normalized)
        return self.compute_error_statistics(errors_df)

    def _get_imputed_frame(self):
        """Return the (cached) imputed frame of this place's speed series."""
        return self._memoized(
            'imputed_frame',
            lambda: self._imputation_for_missing_values_and_outliers(self._get_df()))

    def _get_detrend_factors(self):
        """Return the (cached) per-slot mean and standard deviation of the training period.

        Returns:
            DataFrame with columns ``day_of_week``, ``time_of_day``, ``mean`` and
            ``std``; one row per (day of week, time of day) slot seen before
            ``split_date``.
        """
        return self._memoized('detrend_factors', self._compute_detrend_factors)

    def _compute_detrend_factors(self):
        """Compute the per-slot mean/std from the imputed training-period speeds."""
        imputed_speeds = self._get_imputed_frame().speed_km_hr
        train_speeds = imputed_speeds[imputed_speeds.index < self.split_date]
        # Name the group keys explicitly instead of relying on the auto-generated
        # 'level_0'/'level_1' columns, which only appear when the index is unnamed.
        group_keys = [
            pd.Index(train_speeds.index.dayofweek, name='day_of_week'),
            pd.Index(train_speeds.index.time, name='time_of_day'),
        ]
        return train_speeds\
            .groupby(group_keys)\
            .agg(['mean', 'std'])\
            .reset_index()

    def detrend(self, ser):
        """Normalise speeds by the mean/std of their (day of week, time of day) slot.

        Args:
            ser: datetime-indexed series named ``speed_km_hr``.

        Returns:
            Series named ``speed_normalized``, sorted by timestamp. Timestamps whose
            slot has no training-period statistics are dropped (inner merge).
        """
        # NOTE(review): a slot whose training std is 0 yields inf/NaN here -- confirm
        # that this cannot happen for the data at hand.
        # rename_axis makes sure split_timestamp() emits the timestamp column as 'index'.
        return pd\
            .merge(left=split_timestamp(ser.rename_axis('index')),
                   right=self._get_detrend_factors(),
                   on=['day_of_week', 'time_of_day'],
                   how='inner')\
            .assign(speed_normalized=lambda df: (df.speed_km_hr - df['mean']) / df['std'])\
            .set_index('index')\
            .speed_normalized\
            .sort_index()

    def _map_back(self, detrended_series):
        """Invert ``detrend``: turn normalised values back into speeds.

        Args:
            detrended_series: datetime-indexed series named ``speed_normalized``.

        Returns:
            Series named ``speed_km_hr``, sorted by timestamp.
        """
        return pd\
            .merge(left=split_timestamp(detrended_series.rename_axis('index')),
                   right=self._get_detrend_factors(),
                   on=['day_of_week', 'time_of_day'],
                   how='inner')\
            .assign(speed_km_hr=lambda df: (df.speed_normalized * df['std']) + df['mean'])\
            .set_index('index')\
            .speed_km_hr\
            .sort_index()

    def _imputation_for_missing_values_and_outliers(self, ser):
        """Put ``ser`` on a regular 5-minute grid and interpolate gaps and outliers.

        The grid runs from midnight of the first day up to and including midnight
        after the last day. Values above ``threshold_high_outlier`` are masked
        before interpolating.

        Args:
            ser: datetime-indexed series of speeds.

        Returns:
            DataFrame with three columns: the imputed values (under the series'
            own name), ``original_value`` (NaN where the timestamp was missing)
            and ``is_interpolated`` (True wherever the two differ, which includes
            rows whose original value is NaN).
        """
        first_day = ser.index.min().normalize()
        day_after_last = ser.index.max().normalize() + timedelta(days=1)
        missing_timestamps = pd\
            .date_range(first_day, day_after_last, freq=self.SAMPLING_FREQ)\
            .difference(ser.index)
        original_values = pd.DataFrame(ser)\
            .join(pd.DataFrame(index=missing_timestamps), how='outer')\
            .iloc[:, 0]
        return original_values\
            .mask(lambda values: values > self.threshold_high_outlier)\
            .interpolate()\
            .to_frame()\
            .assign(original_value=original_values)\
            .assign(is_interpolated=lambda df: df.iloc[:, 0] != df.iloc[:, 1])

    def _get_df(self):
        """Load (once) and return the place's speed series, indexed by interval start."""
        if self.df is None:
            raw = pd.read_csv(
                os.path.join(DATA_PATH, self.place_id + '.csv'),
                parse_dates=['start_interval_s', 'end_interval_s'])
            self.df = raw[raw.start_interval_s >= self.FIRST_TIMESTAMP]\
                .rename(columns={'start_interval_s': 't', 'speed_mean': 'speed_km_hr'})\
                .set_index('t')\
                .speed_km_hr
        return self.df

    def _get_train_and_test_inputs(self):
        """Return the (cached) tuple ``(X_train, Y_train, X_test, Y_test)`` of normalised data.

        ``Y`` is the normalised speed at a timestamp; the columns of ``X`` are the
        values 1, 2, ..., ``num_lags`` steps earlier, in that order (most recent
        lag first). Callers must not modify the returned objects in place.
        """
        return self._memoized('train_and_test_inputs', self._compute_train_and_test_inputs)

    def _compute_train_and_test_inputs(self):
        """Build the lag matrix from the detrended series and split it at ``split_date``."""
        speeds_interpolated_and_detrended = self.detrend(self._get_imputed_frame().speed_km_hr)
        # Column k holds the series shifted by k rows; the first num_lags rows lack
        # a full history and are dropped.
        lags = pd.concat(
            [speeds_interpolated_and_detrended.shift(x) for x in range(self.num_lags + 1)],
            axis=1)[self.num_lags:]
        train = lags[lags.index < self.split_date]
        test = lags[lags.index >= self.split_date]
        return train.iloc[:, 1:], train.iloc[:, 0], test.iloc[:, 1:], test.iloc[:, 0]

    def _build_model(self):
        """Create and compile a fresh LSTM -> Dropout -> Dense model in ``self.model``."""
        # TODO: better weights initialization
        self.model = Sequential()
        self.model.add(LSTM(self.lstm_state_size, input_shape=(self.num_lags, 1), return_sequences=False))
        self.model.add(Dropout(self.dropout_factor))
        #model.add(Dense(units=1, kernel_regularizer=regularizers.l2(0.00001)))
        self.model.add(Dense(units=self.num_outs, activation="linear"))  # Linear activation, because speed RESIDUALS can have any sign.
        self.model.compile(loss=self.loss, optimizer="rmsprop")  # TODO: try adam optimizer too, although rmsprop is the default go-to for RNN

    def load_best_model_from_disk(self):
        """Rebuild the model and load the weights stored at ``checkpoint_basename``."""
        self._build_model()
        self.model.load_weights(self.checkpoint_basename)

    def create_and_train_model(self):
        """Build the model, train it on the training set and keep the best weights.

        The weights with the lowest validation loss are checkpointed during
        training and loaded back into ``self.model`` once training has finished.
        """
        self._build_model()
        X_train, Y_train, _, _ = self._get_train_and_test_inputs()
        # ModelCheckpoint needs an existing target directory.
        checkpoint_dir = os.path.dirname(self.checkpoint_basename)
        if checkpoint_dir and not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        self.model.fit(
            np.expand_dims(np.array(X_train.values), 2),
            Y_train.values,
            batch_size=self.mini_batch_size,
            epochs=self.num_epochs,
            validation_split=self.validation_split,
            # checkpoint best model
            callbacks=[ModelCheckpoint(
                self.checkpoint_basename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')],
            verbose=2)
        self.model.load_weights(self.checkpoint_basename)

    def predict(self):
        """Score the current model on the test set.

        Returns:
            One-row error-statistics frame (see ``compute_error_statistics``).

        Raises:
            RuntimeError: if no model has been built or loaded yet.
        """
        if self.model is None:
            raise RuntimeError(
                'No model available: call create_and_train_model() or load_best_model_from_disk() first')
        _, _, X_test_normalized, _ = self._get_train_and_test_inputs()
        predictions_normalized = self.model.predict(np.expand_dims(X_test_normalized.values, 2)).flatten()
        errors_df = self._individual_errors_without_interpolated_values(predictions_normalized)
        return self.compute_error_statistics(errors_df)

    def _impute(self):
        """Placeholder; not implemented."""
        raise NotImplementedError

    def _individual_errors_without_interpolated_values(self, predictions_normalized):
        """Compare test-set predictions with the truth, in km/h, on non-interpolated rows.

        Args:
            predictions_normalized: normalised predictions, one per test-set row
                and in the same order.

        Returns:
            DataFrame with columns ``speed_km_hr_true``, ``speed_km_hr_predicted``
            and ``error`` (true minus predicted), restricted to timestamps whose
            true value was not interpolated.
        """
        _, _, _, Y_test_normalized = self._get_train_and_test_inputs()
        ser_Y_test = self._map_back(Y_test_normalized)
        ser_predictions = self._map_back(pd.Series(
            predictions_normalized, index=Y_test_normalized.index, name='speed_normalized'))
        imputed = self._get_imputed_frame()
        interpolated_timestamps = imputed[imputed.is_interpolated].index
        return ser_Y_test.to_frame()\
            .join(ser_predictions.to_frame(), lsuffix='_true', rsuffix='_predicted')\
            .loc[lambda df: df.index.difference(interpolated_timestamps)]\
            .assign(error=lambda df: df.speed_km_hr_true - df.speed_km_hr_predicted)

In [7]:
def one_place_improvement_over_baseline_lr(place_id, dropout_factor):
    """Train the LSTM for one place and compare it with the linear-regression baseline.

    All hyper-parameters other than the dropout factor come from the notebook's
    configuration constants.

    Args:
        place_id: identifier of the place to model.
        dropout_factor: rate of the Dropout layer placed after the LSTM.

    Returns:
        One-row DataFrame indexed by ``place_id`` holding, per metric, the LSTM's
        test-set value minus the baseline's. For error metrics (mae, rmse, ...) a
        negative entry means the LSTM is better; for ``corr`` and ``r2`` a
        positive entry does.
    """
    learner = LearnerForMeanSpeedOfOneSegment(
        dropout_factor=dropout_factor,
        threshold_high_outlier=THRESHOLD_HIGH_OUTLIER,
        loss=LOSS,
        place_id=place_id,
        mini_batch_size=MINI_BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        validation_split=VALIDATION_SPLIT,
        num_lags=NUM_LAGS,
        lstm_state_size=LSTM_STATE_SIZE,
        num_outs=NUM_OUTS_MEAN_SPEED_REGRESSION,
        split_date=SPLIT_DATE)
    # create_and_train_model() finishes by loading the best checkpointed weights into
    # the model, so a separate load_best_model_from_disk() call (which would build a
    # second, identical Keras model) is not needed.
    learner.create_and_train_model()
    return learner.predict() - learner.baseline_lr()

In [8]:
# one_place_improvement_over_baseline_lr(EXAMPLE_PLACE_ID, 0.2)

In [None]:
def multiple_places(place_group, place_group_name,
                    dropout_factors=(0.25, 0.5, 0.75), evaluate_place=None, output_dir='.'):
    """Evaluate one representative place per group for several dropout factors.

    For every dropout factor, the middle element of each list in ``place_group``
    is evaluated and the per-place results are concatenated and written to
    ``<place_group_name>_dropout_<factor>_improvement_over_baseline_lr_v2.csv``.

    Args:
        place_group: iterable of non-empty sequences of place ids.
        place_group_name: label used as the prefix of the CSV file names.
        dropout_factors: dropout factors to evaluate, one CSV file per factor.
        evaluate_place: callable ``(place_id, dropout_factor) -> DataFrame``;
            defaults to ``one_place_improvement_over_baseline_lr``.
        output_dir: directory the CSV files are written to.

    Raises:
        ValueError: if ``place_group`` contains no groups.
    """
    if evaluate_place is None:
        evaluate_place = one_place_improvement_over_baseline_lr
    places = [lst[len(lst) // 2] for lst in place_group]
    if not places:
        raise ValueError('place group %r is empty: nothing to evaluate' % (place_group_name,))
    # Loop over ALL factors; returning from inside the loop would stop after the first one.
    for dropout_factor in dropout_factors:
        improvements = pd.concat([evaluate_place(place_id, dropout_factor) for place_id in places])
        # Argument order must match the format string: name first (%s), factor second (%.2f).
        file_name = '%s_dropout_%.2f_improvement_over_baseline_lr_v2.csv' % (place_group_name, dropout_factor)
        improvements.to_csv(os.path.join(output_dir, file_name))

In [None]:
# Long-running: trains one LSTM per selected place of the "middle of roads" groups.
multiple_places(place_groups_middle_of_roads, 'place_groups_middle_of_roads')

Train on 34780 samples, validate on 8696 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.77488, saving model to ./hdf5/only_speed_lstm_with_dropout_0.25_ChIJ7feGfFdSUkYRmmsyva5Yh7g.best.hdf5
 - 2s - loss: 0.9475 - val_loss: 0.7749
Epoch 2/100
Epoch 00002: val_loss improved from 0.77488 to 0.66157, saving model to ./hdf5/only_speed_lstm_with_dropout_0.25_ChIJ7feGfFdSUkYRmmsyva5Yh7g.best.hdf5
 - 1s - loss: 0.8445 - val_loss: 0.6616
Epoch 3/100
Epoch 00003: val_loss improved from 0.66157 to 0.55578, saving model to ./hdf5/only_speed_lstm_with_dropout_0.25_ChIJ7feGfFdSUkYRmmsyva5Yh7g.best.hdf5
 - 1s - loss: 0.7181 - val_loss: 0.5558
Epoch 4/100
Epoch 00004: val_loss improved from 0.55578 to 0.47879, saving model to ./hdf5/only_speed_lstm_with_dropout_0.25_ChIJ7feGfFdSUkYRmmsyva5Yh7g.best.hdf5
 - 1s - loss: 0.6205 - val_loss: 0.4788
Epoch 5/100
Epoch 00005: val_loss improved from 0.47879 to 0.44697, saving model to ./hdf5/only_speed_lstm_with_dropout_0.25_ChIJ7feGfFdSUkYRmm

In [None]:
# Long-running: trains one LSTM per selected place of the "junctions" groups.
multiple_places(place_groups_junctions, 'place_groups_junctions')