# Time Series Analysis with the Prophet Model

In [None]:
import pandas as pd
import numpy as np
from prophet import Prophet

import seaborn as sns
import matplotlib.pyplot as plt

from prophet.plot import plot_plotly, plot_components_plotly

from sklearn.metrics import mean_squared_error

from prophet.diagnostics import performance_metrics

import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default = "notebook"

# Apply the seaborn theme with the "darkgrid" axes style. The style is passed
# to set_theme() directly: a bare sns.axes_style("darkgrid") call only returns
# a parameter dict and changes nothing unless it is used as a context manager.
sns.set_theme(style="darkgrid")

In [None]:
# Load the training data and parse the timestamp column as datetimes
df = pd.read_csv('../data/cleaned_sensors_dwd_train.csv', index_col=0).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

In [None]:
# Load the test data and parse the timestamp column as datetimes
df_test_data = pd.read_csv('../data/cleaned_sensors_dwd_test.csv', index_col=0).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

In [None]:
# show the training data (rendered as the cell output)
df

In [None]:
# Build the Prophet input for Bremen in one chain: keep timestamp, target and
# sensor id, rename to the 'ds' / 'y' column names Prophet expects, and discard
# rows with missing values (Prophet cannot handle NaNs in the dataframe).
df_prophet = (
    df.query('city == "Bremen"')[['timestamp', 'PM2p5', 'location_id']]
    .rename(columns={'timestamp': 'ds', 'PM2p5': 'y'})
    .dropna()
)

# location ids (in order of first appearance) for the per-location loops
location_list_bremen = pd.unique(df_prophet['location_id']).tolist()

# check number of locations
df_prophet['location_id'].nunique()

## Prophet model for all locations in Bremen (without regressors)

In [None]:
# upper bound for the location slices in the loops below; set to the number
# of Bremen locations, i.e. every location gets a model
NUMBER_OF_MODELS = len(location_list_bremen)

In [None]:
# Fit one Prophet model (without regressors) per location and keep the fitted
# models in a dictionary keyed by location_id.
prophet_models_per_location = {}

# NUMBER_OF_MODELS caps how many locations are fitted
for n, location_id in enumerate(location_list_bremen[:NUMBER_OF_MODELS], start=1):
    print('-----'*10)
    print(f'no: {n}, location_id: {location_id}')

    # Restrict the data to this location. drop() returns a new frame, so the
    # filtered slice is never modified in place (an in-place drop on a slice
    # triggers pandas' SettingWithCopyWarning).
    df_location = df_prophet[df_prophet['location_id'] == location_id].drop(columns='location_id')

    # init Prophet model and fit it to train data for one location
    model = Prophet(yearly_seasonality=True)
    model.fit(df_location)

    # save model in dictionary
    prophet_models_per_location[location_id] = model

In [None]:
# Forecast with every fitted model and collect the results per location
prophet_forecasts_per_location = {}
locations_to_forecast = location_list_bremen[:NUMBER_OF_MODELS]

for location_id in locations_to_forecast:
    print('-----'*10, f'location_id: {location_id}', sep='\n')

    # fitted model of the current location
    model = prophet_models_per_location[location_id]

    # extend the training timestamps by 1416 hourly steps
    # (1416 = all Jan + Feb 2022) and predict over the whole range
    future = model.make_future_dataframe(periods=1416, freq='H')
    forecast = model.predict(future)

    prophet_forecasts_per_location[location_id] = forecast

In [None]:
# deprecated
def plot_model(model, forecast, file):
    """Plot a forecast via ``model.plot``, save the figure and show it.

    Args:
        model: object whose ``plot(forecast, ...)`` returns a matplotlib
            figure (in this notebook a fitted Prophet model)
        forecast: forecast handed through to ``model.plot``
        file: target passed to ``Figure.savefig``
    """
    label_size, tick_size = 34, 24

    figure = model.plot(forecast, xlabel='Date', ylabel='Value', figsize=(20, 12))
    axes = figure.gca()

    # enlarge axis labels and tick labels, clip the y axis to 0..150
    axes.set_xlabel("Date", size=label_size)
    axes.set_ylabel("PM 2.5 in µg/m³", size=label_size)
    for axis_name in ("x", "y"):
        axes.tick_params(axis=axis_name, labelsize=tick_size)
    axes.set_ylim(0, 150)

    figure.savefig(file, bbox_inches='tight', facecolor="#EEEEEE")

    plt.show()

In [None]:
def plot_model_with_future(model, forecast, future, file):
    """Plot a forecast with observed values on top, save the figure and show it.

    Args:
        model: object whose ``plot(forecast, ...)`` returns a matplotlib
            figure (in this notebook a fitted Prophet model)
        forecast: forecast handed through to ``model.plot``
        future: data with 'timestamp' and 'PM2p5' columns, drawn as green
            scatter points on the forecast axes
        file: target passed to ``Figure.savefig``
    """
    label_size, tick_size = 34, 24

    figure = model.plot(forecast, xlabel='Date', ylabel='Value', figsize=(20, 12))
    axes = figure.gca()

    # overlay the observed values on the forecast plot
    sns.scatterplot(data=future, x='timestamp', y='PM2p5', ax=axes, color='green')

    # enlarge axis labels and tick labels, clip the y axis to 0..150
    axes.set_xlabel("Date", size=label_size)
    axes.set_ylabel("PM 2.5 in µg/m³", size=label_size)
    for axis_name in ("x", "y"):
        axes.tick_params(axis=axis_name, labelsize=tick_size)
    axes.set_ylim(0, 150)

    figure.savefig(file, bbox_inches='tight', facecolor="#EEEEEE")

    plt.show()

In [None]:
# Plot every forecast together with the test observations of its location
for location_id in location_list_bremen[:NUMBER_OF_MODELS]:
    model = prophet_models_per_location[location_id]
    # test observations of the current location only
    df_test_location = df_test_data.loc[df_test_data['location_id'] == location_id]
    plot_model_with_future(
        model,
        prophet_forecasts_per_location[location_id],
        df_test_location,
        f'../images/prophet_location_id_{location_id}.png',
    )

In [None]:
# Plot and save the forecast components of every model
for location_id in location_list_bremen[:NUMBER_OF_MODELS]:
    model = prophet_models_per_location[location_id]
    forecast_for_location = prophet_forecasts_per_location[location_id]
    fig = model.plot_components(forecast_for_location, figsize=(20, 12))
    fig.savefig(
        f'../images/prophet_components_location_id_{location_id}.png',
        bbox_inches='tight',
        facecolor="#EEEEEE",
    )

In [None]:
# Interactive plot for location_id 125 (this does not work in a loop).
# The forecast is looked up by the same id as the model: the variable
# `forecast` left over from the prediction loop belongs to the last location
# processed there, which is not necessarily location 125.
model = prophet_models_per_location[125]
plot_plotly(model, prophet_forecasts_per_location[125])

# Prophet with regressors

In [None]:
#df.head()

In [None]:
# Build the Prophet input with regressors for Bremen: timestamp, target,
# sensor id and the weather columns, renamed to 'ds' / 'y' and short
# regressor names
df_prophet_reg = df.query('city == "Bremen"')[
    ['timestamp', 'PM2p5', 'location_id', 'humidity_dwd', 'temperature_dwd', 'pressure_dwd', 'wind_speed', 'precip']
].rename(
    columns={
        'timestamp': 'ds',
        'PM2p5': 'y',
        'humidity_dwd': 'humi',
        'temperature_dwd': 'temp',
        'pressure_dwd': 'press',
        'wind_speed': 'windsp',
        'precip': 'precip',
    }
)

# report the missing values per column, then drop the affected rows
print(df_prophet_reg.isna().sum())
df_prophet_reg = df_prophet_reg.dropna()

# location ids (in order of first appearance) that still have data
location_list_bremen_reg = pd.unique(df_prophet_reg['location_id']).tolist()


In [None]:
# show the prepared regressor dataframe (rendered as the cell output)
df_prophet_reg

In [None]:
# upper bound for the location slices in the regressor loops below; set to
# the number of locations left after dropping rows with missing values
NUMBER_OF_MODELS_REG = len(location_list_bremen_reg)

In [None]:
# show the regressor dataframe again (rendered as the cell output)
df_prophet_reg

In [None]:
# Fit one Prophet model with weather regressors per location and keep the
# fitted models in a dictionary keyed by location_id.
prophet_models_per_location_reg = {}

# regressor columns of df_prophet_reg, registered in this order
regressor_names = ['temp', 'humi', 'press', 'windsp', 'precip']

# NUMBER_OF_MODELS_REG caps how many locations are fitted
for n, location_id in enumerate(location_list_bremen_reg[:NUMBER_OF_MODELS_REG], start=1):
    print('-----'*10)
    print(f'no: {n}, location_id: {location_id}')

    # Restrict the data to this location. drop() returns a new frame, so the
    # filtered slice is never modified in place (an in-place drop on a slice
    # triggers pandas' SettingWithCopyWarning).
    df_location_reg = df_prophet_reg[df_prophet_reg['location_id'] == location_id].drop(columns='location_id')

    # init Prophet model, register every regressor and fit to the train data
    model_reg = Prophet(yearly_seasonality=True)
    for regressor_name in regressor_names:
        model_reg.add_regressor(regressor_name, standardize=True)
    model_reg.fit(df_location_reg)

    # save model in dictionary
    prophet_models_per_location_reg[location_id] = model_reg

In [None]:
def create_regressor_column(ds, train_col, test_col):
    """Look up one regressor value for a timestamp, preferring the train data.

    The lookup reads the notebook-level frames ``df_prophet_reg`` (train) and
    ``df_test_data`` (test). Rows are matched on the timestamp only; they are
    not filtered by location.

    Args:
        ds (datetime): timestamp to look up
        train_col (string): column name of regressor in train data
        test_col (string): column name of regressor in test data

    Returns:
        float: value of the first train row matching ``ds``; otherwise of the
            first matching test row; NaN if the timestamp occurs in neither
    """
    # train data takes precedence over test data
    if ds in df_prophet_reg['ds'].values:
        train_rows = df_prophet_reg[df_prophet_reg['ds'] == ds]
        return train_rows[train_col].values[0]

    if ds in df_test_data['timestamp'].values:
        test_rows = df_test_data[df_test_data['timestamp'] == ds]
        return test_rows[test_col].values[0]

    # timestamp unknown to both data sets
    return np.nan
   

In [None]:
# Forecast with every regressor model and collect the results per location.
#
# The regressor values for the prediction frame are joined in with a single
# merge per location instead of a row-wise lookup per timestamp and column.
# The lookup table prefers the rows of the current location (train, then
# test) and only falls back to the other Bremen train rows for timestamps the
# location itself has no data for. The unfiltered test data is deliberately
# not used, so values of unrelated locations cannot leak into the forecast.

# Prophet regressor name -> column name in the test data
regressor_test_columns = {
    'temp': 'temperature_dwd',
    'humi': 'humidity_dwd',
    'press': 'pressure_dwd',
    'windsp': 'wind_speed',
    'precip': 'precip',
}
regressor_names = list(regressor_test_columns)
test_to_regressor_name = {test_col: name for name, test_col in regressor_test_columns.items()}

# city-wide train regressors, used only as a fallback (loop invariant)
city_regressors = df_prophet_reg[['ds'] + regressor_names]

prophet_forecasts_per_location_reg = {}
for location_id in location_list_bremen_reg[:NUMBER_OF_MODELS_REG]:
    print('-----'*10)
    print(f'location_id: {location_id}')

    # load model for current location
    model_reg = prophet_models_per_location_reg[location_id]

    # limit test data to current location
    df_test_location = df_test_data[df_test_data['location_id'] == location_id]

    # regressor values of this location from train and test data, brought to
    # the same column names; incomplete test rows are dropped so they cannot
    # shadow a complete fallback row
    train_regressors = df_prophet_reg.loc[
        df_prophet_reg['location_id'] == location_id, ['ds'] + regressor_names
    ]
    test_regressors = (
        df_test_location[['timestamp'] + list(regressor_test_columns.values())]
        .rename(columns={'timestamp': 'ds', **test_to_regressor_name})
        .dropna()
    )

    # one row per timestamp; keep='first' implements the precedence
    # own train rows > own test rows > other Bremen train rows
    regressors = pd.concat(
        [train_regressors, test_regressors, city_regressors]
    ).drop_duplicates(subset='ds', keep='first')

    # create dataframe for future predictions and predict
    # 1416 = all Jan + Feb 2022
    future_reg = model_reg.make_future_dataframe(periods=1416, freq='H')
    future_reg = future_reg.merge(regressors, on='ds', how='left')

    # Prophet needs a value for every regressor: drop timestamps without data
    future_reg = future_reg.dropna()

    forecast_reg = model_reg.predict(future_reg)

    # save predictions in dataframe
    prophet_forecasts_per_location_reg[location_id] = forecast_reg

In [None]:
# Plot every regressor forecast together with the test observations of its location
for location_id in location_list_bremen_reg[:NUMBER_OF_MODELS_REG]:
    model_reg = prophet_models_per_location_reg[location_id]
    # test observations of the current location only
    df_test_location = df_test_data.loc[df_test_data['location_id'] == location_id]
    plot_model_with_future(
        model_reg,
        prophet_forecasts_per_location_reg[location_id],
        df_test_location,
        f'../images/prophet_reg_location_id_{location_id}.png',
    )

In [None]:
# Plot and save the forecast components of every regressor model
for location_id in location_list_bremen_reg[:NUMBER_OF_MODELS_REG]:
    model_reg = prophet_models_per_location_reg[location_id]
    forecast_reg_for_location = prophet_forecasts_per_location_reg[location_id]
    fig2 = model_reg.plot_components(forecast_reg_for_location, figsize=(20, 12))
    fig2.savefig(
        f'../images/prophet_components_reg_location_id_{location_id}.png',
        bbox_inches='tight',
        facecolor="#EEEEEE",
    )

In [None]:
# Interactive plot of the regressor model for location_id 125
# (this does not work in a loop)
model_reg = prophet_models_per_location_reg[125]
forecast_reg_125 = prophet_forecasts_per_location_reg[125]
plot_plotly(model_reg, forecast_reg_125)


# Calculating RMSE

In [None]:
# show the test data (rendered as the cell output)
df_test_data

In [None]:
# upper bound of the row slice used in the RMSE cells below; 1416 matches the
# forecast horizon used above (all Jan + Feb 2022 in hours)
number_of_hours = 1416

In [None]:
# RMSE of the models without regressors, per location, on the test data
rmse_dict = {}

# the plain models were fitted for location_list_bremen[:NUMBER_OF_MODELS]
for location_id in location_list_bremen[:NUMBER_OF_MODELS]:
    # limit columns of test data for calculating rmse
    df_test_location = df_test_data[df_test_data['location_id'] == location_id]
    df_rmse = df_test_location[['timestamp', 'PM2p5']]

    # merge forecast to test data (merge returns a fresh 0..n-1 index)
    df_rmse = df_rmse.merge(prophet_forecasts_per_location[location_id][['yhat', 'ds']], how='left', left_on='timestamp', right_on='ds')
    df_rmse = df_rmse.drop(columns='ds')

    # Keep the first number_of_hours rows, then discard rows without an
    # observation or a prediction. Positional slicing is used because a label
    # slice .loc[0:number_of_hours] is inclusive and selects one row too many.
    df_rmse = df_rmse.iloc[:number_of_hours].dropna()

    # nothing to compare for this location: record NaN instead of failing
    if df_rmse.empty:
        rmse_dict[location_id] = np.nan
        continue

    # Root of the MSE; the squared=False shortcut of mean_squared_error is
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(df_rmse['PM2p5'].to_numpy(), df_rmse['yhat'].to_numpy()))
    rmse_dict[location_id] = rmse

In [None]:
# RMSE of the models with regressors, per location, on the test data
rmse_dict_reg = {}

# The regressor models exist for location_list_bremen_reg only; iterating the
# list of the plain models would raise a KeyError (or skip locations) as soon
# as the two lists differ.
for location_id in location_list_bremen_reg[:NUMBER_OF_MODELS_REG]:
    # limit columns of test data for calculating rmse
    df_test_location = df_test_data[df_test_data['location_id'] == location_id]
    df_rmse = df_test_location[['timestamp', 'PM2p5']]

    # merge forecast to test data (merge returns a fresh 0..n-1 index)
    df_rmse = df_rmse.merge(prophet_forecasts_per_location_reg[location_id][['yhat', 'ds']], how='left', left_on='timestamp', right_on='ds')
    df_rmse = df_rmse.drop(columns='ds')

    # Keep the first number_of_hours rows, then discard rows without an
    # observation or a prediction. Positional slicing is used because a label
    # slice .loc[0:number_of_hours] is inclusive and selects one row too many.
    df_rmse = df_rmse.iloc[:number_of_hours].dropna()

    # nothing to compare for this location: record NaN instead of failing
    if df_rmse.empty:
        rmse_dict_reg[location_id] = np.nan
        continue

    # Root of the MSE; the squared=False shortcut of mean_squared_error is
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(df_rmse['PM2p5'].to_numpy(), df_rmse['yhat'].to_numpy()))
    rmse_dict_reg[location_id] = rmse


In [None]:
# show the RMSE per location of the regressor models (rendered as the cell output)
rmse_dict_reg

In [None]:
# Turn both RMSE dictionaries into two-column frames: the dictionary keys
# become the 'location_id' column, the values the RMSE column
df_rmse = (
    pd.DataFrame.from_dict(data=rmse_dict, orient='index')
    .reset_index()
    .set_axis(['location_id', 'rmse'], axis=1)
)
df_rmse_reg = (
    pd.DataFrame.from_dict(data=rmse_dict_reg, orient='index')
    .reset_index()
    .set_axis(['location_id', 'rmse_reg'], axis=1)
)


In [None]:
# Combine both RMSE tables; the outer join keeps locations present in only one of them
df_rmse_all = pd.merge(df_rmse, df_rmse_reg, on='location_id', how='outer')

In [None]:
# save the combined RMSE table as CSV
df_rmse_all.to_csv('../models/prophet_rmse_bremen.csv')