# Time Series Analysis with the Prophet Model

In [None]:
import pandas as pd
import numpy as np
from prophet import Prophet

import seaborn as sns
import matplotlib.pyplot as plt

from prophet.plot import plot_plotly, plot_components_plotly

from sklearn.metrics import mean_squared_error

import plotly.io as pio
# set the default plotly renderer to "notebook"
pio.renderers.default = "notebook"

# apply the 'Solarize_Light2' style sheet to matplotlib figures
plt.style.use('Solarize_Light2')

In [None]:
# read the training data; the first CSV column is used as the index
train_csv_path = '../data/cleaned_sensors_dwd_train.csv'
df = pd.read_csv(train_csv_path, index_col=0)

# convert the timestamp column to datetime values
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

In [None]:
# display the loaded training data
df

In [None]:
# Build the Prophet input frame for the city Bremen: keep timestamp, target
# and location id, rename the columns to the names Prophet expects
# ('ds', 'y') and drop rows with missing values.
# The non-inplace method chain returns a fresh DataFrame instead of mutating
# a selection of `df` in place, which avoids pandas' SettingWithCopyWarning.
df_prophet = (
    df.query('city == "Bremen"')[['timestamp', 'PM2p5', 'location_id']]
    .rename(columns={'timestamp': 'ds', 'PM2p5': 'y'})
    .dropna()
)

# create list of location ids for later loop
location_list_bremen = df_prophet['location_id'].unique().tolist()

# check number of locations
df_prophet['location_id'].nunique()

## Prophet model for all locations in Bremen (without regressors)

In [None]:
# create dictionary to later save models (location_id -> fitted model)
prophet_models_per_location = {}

# only take the first 5 locations for test; slicing the list up front avoids
# looping over every remaining location just to skip it
for n, location_id in enumerate(location_list_bremen[:5], start=1):
    print('-----'*10)
    print(f'no: {n}, location_id: {location_id}')

    # limit dataframe to the specific location_id and remove the id column;
    # the non-inplace drop returns a new frame, so no filtered slice of
    # df_prophet is mutated in place (avoids SettingWithCopyWarning)
    df_location = df_prophet.loc[df_prophet['location_id'] == location_id].drop(columns='location_id')

    # init Prophet model and fit it to train data for one location
    model = Prophet(yearly_seasonality=True)
    model.fit(df_location)

    # save model in dictionary
    prophet_models_per_location[location_id] = model

In [None]:
# collect the forecast of every fitted model, keyed by location id
prophet_forecasts_per_location = {}
separator = '-----' * 10
for location_id in location_list_bremen[:5]:
    print(separator)
    print(f'location_id: {location_id}')

    # fetch the model stored for this location
    model = prophet_models_per_location[location_id]

    # build the future dataframe (960 periods at frequency 'H') and predict
    future = model.make_future_dataframe(periods=960, freq='H')
    forecast = model.predict(future)

    # store the forecast under its location id
    prophet_forecasts_per_location[location_id] = forecast

In [None]:
# draw the forecast plot of each of the first 5 locations
for location_id in location_list_bremen[:5]:
    model = prophet_models_per_location[location_id]
    location_forecast = prophet_forecasts_per_location[location_id]
    fig1 = model.plot(location_forecast)


In [None]:
# draw the component plots of each of the first 5 locations
for location_id in location_list_bremen[:5]:
    model = prophet_models_per_location[location_id]
    location_forecast = prophet_forecasts_per_location[location_id]
    fig2 = model.plot_components(location_forecast)

In [None]:
# interactive plot for location_id 125 (this does not work in a loop)
# Look up both the model and its forecast by id: the bare `forecast` variable
# still holds the result of the last iteration of the prediction loop, which
# is not necessarily the forecast of location 125.
model = prophet_models_per_location[125]
plot_plotly(model, prophet_forecasts_per_location[125])

## Prophet with regressors

In [None]:
# preparing dataframe with regressors for the city Bremen;
# first regressor is temperature_dwd (renamed to 'temp').
# Non-inplace calls are used throughout so that no selection of `df` is
# mutated in place (avoids pandas' SettingWithCopyWarning).
df_prophet_reg = (
    df.query('city == "Bremen"')[['timestamp', 'PM2p5', 'location_id', 'temperature_dwd']]
    .rename(columns={'timestamp': 'ds', 'PM2p5': 'y', 'temperature_dwd': 'temp'})
)

# report the number of missing values per column, then drop those rows
print(df_prophet_reg.isna().sum())
df_prophet_reg = df_prophet_reg.dropna()

location_list_bremen_reg = df_prophet_reg['location_id'].unique().tolist()

# limit to one location 125
location_id = 125

df_prophet_reg = df_prophet_reg.loc[df_prophet_reg['location_id'] == location_id].drop(columns='location_id')

In [None]:
# display the training frame for the regressor model (columns ds, y, temp)
df_prophet_reg

In [None]:
# Prophet model with yearly seasonality and 'temp' registered as an extra
# regressor, fitted on the single-location training frame
model_reg = Prophet(yearly_seasonality=True).add_regressor('temp')
model_reg.fit(df_prophet_reg)

In [None]:
# print the distinct latitude and longitude values recorded for location 125
rows_location_125 = df[df['location_id'] == 125]
for coordinate in ('lat', 'lon'):
    print(rows_location_125[coordinate].unique())
# observed values: lat 53.014, lon 8.886

In [None]:
# load test data and parse the timestamp column
df_test_data = pd.read_csv('../data/processed_sensor_dwd_test.csv', index_col=0)
df_test_data['timestamp'] = pd.to_datetime(df_test_data['timestamp'])

# limit to the location of the train data via its coordinates (the test data
# is filtered by lat/lon here instead of by location_id)
# location_id 125 --> lat 53.014, lon 8.886
# Compare with a tight absolute tolerance instead of `==`: exact equality on
# floats parsed from CSV can miss rows because of round-off in the last digit.
is_location_125 = (
    np.isclose(df_test_data['lat'], 53.014, rtol=0, atol=1e-6)
    & np.isclose(df_test_data['lon'], 8.886, rtol=0, atol=1e-6)
)
df_test_data = df_test_data[is_location_125]

In [None]:
# display the test data after filtering by coordinates
df_test_data

In [None]:
def weather_temp(ds):
    """Look up the temperature recorded for a timestamp.

    The training frame ``df_prophet_reg`` is searched first; the test frame
    ``df_test_data`` is only consulted when the training frame has no row
    for the timestamp.

    Args:
        ds (datetime): timestamp to look up

    Returns:
        float: temperature of the first matching row, or ``np.nan`` when
            neither frame contains the timestamp
    """
    # training data takes precedence
    train_times = df_prophet_reg['ds']
    if ds in train_times.values:
        return df_prophet_reg.loc[train_times == ds, 'temp'].values[0]

    # fall back to the test data
    test_times = df_test_data['timestamp']
    if ds in test_times.values:
        return df_test_data.loc[test_times == ds, 'temperature_dwd'].values[0]

    return np.nan
   

In [None]:
# build the future dataframe (960 periods at frequency 'H') and look up the
# temperature of every timestamp in the train or test data
future_reg = model_reg.make_future_dataframe(periods=960, freq='H')
future_reg['temp'] = future_reg['ds'].map(weather_temp)

In [None]:
# display the future dataframe including the looked-up 'temp' column
future_reg

In [None]:
# remove rows with missing values (e.g. timestamps without a temperature)
future_reg = future_reg.dropna()

In [None]:
# make prediction with the regressor model on the future dataframe
# (future_reg carries the 'ds' timestamps and the 'temp' regressor values)
forecast_reg = model_reg.predict(future_reg)

In [None]:
# display the forecast of the regressor model
forecast_reg

In [None]:
# model without temperature regressor
# Fetch the model of location 125 explicitly instead of relying on whatever
# `model` was bound to by a previously executed cell, so that the model and
# the forecast always belong to the same location.
model = prophet_models_per_location[125]
fig1 = model.plot(prophet_forecasts_per_location[125])

In [None]:
# model with temperature regressor: plot the forecast of model_reg
fig1 = model_reg.plot(forecast_reg)

In [None]:
# without regressor
# Fetch the model that belongs to location_id explicitly instead of relying
# on whatever `model` was bound to by a previously executed cell, so that the
# model and the forecast always belong to the same location.
model = prophet_models_per_location[location_id]
fig2 = model.plot_components(prophet_forecasts_per_location[location_id])

In [None]:
# with temperature regressor: plot the forecast components of model_reg
fig2 = model_reg.plot_components(forecast_reg)

In [None]:
# without regressor
# Use the forecast stored for location 125: the bare `forecast` variable
# still holds the result of the last iteration of the prediction loop, which
# is not necessarily the forecast of location 125.
model = prophet_models_per_location[125]
plot_plotly(model, prophet_forecasts_per_location[125])

In [None]:
# with temperature regressor: interactive plotly plot of model_reg's forecast
plot_plotly(model_reg, forecast_reg) 

## Calculating RMSE

In [None]:
# Pair every test observation with the forecast for the same timestamp:
# keep timestamp and PM2p5 of the test data, left-join 'yhat' from the
# forecast, remove the duplicated time column 'ds' and drop rows that
# contain missing values.
observed = df_test_data[['timestamp', 'PM2p5']]
predicted = forecast_reg[['yhat', 'ds']]
df_rmse = (
    observed
    .merge(predicted, how='left', left_on='timestamp', right_on='ds')
    .drop(columns='ds')
    .dropna()
)

df_rmse

In [None]:
# calculate rmse for specific time span
# `.loc[0:48]` is a label-based slice: it selects the rows of df_rmse whose
# index label lies between 0 and 48 (both ends inclusive).
rmse_window = df_rmse.loc[0:48]

# Take the square root of the MSE explicitly instead of passing
# `squared=False`, which is deprecated in recent scikit-learn releases.
rmse = np.sqrt(
    mean_squared_error(
        np.asarray(rmse_window[['PM2p5']]),
        np.asarray(rmse_window[['yhat']]),
    )
)

rmse