In [22]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Input
import plotly.graph_objects as go
import pandas as pd

from statsmodels.tsa.stattools import adfuller
from scipy.stats import jarque_bera
from pmdarima.arima import auto_arima

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


###############################################
########### Forecast Functions ################
###############################################


def test_for_stationarity(ts):
    """
    INSERT DOCSTRING
    """
    result = adfuller(ts)

    p_value = result[1]

    if p_value < 0.05:
        print("The time series is stationary.")
        return True
    else:
        print("The time series is not stationary.")
        return False

def test_for_linearity(ts):
    """
    INSERT DOCSTRING
    """
    _, p_value = jarque_bera(ts)

    if p_value < 0.05:
        print("The time series exhibits non-randomness and potential linearity.")
        return True
    else:
        print("The time series exhibits randomness and no significant linearity.")
        return False
    
    
def perform_arima_forecast(ts, train_size_pct, needs_stationarity=True, needs_linearity=True):
    """
    INSERT BETTER DOCSTRING
    
    Takes in a time series, a training percent split, a boolean flag to indicate the need for 
    a stationarity test and one for a linearity test.
    
    If more than one diff is required, print this and move on.
    If non-linear, print this and move on. Logic to address multiple diffs and changes to address linearity are needed.
    
    Pass the time series data into an auto arima model.
    Return components that will input into other portions of the Auto ARIMA workflow.
    
    """
    diff_counter = 0
    linearity_counter = 0
    non_stationary = False
    
    while not test_for_stationarity(ts):
        if diff_counter > 0:
            print("Time series is not stationary, suggest removing stationarity flag'")
            non_stationary = True
            break
        ts = ts.diff().dropna()
        diff_counter += 1
        
    
    if not test_for_linearity(ts):
        print("Time series is not linear, suggest removing linearity flag")
        
    
    train_size = int(len(ts) * train_size_pct)
    df_train_data = ts[:train_size]
    df_test_data = ts[train_size:]
    
    arima_model = auto_arima(df_train_data, start_p=0, d=1, start_q=0, max_p=5, max_d=5, max_q=5,
                            start_P=0, D=1, start_Q=0, max_P=5, max_D=5, max_Q=5, m=12,
                            seasonal=True, error_action='warn', trace=True, suppress_warnings=True,
                            stepwise=True, random_state=42, n_fits=50)
    
    return df_train_data, df_test_data, train_size, (diff_counter > 0), non_stationary, arima_model

def make_arima_predictions(arima_model, df_test_data, steps=12):
    """
    INSERT BETTER DOCSTRING
    
    Makes predictions using the tuned ARIMA model and the test_data.
    Forecast from the end of the training data: 
        Steps to cover the length of the test data + the target number of steps after the original 
        time series ends.
    
    """
    df_prediction = pd.DataFrame(arima_model.predict(n_periods=len(df_test_data)+steps))
    df_prediction.columns = ['values']
    df_prediction.index = pd.to_datetime(df_prediction.index)
    return df_prediction

def convert_diff_to_original(df_orig_ts, df_diffed_ts):
    """
    INSERT BETTER DOCSTRING
    
    If a diff was applied to the original time series.
    
    Start with the very last value of original time series i.e. the training set
    Create a series containing a cumulative sum of the diffed values.
    Add the diffed values to the last value and broadcast forward.
    Using the predictions for the last step, this converts values to the same scale as the original time series.
    
    """
    last_observed_value = df_orig_ts.iloc[-1,]
    if isinstance(last_observed_value, pd.core.series.Series):
        last_observed_value = last_observed_value[0]
    else:
        pass
    # Calculate the cumulative sum of the differenced forecast
    cumulative_sum = df_diffed_ts.cumsum()

    
    # Add the cumulative sum to the last observed value
    undifferenced_forecast = cumulative_sum['values'] + last_observed_value
    
    # Create a new DataFrame with the undifferenced values and the appropriate index
    df_full_forecast = pd.DataFrame(undifferenced_forecast, index=undifferenced_forecast.index)
    
    # full_forecast now contains the forecasted values in the original scale of the 'arima_ts' series
    df_full_forecast.index = pd.to_datetime(df_full_forecast.index)
    
    return df_full_forecast

def build_final_forecast(df_train_data, df_predictions):
    """
    INSERT BETTER DOCSTRING
    
    Take the train and predictions and join them.
    Take the non-null values and add column names
    
    """
    last_date_arima_ts = df_train_data.index[-1]
    df_arima_forecast = df_predictions[df_predictions.index > last_date_arima_ts]
    df_arima_ts = pd.concat([df_train_data, df_arima_forecast])
    df_arima_ts.columns = ['All house types', 'values']
    df_arima_ts['value_type'] = df_arima_ts.index.map(lambda x: 'Forecast' if x > last_date_arima_ts else 'Original')
    df_arima_ts['value'] = df_arima_ts['All house types'].fillna(df_arima_ts['values'])
    df_arima_ts = df_arima_ts.drop(['All house types', 'values'], axis=1)
    return df_arima_ts


def get_accuracy_metrics(df_test_data, df_predictions, non_stationary):
    """
    INSERT BETTER DOCSTRING
    
    Calculate performance metrics
    
    """
    # Calculate MAE and RMSE
    arima_mae = mean_absolute_error(df_test_data, df_predictions)
    arima_rmse = np.sqrt(mean_squared_error(df_test_data, df_predictions))
    arima_r2 = r2_score(df_test_data, df_predictions)
    print(f'MAE: {arima_mae}')
    print(f'RMSE: {arima_rmse}')
    print(f'R2: {arima_r2}')
    print(f'Non-stationary: {non_stationary}')
    return {'mae':arima_mae, 'rmse':arima_rmse, 'r2':arima_r2, 'non_stat': non_stationary}


def begin_arima_workflow(ts, train_size_pct, steps=12):
    """
    INSERT BETTER DOCSTRING
    
    Run the whole workflow.
    
    """
    df_train_data, df_test_data, train_size, diff_flag, non_stationary, arima_model = perform_arima_forecast(ts, train_size_pct)
    df_auto_arima_predictions = make_arima_predictions(arima_model, df_test_data, steps=steps)
    if diff_flag:
        df_auto_arima_ts_temp = convert_diff_to_original(ts[:train_size], df_auto_arima_predictions)
        df_auto_arima_ts = build_final_forecast(ts[:train_size], df_auto_arima_ts_temp).dropna()
    else:
        df_auto_arima_ts = build_final_forecast(ts[:train_size], df_auto_arima_predictions).dropna()
    dict_accuracy_metrics = get_accuracy_metrics(df_test_data, df_auto_arima_predictions[:len(df_test_data)], non_stationary)
    return df_auto_arima_ts, dict_accuracy_metrics


#################################################################################################################
########### Data Prep - ENSURE THAT THE CSVs ARE REFERENCD - THEY ARE INCLUDED IN THE SUBMISSION ################
#################################################################################################################

county_data = pd.read_csv("./datasets/dashboard/build_data.csv", index_col=0) ## CHECK THIS FILE
county_data['Quarter'] = pd.to_datetime(county_data['Quarter'])
county_data.set_index('Quarter', inplace=True, drop=True)

country_data = pd.read_csv("./datasets/dashboard/pop_data.csv", index_col=0) ## CHECK THIS FILE
country_data['yr_qtr'] = pd.to_datetime(country_data['yr_qtr'])
country_data.set_index('yr_qtr', inplace=True, drop=True)
country_data = country_data[['ie_pop']]
all_data = pd.concat([country_data, county_data], axis=1).dropna()

###############################################
########### Dashboard Layout ##################
###############################################

app = dash.Dash(__name__)
app.title = "Dashboard for Forecasting Expected New Builds in Ireland"

app.layout = html.Div([
    html.Div([
        html.H1("Dashboard for Forecasting Expected New Builds in Ireland", style={'font-size': '24pt'}),
    ], style={'text-align': 'center', 'margin-bottom': '20px'}),
    
    html.Div([
        html.Div([
            html.Div([
                html.Label('County'),
                dcc.Dropdown(
                    id='county-dropdown',
                    options=[{'label': col, 'value': col} for col in all_data.columns],
                ),
            ], className='dropdown-container', style={'margin-bottom': '20px', 'display': 'inline-block', 'width': '45%'}),
            
            html.Div([
                html.Label('% of dataset to use in Training'),
                dcc.Slider(
                    id='training-size-slider',
                    min=0.5,
                    max=1.0,
                    step=0.05,
                    value=0.9,
                    marks={i/10: f"{int(i*10)}%" for i in range(1, 11)}
                )
            ], className='slider-container', style={'margin-bottom': '20px', 'display': 'inline-block', 'width': '45%'}),
        ], className='top-container'),
        
        html.Div([
            html.Label('How many quarters into the future'),
            dcc.Slider(
                id='arima-slider',
                min=1,
                max=24,
                step=1,
                value=8,
                marks={i: str(i) for i in range(1, 24)}
            )
        ], style={'margin-bottom': '20px'}),  # Add style for margin-bottom
        
        html.Div([
            html.Div([
                html.Label('MAE'),
                html.Div(id='mae-value', className='text-area')
            ], className='accuracy-container', style={'flex': '1', 'padding-right': '10px'}),
            
            html.Div([
                html.Label('RMSE'),
                html.Div(id='rmse-value', className='text-area')
            ], className='accuracy-container', style={'flex': '1', 'padding-right': '10px'}),
            
            html.Div([
                html.Label('Non-Stationary'),
                html.Div(id='non-stat-value', className='text-area')
            ], className='accuracy-container', style={'flex': '1', 'padding-right': '10px'}),
            
            html.Div([
                html.Label('R2'),
                html.Div(id='r2-value', className='text-area')
            ], className='accuracy-container', style={'flex': '1'}),
        ], className='accuracy-row', style={'display': 'flex'}),
        
        dcc.Graph(id='forecast-plot'),
    ])
])


################################################
########### Callback & Update ##################
################################################

@app.callback(
    Output('forecast-plot', 'figure'),
    Output('county-dropdown', 'value'),
    Output('mae-value', 'children'),
    Output('rmse-value', 'children'),
    Output('non-stat-value', 'children'),
    Output('r2-value', 'children'),
    Input('arima-slider', 'value'),
    Input('training-size-slider', 'value'),
    Input('county-dropdown', 'value')
)
def update_forecast_plot(arima_value, train_size_value, county_value):
    """
    INSERT BETTER DOCSTRING
    """
    # Perform ARIMA-based forecasting and obtain the forecasted values based on the selected value
    ts = all_data[county_value]

    df_arima_ts, accuracy_stats = begin_arima_workflow(ts, train_size_value, steps=arima_value)    
    
    color_map = {'Forecast': 'orange', 'Original': 'green'}
    df_arima_ts['val_color'] = df_arima_ts['value_type'].str.strip().replace(color_map)
    
    # Create the line plot
    fig = go.Figure()
    
    # Add chart with specified colors
    for val_color, data in df_arima_ts.groupby('val_color'):
        fig.add_trace(go.Scatter(x=data.index, y=data['value'], name=val_color, line=dict(color=val_color)))
    
    # Rename the legend labels
    newnames = {'orange':'Forecast', 'green': 'Historical'}
    fig.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                           legendgroup=newnames[t.name])
                       )
    
    # Update the layout
    fig.update_layout(
        title=f"{county_value} - {arima_value} Quarters Forecast",
        xaxis_title='Quarter',
        yaxis_title='Value',
        xaxis=dict(showgrid=True, gridcolor='rgba(0, 0, 0, 0.1)'),  # Turn on gridlines for x-axis with lower alpha
        yaxis=dict(showgrid=True, gridcolor='rgba(0, 0, 0, 0.1)'),
        plot_bgcolor='white', 
        legend=dict(
            title='Value Type',
            itemsizing='constant',
            itemclick=False,
            itemdoubleclick=False,
            bgcolor='rgba(0,0,0,0)',
            orientation='h',
            yanchor='bottom',
            y=-0.5,
            xanchor='center',
            x=0.5
        ),
        legend_title_font=dict(size=14),
        legend_tracegroupgap=0,
        margin=dict(t=80)
    
    )

    # Extract accuracy values and round them to 3 decimal places
    arima_mae = round(accuracy_stats['mae'], 3)
    arima_rmse = round(accuracy_stats['rmse'], 3)
    arima_r2 = round(accuracy_stats['r2'], 3)
    arima_non_stat = "Yes" if accuracy_stats['non_stat'] else "No"

    return fig, county_value, arima_mae, arima_rmse, arima_non_stat, arima_r2


################################################
################# MAIN  ########################
################################################

if __name__ == '__main__':
    app.run_server(debug=False)


Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:8050
Press CTRL+C to quit
127.0.0.1 - - [26/May/2023 23:45:23] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/May/2023 23:45:23] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [26/May/2023 23:45:23] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [26/May/2023 23:45:23] "GET /_dash-component-suites/dash/dcc/async-dropdown.js HTTP/1.1" 304 -
[2023-05-26 23:45:23,968] ERROR in app: Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_

The time series is not stationary.
The time series is stationary.
The time series exhibits randomness and no significant linearity.
Time series is not linear, suggest removing linearity flag
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=253.060, Time=0.04 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=239.555, Time=0.03 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=inf, Time=0.05 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=238.143, Time=0.01 sec
 ARIMA(1,1,0)(0,1,1)[12]             : AIC=239.567, Time=0.03 sec
 ARIMA(1,1,0)(1,1,1)[12]             : AIC=inf, Time=0.10 sec
 ARIMA(2,1,0)(0,1,0)[12]             : AIC=234.191, Time=0.01 sec
 ARIMA(2,1,0)(1,1,0)[12]             : AIC=236.078, Time=0.03 sec
 ARIMA(2,1,0)(0,1,1)[12]             : AIC=236.090, Time=0.03 sec
 ARIMA(2,1,0)(1,1,1)[12]             : AIC=inf, Time=0.16 sec
 ARIMA(3,1,0)(0,1,0)[12]             : AIC=226.642, Time=0.01 sec
 ARIMA(3,1,0)(1,1,0)[12]             : AIC=228.225, 

127.0.0.1 - - [26/May/2023 23:45:26] "POST /_dash-update-component HTTP/1.1" 200 -


The time series is not stationary.
The time series is not stationary.
Time series is not stationary, suggest removing stationarity flag'
The time series exhibits non-randomness and potential linearity.
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=337.875, Time=0.02 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=333.462, Time=0.05 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=inf, Time=0.04 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=331.778, Time=0.01 sec
 ARIMA(1,1,0)(0,1,1)[12]             : AIC=333.490, Time=0.03 sec
 ARIMA(1,1,0)(1,1,1)[12]             : AIC=335.430, Time=0.07 sec
 ARIMA(2,1,0)(0,1,0)[12]             : AIC=308.445, Time=0.02 sec
 ARIMA(2,1,0)(1,1,0)[12]             : AIC=310.408, Time=0.05 sec
 ARIMA(2,1,0)(0,1,1)[12]             : AIC=310.409, Time=0.05 sec
 ARIMA(2,1,0)(1,1,1)[12]             : AIC=312.408, Time=0.09 sec
 ARIMA(3,1,0)(0,1,0)[12]             : AIC=308.046, Time=0.02 sec
 ARIMA(3,1,0)(1,1,0)[12]         

127.0.0.1 - - [26/May/2023 23:45:31] "POST /_dash-update-component HTTP/1.1" 200 -


 ARIMA(2,1,1)(1,1,1)[12]             : AIC=306.373, Time=0.39 sec
 ARIMA(1,1,1)(0,1,0)[12]             : AIC=inf, Time=0.02 sec
 ARIMA(2,1,2)(0,1,0)[12]             : AIC=inf, Time=0.04 sec
 ARIMA(1,1,2)(0,1,0)[12]             : AIC=inf, Time=0.03 sec
 ARIMA(3,1,2)(0,1,0)[12]             : AIC=inf, Time=0.04 sec
 ARIMA(2,1,1)(0,1,0)[12] intercept   : AIC=inf, Time=0.04 sec

Best model:  ARIMA(2,1,1)(0,1,0)[12]          
Total fit time: 3.912 seconds
MAE: 113.33237033421601
RMSE: 157.36991687903625
R2: 0.06937427142181052
Non-stationary: True
The time series is stationary.
The time series exhibits randomness and no significant linearity.
Time series is not linear, suggest removing linearity flag
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=415.131, Time=0.02 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=418.604, Time=0.06 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=418.315, Time=0.04 sec
 ARIMA(0,1,0)(1,1,0)[12]             : AIC=417.100, Tim

127.0.0.1 - - [26/May/2023 23:45:32] "POST /_dash-update-component HTTP/1.1" 200 -


 ARIMA(0,1,0)(1,1,1)[12]             : AIC=inf, Time=0.04 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=416.670, Time=0.01 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=416.374, Time=0.01 sec
 ARIMA(1,1,1)(0,1,0)[12]             : AIC=417.950, Time=0.01 sec
 ARIMA(0,1,0)(0,1,0)[12] intercept   : AIC=416.770, Time=0.00 sec

Best model:  ARIMA(0,1,0)(0,1,0)[12]          
Total fit time: 0.231 seconds
MAE: 894.4
RMSE: 998.3268002012167
R2: 0.43365107388590707
Non-stationary: False
The time series is stationary.
The time series exhibits randomness and no significant linearity.
Time series is not linear, suggest removing linearity flag
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=515.476, Time=0.02 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=866.339, Time=0.07 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=inf, Time=0.09 sec
 ARIMA(0,1,0)(1,1,0)[12]             : AIC=516.560, Time=0.04 sec
 ARIMA(0,1,0)(0,1,1)[12]             : AIC=531.428, Time

127.0.0.1 - - [26/May/2023 23:45:36] "POST /_dash-update-component HTTP/1.1" 200 -


 ARIMA(5,1,2)(0,1,0)[12]             : AIC=483.323, Time=0.09 sec
 ARIMA(4,1,1)(0,1,0)[12] intercept   : AIC=inf, Time=0.09 sec

Best model:  ARIMA(4,1,1)(0,1,0)[12]          
Total fit time: 2.997 seconds
MAE: 28018.254738916457
RMSE: 35134.56627356417
R2: -53.0657668731565
Non-stationary: False
