# Prophecy of ATM Withdrawals

Agus Gunawan, Holy Lovenia

## Importing dataset

In [6]:
from datetime import datetime
from pandas import read_csv
import pandas as pd
from os import listdir, mkdir
from os.path import exists, isfile, join

### Read train data

#### Functions

In [7]:
def get_files_from_dir_path(dir_path):
    """Return the names of the entries in ``dir_path`` that end with '.csv'.

    Only the bare entry names are returned (not joined with ``dir_path``),
    in whatever order ``listdir`` yields them.
    """
    return [entry for entry in listdir(dir_path) if entry.endswith('.csv')]

In [8]:
def import_all_datasets(base_src_dir_path):
    """Load every CSV file in ``base_src_dir_path`` into a dict of DataFrames.

    The dict key is the integer prefix of the file name: the text before the
    first '.', then before the first '_'. In each frame the ``date`` column is
    parsed with the '%Y-%m-%d' format, and the columns are renamed
    ``date`` -> ``ds`` and ``Withdrawals`` -> ``y``.

    Parameters
    ----------
    base_src_dir_path : str
        Directory holding the CSV files; a trailing separator is optional.

    Returns
    -------
    dict
        Maps the integer id taken from each file name to its DataFrame.

    Raises
    ------
    ValueError
        If a file-name prefix is not an integer, or a date value does not
        match '%Y-%m-%d'.
    """
    datasets = {}

    for src_file in get_files_from_dir_path(base_src_dir_path):
        atm_id = int(src_file.split('.')[0].split('_')[0])

        # join() rather than string concatenation, so the directory path works
        # with or without a trailing separator.
        frame = read_csv(join(base_src_dir_path, src_file))

        # Parse dates after loading: read_csv's `date_parser` argument is
        # deprecated since pandas 2.0, while to_datetime(format=...) works on
        # old and new versions alike.
        frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')

        datasets[atm_id] = frame.rename(columns={'date': 'ds', 'Withdrawals': 'y'})

    return datasets

### Import all split training datasets

Because each ATM produces its own withdrawal pattern, the training data was split per machine: each ATM (K1, K2, ...) has its own dataset.

In [9]:
# Load every per-ATM training CSV into a dict keyed by the integer ATM id.
train_datasets = import_all_datasets(base_src_dir_path='dataset/train/')

## Prophet model building

In [10]:
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation, performance_metrics
from fbprophet.plot import plot_cross_validation_metric, plot_yearly, plot_weekly

import calendar

### Define Payday (holiday seasonality)

Around the end of each month (payday), the `Withdrawals` value is usually higher.

In [11]:
# Payday ("gajian") dates expressed as a holiday table: one row per date, each
# with a window of two days before (lower_window) to two days after (upper_window).
gajian = pd.DataFrame(
    dict(
        holiday='gajian',
        ds=pd.to_datetime(['2018-03-30', '2018-02-28', '2018-01-31']),
        lower_window=-2,
        upper_window=2,
    )
)

# Payday is the only holiday definition used, so `holidays` is the same frame.
holidays = gajian

### Define weekly seasonality for Sunday

On Sundays, `Withdrawals` is almost half of what it is on other days

In [12]:
def take_money(ds):
    """Return the withdrawal scaling factor for the date ``ds``.

    ``ds`` is converted with ``pd.to_datetime``. Sundays (``weekday() == 6``)
    map to 0.5; every other day maps to 1.
    """
    sunday = 6
    is_sunday = pd.to_datetime(ds).weekday() == sunday
    return 0.5 if is_sunday else 1

### Adding regressor column for Sunday's `take_money` in dataset

In [13]:
# Add the Sunday `take_money` regressor column to every training frame.
# Iterating over the dict's values (instead of range(1, N + 1)) avoids a
# KeyError when the ATM ids parsed from the file names are not exactly 1..N.
for train_frame in train_datasets.values():
    train_frame['take_money'] = train_frame['ds'].apply(take_money)

### Training Prophet models for each dataset

In this step, each model is trained using its own dataset. An additional regressor for Sunday's `take_money` (weekly seasonality) is added for every model.

In [14]:
prophets = {}

# Fit one model per ATM, stored under the same id as its training frame.
# Iterating over items() (instead of range(1, N + 1)) keeps this working when
# the ATM ids are not exactly 1..N.
for atm_id, train_frame in train_datasets.items():
    # Built-in seasonalities are all switched off; payday enters through
    # `holidays` and the Sunday effect through the `take_money` regressor.
    prophet = Prophet(yearly_seasonality=False,
                      weekly_seasonality=False,
                      daily_seasonality=False,
                      holidays=holidays)

    prophet.add_regressor(name='take_money', mode='multiplicative')
    prophet.fit(train_frame)

    prophets[atm_id] = prophet

  elif np.issubdtype(np.asarray(v).dtype, float):
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 3.0.
  np.linspace(0, hist_size - 1, self.n_changepoints + 1)


### Forecasting `Withdrawals`

For the sake of demonstration, let's just predict the next seven days for the first 10 ATMs.

In [17]:
forecast_data = {}
for atm_id in range(1, 11):
    # Progress line, emitted on every tenth ATM.
    if atm_id % 10 == 0:
        print(' from '.join((str(atm_id), str(len(prophets)))))

    model = prophets[atm_id]

    # Extend the model's history by seven daily steps, then attach the
    # `take_money` regressor column for those dates.
    future_data = model.make_future_dataframe(periods=7, freq='d')
    future_data['take_money'] = future_data['ds'].apply(take_money)

    forecast_data[atm_id] = model.predict(future_data)

10 from 10626


## Performance measure

The performance measure for the first 10 ATMs is computed using cross-validation.

### Get performance metrics with cross-validation

In [19]:
pm = {}
for atm_id in range(1, 11):
    fitted_model = prophets[atm_id]
    # Cross-validate with a 7-day horizon, then reduce the result to the
    # performance-metrics table kept for this ATM.
    cv = cross_validation(fitted_model, horizon='7 days')
    pm[atm_id] = performance_metrics(cv)

INFO:fbprophet.diagnostics:Making 16 forecasts with cutoffs between 2018-01-23 12:00:00 and 2018-03-17 00:00:00
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 17.0.
  np.linspace(0, hist_size - 1, self.n_changepoints + 1)
  elif np.issubdtype(np.asarray(v).dtype, float):
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 20.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 23.0.
INFO:fbprophet.diagnostics:Making 16 forecasts with cutoffs between 2018-01-23 12:00:00 and 2018-03-17 00:00:00
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 17.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 20.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 23.0.
INFO:fbprophet.diagnostics:Making 16 forecasts with cutoffs between 2018-01-23 12:00:00 and 2018-03-17 00:00:00
INFO:fbprophet.forecast

### Show averaged MSE and MAPE from each data point

In [20]:
# Print one line per ATM with its MSE and MAPE averaged over the rows of its
# metrics table. Selecting a single column (['mse'], not [['mse']]) makes
# .mean() return a scalar instead of a one-element Series, whose printed form
# adds "dtype: float64" noise to every line.
for atm_id in sorted(pm):
    metrics = pm[atm_id]
    print('{} mse={:.6e} mape={:.6f}'.format(
        atm_id, metrics['mse'].mean(), metrics['mape'].mean()))

1 mse    7.794492e+14
dtype: float64 mape    0.184619
dtype: float64
2 mse    9.802238e+14
dtype: float64 mape    0.186738
dtype: float64
3 mse    4.466868e+14
dtype: float64 mape    0.217522
dtype: float64
4 mse    3.015438e+15
dtype: float64 mape    0.234772
dtype: float64
5 mse    5.630637e+15
dtype: float64 mape    0.23573
dtype: float64
6 mse    4.079609e+14
dtype: float64 mape    0.2262
dtype: float64
7 mse    8.503684e+15
dtype: float64 mape    0.253915
dtype: float64
8 mse    4.105107e+15
dtype: float64 mape    0.247882
dtype: float64
9 mse    1.069176e+15
dtype: float64 mape    0.410304
dtype: float64
10 mse    2.152590e+15
dtype: float64 mape    0.274852
dtype: float64


## Result

### Preparing the answers

In [None]:
# Copy each forecast frame, not just the dict: dict.copy() is shallow, so the
# in-place column assignment done on `temp_forecast_data` afterwards would
# otherwise also modify the frames held in `forecast_data`.
temp_forecast_data = {atm_id: frame.copy() for atm_id, frame in forecast_data.items()}

In [None]:
# Tag every forecast frame with its ATM label ("K" followed by the ATM number).
atm_total = len(temp_forecast_data)
for atm_no in range(1, atm_total + 1):
    frame = temp_forecast_data[atm_no]
    frame['no. ATM'] = "K" + str(atm_no)

In [None]:
# Replace each frame with a copy whose `ds` column is named `date`.
for atm_no in range(1, len(temp_forecast_data) + 1):
    renamed = temp_forecast_data[atm_no].rename(columns={'ds': 'date'})
    temp_forecast_data[atm_no] = renamed

In [None]:
answer = {}

for atm_no in range(1, len(temp_forecast_data) + 1):
    forecast = temp_forecast_data[atm_no]

    # Keep only the rows dated after 2018-03-24, reduced to the three output
    # columns, with `yhat` exposed as `prediction`.
    is_after_cutoff = forecast['date'] > '2018-03-24'
    answer[atm_no] = (
        forecast
        .loc[is_after_cutoff, ['no. ATM', 'date', 'yhat']]
        .rename(columns={'yhat': 'prediction'})
    )

    # Progress line, emitted on every tenth ATM.
    if atm_no % 10 == 0:
        print(' from '.join((str(atm_no), str(len(temp_forecast_data)))))

### Concat all of the answers into a single `DataFrame`

In [None]:
# Collect the per-ATM answer frames in ATM order and stack them into one frame.
final_answer_list = [answer[atm_no] for atm_no in range(1, len(answer) + 1)]
final_answer = pd.concat(final_answer_list)

### Save it as CSV

In [None]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (otherwise the save fails on a fresh checkout).
if not exists('result'):
    mkdir('result')

final_answer.to_csv('result/prediction.csv', index=False)