In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [2]:
# function to sort all rides by station
# for daily and weekly aggregation, we need to combine multiple days worth of CSVs beforehand 
# TODO: confirm the actual column name for stations -- it is reportedly not "Acceso_Estacion"

def sortbystations(csv, station_col="Acceso_Estacion", time_col="Fecha_Transaccion"):
    """Collect each station's transaction times into its own column.

    Parameters
    ----------
    csv : pandas.DataFrame
        Ride records; must contain ``station_col`` and ``time_col``.
    station_col : str, default "Acceso_Estacion"
        Name of the column identifying the station of each ride.
    time_col : str, default "Fecha_Transaccion"
        Name of the column holding each ride's transaction time.

    Returns
    -------
    pandas.DataFrame
        One column per station, in order of first appearance in ``csv``.
        Each column lists that station's ``time_col`` values in row order,
        padded at the end with '' so every column is as long as the busiest
        station's. Rows whose station is missing are skipped. An empty
        input yields an empty DataFrame.
    """
    # sort=False keeps stations in first-appearance order; iterating the
    # groupby visits every station, including the last one.
    times_by_station = {
        station: rides[time_col].tolist()
        for station, rides in csv.groupby(station_col, sort=False)
    }

    # Pad to the true maximum ride count, measured on the time column itself
    # rather than on whichever column happens to sit second in the frame.
    max_rides = max((len(times) for times in times_by_station.values()), default=0)
    padded = {
        station: times + [''] * (max_rides - len(times))
        for station, times in times_by_station.items()
    }

    # Building the frame in one call avoids inserting columns one at a time.
    return pd.DataFrame(padded)

# aggregation function
# options for rule input: '15min' [by 15 min time periods], 'D' [by day], 'W' [by week], etc
# options for collection_df_onecolumn input: select single column from collection_df (output of sortbystations function) to see 
#                                            trips through one station OR input the timestamp column of the whole CSV to see trips through whole system

def aggregation(collection_df_onecolumn, rule):
    """Count trips per time period.

    Parameters
    ----------
    collection_df_onecolumn : pandas.Series or list-like
        One timestamp per trip (strings or datetimes). Entries that parse to
        NaT, such as the '' padding produced by ``sortbystations``, are
        ignored.
    rule : str
        Resampling frequency handed to ``Series.resample``,
        e.g. '15min', 'D' or 'W'.

    Returns
    -------
    pandas.Series
        Series named 'Frequency' whose DatetimeIndex is named 'DateTime';
        each value is the number of trips in that period (as a float).
    """
    # Parse, then drop blanks/unparseable entries so padding is never counted.
    timestamps = pd.to_datetime(pd.Series(collection_df_onecolumn)).dropna()

    # Attach the per-trip counter directly to the parsed timestamps. Joining a
    # separately indexed counter column would misalign whenever the input does
    # not carry a default 0..n-1 index.
    trips = pd.Series(
        1.0,
        index=pd.DatetimeIndex(timestamps, name='DateTime'),
        name='Frequency',
    )
    return trips.resample(rule).sum()

In [3]:
# function that displays regular plot, autocorrelation plot, and partial autocorrelation plot for each level of differencing
# use autocorrelation plot to decide differencing (d) and MA (q)
# use partial autocorrelation plot to decide AR (p)
# note: AR factors correct for slight under-differencing, MA factors correct for slight over-differencing

def differencing_acf_pacf_plots(aggregated):
    """Show a 3x3 diagnostic grid for choosing ARIMA orders.

    Each row corresponds to one level of differencing (none, first, second).
    Within a row the columns are: the series itself, its autocorrelation
    plot, and its partial autocorrelation plot. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    fig, axes = plt.subplots(3, 3, figsize=(20,15))

    once = aggregated.diff()
    twice = once.diff()

    # (series for the line plot, series for ACF/PACF, title of the line plot).
    # The line plot keeps the leading NaNs from differencing; the correlation
    # plots get them dropped.
    levels = (
        (aggregated, aggregated, 'Original Series'),
        (once, once.dropna(), '1st Order Differencing'),
        (twice, twice.dropna(), '2nd Order Differencing'),
    )

    for axes_row, (raw, cleaned, heading) in zip(axes, levels):
        line_ax, acf_ax, pacf_ax = axes_row[0], axes_row[1], axes_row[2]
        line_ax.plot(raw)
        line_ax.set_title(heading)
        plot_acf(cleaned, ax=acf_ax)
        plot_pacf(cleaned, ax=pacf_ax)

    plt.show()

In [1]:
# function to apply ARIMA to aggregated dataset; plots predicted vs actual test data and returns both as a DataFrame

def applyARIMA(aggregated, order=(4,0,0), train_fraction=0.66):
    """Walk-forward, one-step-ahead ARIMA forecast over the tail of a series.

    The first ``train_fraction`` of the observations seeds the history. For
    every remaining observation an ARIMA model is refit on the history, a
    one-step forecast is recorded, and the actual observation is then
    appended to the history. Actual and predicted values are plotted together.

    Parameters
    ----------
    aggregated : pandas.Series
        Aggregated trip counts (its ``.values`` are used; the index is ignored).
    order : tuple, default (4, 0, 0)
        ARIMA (p, d, q); adjust based on the ACF and PACF plots.
    train_fraction : float, default 0.66
        Share of the observations used as the initial training window;
        must be strictly between 0 and 1.

    Returns
    -------
    pandas.DataFrame
        Columns 'test' (held-out actual values) and 'predictions'
        (the one-step forecasts), one row per held-out observation.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    X = aggregated.values
    size = int(len(X) * train_fraction)
    train, test = X[:size], X[size:]

    history = list(train)
    predictions = []
    for obs in test:
        # Refit on everything observed so far, forecast one step ahead,
        # then reveal the true value before the next step.
        model_fit = ARIMA(history, order=order).fit()
        predictions.append(model_fit.forecast()[0])
        history.append(obs)

    test_and_predictions = pd.DataFrame({'test': test.flatten(), 'predictions': predictions})

    plt.plot(test, label='actual')
    plt.plot(predictions, color='red', label='predicted')
    plt.legend()
    plt.show()
    return test_and_predictions

# MAPE function
# filters out time periods with 0 actual trips and replaces negative predicted trips with 0

def MAPE(test_and_predictions):
    """Mean absolute percentage error between actual and predicted trips.

    Negative predictions are treated as 0, and periods whose actual value is
    0 are excluded (the percentage error is undefined there). The input frame
    is not modified.

    Parameters
    ----------
    test_and_predictions : pandas.DataFrame
        Must contain the columns 'test' (actual values) and 'predictions'.

    Returns
    -------
    float
        MAPE in percent; NaN when every actual value is 0.
    """
    actual = test_and_predictions['test']
    # clip() works on every row regardless of the index labels and returns a
    # new Series, so the caller's DataFrame keeps its original predictions.
    predicted = test_and_predictions['predictions'].clip(lower=0)

    nonzero = actual != 0
    pct_error = ((actual[nonzero] - predicted[nonzero]) / actual[nonzero]).abs()
    return pct_error.mean() * 100

In [5]:
# function to apply SARIMA to aggregated dataset; plots predicted vs actual test data and returns both as a DataFrame

def applySARIMA(aggregated, order=(4,0,0), seasonal_order=(0,0,0,0), train_fraction=0.66):
    """Walk-forward, one-step-ahead SARIMA forecast over the tail of a series.

    The first ``train_fraction`` of the observations seeds the history. For
    every remaining observation a SARIMAX model is refit on the history, a
    one-step forecast is recorded, and the actual observation is then
    appended to the history. Actual and predicted values are plotted together.

    Parameters
    ----------
    aggregated : pandas.Series
        Aggregated trip counts (its ``.values`` are used; the index is ignored).
    order : tuple, default (4, 0, 0)
        Non-seasonal (p, d, q); adjust based on the ACF and PACF plots.
    seasonal_order : tuple, default (0, 0, 0, 0)
        Seasonal (P, D, Q, s) passed through to SARIMAX.
    train_fraction : float, default 0.66
        Share of the observations used as the initial training window;
        must be strictly between 0 and 1.

    Returns
    -------
    pandas.DataFrame
        Columns 'test' (held-out actual values) and 'predictions'
        (the one-step forecasts), one row per held-out observation.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    X = aggregated.values
    size = int(len(X) * train_fraction)
    train, test = X[:size], X[size:]

    history = list(train)
    predictions = []
    for obs in test:
        model = SARIMAX(history, order=order, seasonal_order=seasonal_order)
        # disp=False: the model is refit once per held-out point, and the
        # optimizer's convergence report would otherwise be printed each time.
        model_fit = model.fit(disp=False)
        predictions.append(model_fit.forecast()[0])
        # Reveal the true value before forecasting the next step.
        history.append(obs)

    test_and_predictions = pd.DataFrame({'test': test.flatten(), 'predictions': predictions})

    plt.plot(test, label='actual')
    plt.plot(predictions, color='red', label='predicted')
    plt.legend()
    plt.show()
    return test_and_predictions