<a href="https://colab.research.google.com/github/kessingtonosazee/GCP_Project_1/blob/master/Linear_Regression_heatmap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ARMA Models

# Libraries
import inspect
import time
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from IPython.display import VimeoVideo
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA

warnings.filterwarnings("ignore")

# Prepare Data
# Connect to the MongoDB server on localhost:27017, then resolve the
# "air-quality" database and its "nairobi" collection through the explicit
# accessor methods. All three handles stay available at module level.
client = MongoClient("localhost", 27017)
db = client.get_database("air-quality")
nairobi = db.get_collection("nairobi")

def wrangle(collection, resample_rule="1H"):
    """Return a cleaned, regularly spaced ``P2`` series for site 29.

    Parameters
    ----------
    collection
        Object with a PyMongo-style ``find(filter, projection=...)`` method
        that yields documents containing ``"P2"`` and ``"timestamp"`` keys.
    resample_rule : str, default "1H"
        Pandas offset alias passed to ``Series.resample``.
        NOTE(review): recent pandas releases spell the hourly alias ``"h"``;
        the default is kept as-is so existing callers see no change.

    Returns
    -------
    pandas.Series
        Mean ``P2`` reading per resampling bin, indexed by timestamps in the
        ``Africa/Nairobi`` timezone. Empty bins carry the previous bin's
        value forward.
    """
    results = collection.find(
        {"metadata.site": 29, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    # Read results into DataFrame
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # Stored timestamps are treated as UTC, then shifted to local time.
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers: keep only readings strictly below 500.
    df = df[df["P2"] < 500]

    # Resample and forward-fill. `.ffill()` replaces the deprecated
    # `fillna(method="ffill")` spelling.
    y = df["P2"].resample(resample_rule).mean().ffill()

    return y

# Build the series from the MongoDB collection. `y` is otherwise never
# assigned at module level (it is only a local name inside `wrangle`), so
# the slice below would fail with a NameError without this call.
y = wrangle(nairobi)

# Subset a DataFrame by selecting one or more rows in pandas.
# Positional window [717, 1485) of the series, made explicit with `.iloc`.
t = y.iloc[717:1485]

# The first 97% of the window is used for training, the remainder for testing.
cutoff_test = int(len(t) * 0.97)
y_train = t.iloc[:cutoff_test]
y_test = t.iloc[cutoff_test:]

# ARMA grid search: fit one ARIMA(p, 0, q) model per hyperparameter pair and
# record its in-sample mean absolute error.

# Candidate AR orders (`p`) and MA orders (`q`).
p_params = range(0, 25, 8)
q_params = range(0, 3, 1)

# Training MAEs keyed by `p`; each list is ordered like `q_params`.
mae_grid = {}

# Single flat pass over every (p, q) pair, `p` varying slowest.
for p, q in ((p_val, q_val) for p_val in p_params for q_val in q_params):
    # Differencing order stays 0, which makes this an ARMA model.
    order = (p, 0, q)

    # Time the fit so slow combinations are visible in the log.
    start_time = time.time()
    model = ARIMA(y_train, order=order).fit()
    elapsed_time = round(time.time() - start_time, 2)
    print(f"Trained ARIMA {order} in {elapsed_time} seconds.")

    # Score the model on its own training data.
    y_pred = model.predict()
    mae = mean_absolute_error(y_train, y_pred)

    # First visit of a given `p` creates its list; later visits extend it.
    mae_grid.setdefault(p, []).append(mae)

print()
print(mae_grid)

# Create a DataFrame from a dictionary using pandas.
# Columns are the `p` values (the dict keys). Rows are labelled with the
# actual `q` values so the heatmap's y ticks stay correct even if `q_params`
# stops being 0, 1, 2, ...
mae_df = pd.DataFrame(mae_grid, index=list(q_params))
# Rounded view only; the result is not stored, so `mae_df` keeps full precision.
mae_df.round(4)

# Create a heatmap in seaborn.
sns.heatmap(mae_df, cmap="Blues")
plt.xlabel("p values")
# Label typo fixed ("q valuea" -> "q values").
plt.ylabel("q values")
plt.title("ARMA Grid Search [Criterion:MAE]");

# Examine time series model residuals using statsmodels.
# NOTE(review): `model` here is whatever was bound last by the grid-search
# loop above, i.e. the final (p, q) combination it fitted, not necessarily
# the best-scoring one. Confirm that is the model you want to inspect.
fig, ax = plt.subplots(figsize=(15, 12))
# Draw the diagnostic panels onto the pre-sized figure created above.
model.plot_diagnostics(fig=fig);

# Walk-forward validation with the order chosen from the grid search.
# For each test step: refit on everything seen so far, forecast one step
# ahead, then reveal the true observation for that step to the history.
# `Series.append` no longer exists in pandas 2.x, so the pieces are collected
# and joined with `pd.concat` instead.
wfv_forecasts = []
history = y_train.copy()
for _ in range(len(y_test)):
    model = ARIMA(history, order=(8, 0, 1)).fit()
    next_pred = model.forecast()
    wfv_forecasts.append(next_pred)
    # Look up the actual value at the timestamp that was just forecast.
    history = pd.concat([history, y_test.loc[next_pred.index]])

# One concat at the end; fall back to an empty float series if there was
# nothing to predict (pd.concat rejects an empty list).
y_pred_wfv = pd.concat(wfv_forecasts) if wfv_forecasts else pd.Series(dtype=float)
