Reference: https://machinelearningmastery.com/make-predictions-time-series-forecasting-python/

# **Auto Regression Implementation**
*This notebook processes data provided by the [IBM Call for Code Wildfire Challenge](https://community.ibm.com/community/user/datascience/blogs/susan-malaika/2020/11/10/call-for-code-spot-challenge-for-wildfires). The raw (unprocessed) data is available [on GitHub](https://github.com/Call-for-Code/Spot-Challenge-Wildfires).*

This notebook performs the following steps:

*   load the preprocessed data and the weather forecast
*   split the data into training, validation, and testing sets
*   generate predictions on the validation and testing data
*   plot the results

In [None]:
!pip install --upgrade statsmodels

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import os
import math
import seaborn as sns

from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.api import VAR
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from google.colab import drive

# Default figure size and grid setting for every plot in this notebook.
plt.rcParams.update({
    'figure.figsize': (10, 8),
    'axes.grid': False,
})

# Mount Google Drive and make the challenge data folder the working directory,
# so the CSV files can be read by bare file name.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Wildfire_Challenge/Data')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


*   load the preprocessed data and the weather forecast

In [None]:
# Load the cleansed weather forecasts and the cleansed historical data from
# the current working directory.
forecasts_df = pd.read_csv('Cleansed_Forecasts.csv')
df = pd.read_csv('Cleansed_Data.csv')

# Parse the date columns so they can be used as a time index.
forecasts_df['Date'] = pd.to_datetime(forecasts_df['Date'])
df['Date'] = pd.to_datetime(df['Date'])

# Date-indexed view of a single region.
# This line used to read `region`, which no earlier cell assigns, so it raised
# NameError on a fresh kernel (Restart & Run All). The region is now explicit.
# NOTE(review): 'NSW' is an assumed choice (first region in the loops below) -
# confirm which region this frame was meant to show.
preview_region = 'NSW'
data = df[df['Region'] == preview_region].set_index('Date')

*   generate predictions on validation data

In [None]:
# Walk-forward validation of a per-region autoregression.
#
# For every region the series is split into
#   train      : everything except the last `n_holdout` observations
#   validation : the held-out observations, minus the final `n_test` ones
# An AR(n_lags) model is fitted once on `train`; its fixed coefficients are then
# used to predict each validation step from the preceding true observations.
# NOTE(review): this assumes the rows of each region are already in
# chronological order - confirm against the cleansing step.
regions = ['NSW', 'QL', 'TA', 'VI', 'SA', 'WA', 'NT']
n_lags = 8       # autoregression order
n_holdout = 100  # observations at the end of each series excluded from training
n_test = 15      # final observations left out of the validation window

val_frames = []
for region in regions:
  series = df.loc[df['Region'] == region, ['Date', 'Estimated_fire_area']].set_index('Date')

  # split dataset (1-D float array, so every prediction is a plain scalar)
  values = series['Estimated_fire_area'].to_numpy(dtype=float)
  size = len(values) - n_holdout
  train, test = values[:size], values[size:-n_test]
  # Dates come from the series itself rather than a hard-coded calendar range,
  # so the labels always line up with the held-out rows.
  test_dates = series.index[size:-n_test]

  # train autoregression; with the default constant trend the parameters are
  # ordered [const, lag 1, ..., lag n_lags]
  model_fit = AutoReg(train, lags=n_lags).fit()
  coef = np.asarray(model_fit.params)

  # walk forward over time steps in test
  history = list(train)
  predictions = []
  for obs in test:
    # Most recent observation first: y[t-1], y[t-2], ..., y[t-n_lags].
    lagged = history[-1:-len(coef):-1]
    yhat = coef[0] + float(np.dot(coef[1:], lagged))
    predictions.append(yhat)
    # The true value (not the prediction) feeds the next step.
    history.append(obs)

  val_frames.append(pd.DataFrame({
      'Date': test_dates,
      'Region': region,
      'PREDICTIONS': predictions,
      'ACTUAL': test,
  }))

  rmse = math.sqrt(mean_squared_error(test, predictions))
  print('Test RMSE: %.3f' % rmse)
  # plot (all regions are drawn on the same axes)
  plt.plot(test)
  plt.plot(predictions, color='red')

# Concatenate once, after the loop, instead of growing the frame per region.
result_val = pd.concat(val_frames, ignore_index=True)
result_val

*   generate predictions on test data

In [None]:
# Walk-forward evaluation of a per-region autoregression on the test window.
#
# For every region the last `n_test` observations form the test set and all
# earlier observations are used for training. An AR(n_lags) model is fitted once
# on `train`; its fixed coefficients are then used to predict each test step
# from the preceding true observations.
# NOTE(review): this assumes the rows of each region are already in
# chronological order - confirm against the cleansing step.
regions = ['NSW', 'QL', 'TA', 'VI', 'SA', 'WA', 'NT']
n_lags = 8   # autoregression order
n_test = 15  # observations at the end of each series used for testing

test_frames = []
for region in regions:
  series = df.loc[df['Region'] == region, ['Date', 'Estimated_fire_area']].set_index('Date')

  # split dataset (1-D float array, so every prediction is a plain scalar)
  values = series['Estimated_fire_area'].to_numpy(dtype=float)
  size = len(values) - n_test
  train, test = values[:size], values[size:]
  # Dates come from the series itself rather than a hard-coded calendar range,
  # so the labels always line up with the held-out rows.
  test_dates = series.index[size:]

  # train autoregression; with the default constant trend the parameters are
  # ordered [const, lag 1, ..., lag n_lags]
  model_fit = AutoReg(train, lags=n_lags).fit()
  coef = np.asarray(model_fit.params)

  # walk forward over time steps in test
  history = list(train)
  predictions = []
  for obs in test:
    # Most recent observation first: y[t-1], y[t-2], ..., y[t-n_lags].
    # (The original called an undefined `predict` helper here.)
    lagged = history[-1:-len(coef):-1]
    yhat = coef[0] + float(np.dot(coef[1:], lagged))
    predictions.append(yhat)
    # The true value (not the prediction) feeds the next step.
    history.append(obs)

  test_frames.append(pd.DataFrame({
      'Date': test_dates,
      'Region': region,
      'PREDICTIONS': predictions,
      'ACTUAL': test,
  }))

  rmse = math.sqrt(mean_squared_error(test, predictions))
  print('Test RMSE: %.3f' % rmse)
  # plot (all regions are drawn on the same axes)
  plt.plot(test)
  plt.plot(predictions, color='red')

# Concatenate once, after the loop, instead of growing the frame per region.
result_df = pd.concat(test_frames, ignore_index=True)

In [None]:
def get_prediction_results(df):
  """Print accuracy metrics and a preview of a prediction frame.

  Args:
    df: DataFrame with numeric 'ACTUAL' and 'PREDICTIONS' columns.

  Returns:
    The same DataFrame object, after its index has been reset in place to a
    fresh 0..n-1 range. The old index is discarded instead of being inserted
    as a column, so the function can be called repeatedly on the same frame.
  """
  # Explicit float arrays: the columns may arrive with object dtype.
  actual = np.asarray(df['ACTUAL'], dtype=float)
  predicted = np.asarray(df['PREDICTIONS'], dtype=float)

  print('ACCURACY RESULTS')
  # RMSE is computed as sqrt(MSE), matching the per-region RMSE printed by the
  # prediction cells; this avoids the `squared=` keyword, which newer
  # scikit-learn releases no longer accept.
  print('Root Mean Squared Error:',round(math.sqrt(mean_squared_error(actual, predicted)),2))
  print('R2 Score:',round(r2_score(actual, predicted),2))

  print(df.head(5))
  print('...')
  print(df.tail(5))
  print('_______________________________')
  # In-place on purpose: the caller's frame is left with a unique index.
  # drop=True keeps repeated runs from adding 'index'/'level_0' columns and
  # eventually raising.
  df.reset_index(drop=True, inplace=True)
  return df

val_result = get_prediction_results(result_val)
test_result = get_prediction_results(result_df)

*   plot the results


In [None]:
# Prediction error (prediction minus actual) on the test window, stored as a
# new column on result_df and drawn as one line per region.
fig, ax = plt.subplots(figsize=(16, 6))

result_df['DIFFERENCE'] = result_df['PREDICTIONS'] - result_df['ACTUAL']
g = sns.lineplot(data=result_df, x="Date", y="DIFFERENCE", hue='Region', ax=ax)