Great source for time series:
https://www.machinelearningplus.com/time-series/time-series-analysis-python/

In [None]:
# Data Source: https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data/notebooks?datasetId=29&sortBy=voteCount

# IMPORT DATASETS AND LIBRARIES

In [None]:
#Installations:

!pip install plotly
!pip install chart_studio
# !pip install prophet
!pip install statsmodels
!pip3 install --user scipy==1.2.0
!pip install statsmodels --upgrade
! pip install optuna

#After the installation- need to restart the runtime


In [None]:
#restart the runtime so the installation we did above will be refreshed
import os

def restart_runtime():
  """Kill the current kernel process so freshly installed packages are picked up.

  Sends signal 9 to this process' own PID; nothing after the call runs and
  the notebook front-end has to start a new kernel.
  """
  own_pid = os.getpid()
  os.kill(own_pid, 9)

In [None]:
# import libraries

#data explanatory and manipulating
import pandas as pd
from copy import copy
import numpy as np
# !pip install missingno
# import missingno as msno 
from datetime import date, timedelta,datetime
import itertools 


#Visualization
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
import seaborn as sns
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


#Preperation for model:
from pathlib import Path
from pylab import rcParams
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

#Statistical model
import statsmodels.api as sm
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA

#Prophet model
# from prophet import Prophet
# from prophet.plot import add_changepoints_to_plot, plot_forecast_component, plot_yearly
# from prophet.diagnostics import cross_validation
# from prophet.utilities import regressor_coefficients

#DNN model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Bidirectional,Dropout

# Random forest:
from sklearn.ensemble import RandomForestRegressor

#Hyper parameter tuning:
import optuna
from optuna.trial import TrialState
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore") #, category=FutureWarning


In [None]:
# Read the file
temperature_df = pd.read_csv('GlobalLandTemperaturesByCountry.csv')

# Global plotting defaults.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*', so use
# whichever spelling this installation actually provides.
_dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(_dark_style)
# Set the default figure size through rcParams; calling plt.figure() here only
# opened an empty figure that nothing was ever drawn on.
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 16



In [None]:
'''
Features:
  - dt: Date on which records were observed
  - AverageTemperature : Mean temperature of the country 
  - AverageTemperatureUncertainty: Uncertainty associated with the recorded temperature
  - Country: Country name
'''

In [None]:
temperature_df

In [None]:
# Add date columns derived from the 'YYYY-MM-DD' text in 'dt'.
# The text is split on '-' once and the pieces are reused for all three columns.
_dt_parts = temperature_df['dt'].str.split('-')
_year_text = _dt_parts.str[0]

# Century: first two characters of the year, plus one (e.g. '18xx' -> 19)
temperature_df['Century'] = _year_text.str[:2].astype(int) + 1

# Year: the part before the first '-'
temperature_df['Year'] = _year_text.astype(int)

# Month: the part between the first and second '-'
temperature_df['Month'] = _dt_parts.str[1].astype(int)


In [None]:
temperature_df

# EXPLORATORY DATA ANALYSIS

In [None]:
temperature_df.describe()

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(temperature_df['AverageTemperature'])

In [None]:
# Colour used for each century in the plots below (keys are the values of the
# 'Century' column). The former list assignment was overwritten immediately
# and never used, so only the mapping is kept.
palette = {
    18: 'tab:blue',
    19: 'tab:purple',
    20: 'tab:orange',
    21: 'tab:red'
}

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(data=temperature_df, x='AverageTemperature', hue='Century',palette=palette)

In [None]:
plt.figure(figsize=(10,6))
ax = sns.barplot(data=temperature_df, x='Century', y='AverageTemperature', palette=palette)
# Print every bar's height (two decimals) 10 points above the bar's top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f"{bar_top:.2f}", (bar_centre, bar_top),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
    
    

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(temperature_df['AverageTemperatureUncertainty'])

In [None]:
plt.figure(figsize=(10,6))
plt.rcParams['font.size'] = '16'
ax = sns.barplot(data=temperature_df, x='Century', y='AverageTemperatureUncertainty', palette=palette)
# Print every bar's height (two decimals) 10 points above the bar's top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f"{bar_top:.2f}", (bar_centre, bar_top),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

In [None]:
# Worldwide temperature per month, year and century.
# GroupBy.mean() does not take a column name; its first parameter is the
# numeric_only flag, so say explicitly that only numeric columns are averaged
# (the frame also carries the text columns 'dt' and 'Country').
ww_temp_group_by = (
    temperature_df
    .groupby(["Century", "Year", "Month"])
    .mean(numeric_only=True)
    .reset_index()
)
ww_temp_group_by

In [None]:
fig = px.line(ww_temp_group_by,x='Year', y='AverageTemperature', color='Month')
fig.update_layout(autosize=False,width=1000,height=500)
fig.show()

In [None]:
#Animation per month
fig = px.line(ww_temp_group_by,x='Year', y='AverageTemperature', color='Month',animation_frame='Month')
fig.show()

In [None]:
# To get the global average temperature over years.
# numeric_only=True: the frame also holds the text columns 'dt' and 'Country',
# which cannot be averaged (pandas >= 2 raises instead of silently dropping them).
df_global = temperature_df.groupby('Year').mean(numeric_only=True).reset_index()
df_global['Year'] = df_global['Year'].astype(int)

_mean_temp = df_global['AverageTemperature'].to_numpy()
_uncertainty = df_global['AverageTemperatureUncertainty'].to_numpy()

# Uncertainty upper bound
trace1 = go.Scatter(
    x = df_global['Year'],
    y = _mean_temp + _uncertainty, # Adding uncertainty
    name = 'Uncertainty top',
    line = dict(color = 'blue'))

# Uncertainty lower bound; fill='tonexty' shades the band up to the previous trace
trace2 = go.Scatter(
    x = df_global['Year'],
    y = _mean_temp - _uncertainty, # Subtracting uncertainty
    fill = 'tonexty',
    name = 'Uncertainty bottom',
    line = dict(color = 'blue'))

# Recorded temperature
trace3 = go.Scatter(
    x = df_global['Year'],
    y = df_global['AverageTemperature'],
    name = 'Average Temperature',
    line = dict(color='purple'))
data = [trace1, trace2, trace3]

layout = go.Layout(
    xaxis = dict(title = 'Year'),
    yaxis = dict(title = 'Average Temperature, °C'),
    title = 'Average Land Temperatures Globally',
    showlegend = False)

fig = go.Figure(data = data, layout = layout)
fig.update_layout(autosize=False,width=1000,height=500)
py.iplot(fig)


In [None]:
ww_temp_country_count = temperature_df.groupby("Year").count()
ww_temp_country_count.reset_index(inplace=True)

fig = px.bar(ww_temp_country_count,x='Year', y='Country')
fig.show()


Conclusion: I will remove the records up to and including 1857, and also those of 2013 (that year is not fully observed).

In [None]:
temperature_df_manipulated =temperature_df.copy()

In [None]:
# Convert the year to int type and use the data above 1857 for visualization (Before this most of the countries do not have recorded reading)
temperature_df_manipulated = temperature_df_manipulated.loc[(temperature_df_manipulated['Year'] > 1857 )&(temperature_df_manipulated['Year'] <2013)]

In [None]:
temperature_df_manipulated.reset_index(inplace=True,drop=True)
temperature_df_manipulated

In [None]:
def _annotated_century_barplot(column, ylabel):
    """Bar-plot `column` per century from temperature_df_manipulated and label each bar with its height."""
    plt.figure(figsize=(10,6))
    axis = sns.barplot(data=temperature_df_manipulated, x='Century', y=column, palette=palette)
    for bar in axis.patches:
        # height text (two decimals), centred 10 points above the bar
        axis.annotate(format(bar.get_height(), '.2f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center', va='center', xytext=(0, 10), textcoords='offset points')
    plt.xlabel('Century', fontsize=16)
    plt.ylabel(ylabel, fontsize=16)
    return axis

ax1 = _annotated_century_barplot('AverageTemperature', 'Average Temperature')
ax2 = _annotated_century_barplot('AverageTemperatureUncertainty', 'Average Temperature Uncertainty')

In [None]:
# To get the global average temperature over years (filtered period only).
# numeric_only=True: the frame also holds the text columns 'dt' and 'Country',
# which cannot be averaged (pandas >= 2 raises instead of silently dropping them).
df_global = temperature_df_manipulated.groupby('Year').mean(numeric_only=True).reset_index()
df_global['Year'] = df_global['Year'].astype(int)

_mean_temp = df_global['AverageTemperature'].to_numpy()
_uncertainty = df_global['AverageTemperatureUncertainty'].to_numpy()

# Uncertainty upper bound
trace1 = go.Scatter(
    x = df_global['Year'],
    y = _mean_temp + _uncertainty, # Adding uncertainty
    name = 'Uncertainty top',
    line = dict(color = 'blue'))

# Uncertainty lower bound; fill='tonexty' shades the band up to the previous trace
trace2 = go.Scatter(
    x = df_global['Year'],
    y = _mean_temp - _uncertainty, # Subtracting uncertainty
    fill = 'tonexty',
    name = 'Uncertainty bottom',
    line = dict(color = 'blue'))

# Recorded temperature
trace3 = go.Scatter(
    x = df_global['Year'],
    y = df_global['AverageTemperature'],
    name = 'Average Temperature',
    line = dict(color='purple'))
data = [trace1, trace2, trace3]

layout = go.Layout(
    xaxis = dict(title = 'Year'),
    yaxis = dict(title = 'Average Temperature, °C'),
    title = 'Average Land Temperatures Globally',
    showlegend = False)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig)


# DATA MANIPULATION:


1.   Remove duplications in countries
2.   Remove countries with only few datapoints
3.   data imputation



Remove duplications:

In [None]:
temperature_df_full=temperature_df_manipulated.copy()

In [None]:
temperature_df_full

In [None]:
# Check the unique countries
temperature_df_manipulated['Country'].unique()

In [None]:
#Unify countries that has the same name, for example: 'United Kingdom (Europe)' & 'United Kingdom'
duplicates=temperature_df_manipulated[temperature_df_manipulated['Country'].str.contains('(',regex=False)]#regex=False because we are searching for '(' value. otherwise it will raise an error
duplicates=duplicates['Country'] #DataFrame
duplicates=duplicates.unique() #to get a list of the duplicates countries 


In [None]:
# Map every qualified name, e.g. 'United Kingdom (Europe)', to its bare form 'United Kingdom'.
country_renames = {}
for i in duplicates:
  num=i.find(' (')
  print(i,num)
  # str.find returns -1 when the name has a '(' without a leading space; slicing
  # with -1 would silently chop the last character, so cut at the bracket instead.
  country_renames[i] = i[0:num] if num != -1 else i.split('(')[0]

# The names were collected from 'Country', so rename in that column once
# instead of scanning every column of the frame once per name.
temperature_df_manipulated['Country'] = temperature_df_manipulated['Country'].replace(country_renames)


In [None]:
#Check that there is no duplicated
duplicates=temperature_df_manipulated[temperature_df_manipulated['Country'].str.contains('(',regex=False)]#regex=False because we are searching for '(' value. otherwise it will raise an error
duplicates

In [None]:
# Do groupby country to see the count of values available for each country
country_group_df = temperature_df_manipulated.groupby(by = 'Country').count().reset_index('Country').rename(columns={'AverageTemperature':'AverageTemperatureCount','AverageTemperatureUncertainty' : 'AverageTemperatureUncertaintyCount'})
country_group_df=country_group_df.sort_values(by='AverageTemperatureCount')

In [None]:
fig = px.bar(country_group_df, x = 'Country', y = 'AverageTemperatureCount', title="number of records per country")
fig.show()

Remove countries with only a few data points (originally: fewer than 15% of the records, 6332*0.15 ≈ 950; the code below uses a stricter threshold of 1300 records)

In [None]:
# missing_records_threshold=950

In [None]:
# countries_with_less_data=country_group_df[(country_group_df['AverageTemperatureCount'] < missing_records_threshold) | (country_group_df['AverageTemperatureUncertaintyCount'] < missing_records_threshold)]
# countries_with_less_data=countries_with_less_data.Country.tolist()

# countries_with_less_data

# A country is kept only if both measurement columns have at least this many non-missing records.
MIN_RECORDS_PER_COUNTRY = 1300
_too_few_records = (
    (country_group_df['AverageTemperatureCount'] < MIN_RECORDS_PER_COUNTRY)
    | (country_group_df['AverageTemperatureUncertaintyCount'] < MIN_RECORDS_PER_COUNTRY)
)
countries_with_less_data = country_group_df[_too_few_records]['Country'].to_list()

In [None]:
countries_with_less_data

In [None]:
# Remove the countries with less data points
temperature_df_manipulated = temperature_df_manipulated[~temperature_df_manipulated['Country'].isin(countries_with_less_data)]
# Reset the index
temperature_df_manipulated.reset_index(inplace = True, drop = True)
temperature_df_manipulated

# Data Imputation:

Fill the missing values with a rolling average per country and month over the past 20 years


In [None]:
# Check for missing values
temperature_df_manipulated.isnull().sum()

In [None]:
#Option #1:Fill the missing values by doing rolling average per country and monthin the past 20 years
# temperature_df_manipulated['MA'] = temperature_df_manipulated.groupby(['Country','Month']).AverageTemperature.transform(lambda x: x.rolling(20,min_periods=1).mean())
# temperature_df_manipulated['MA2'] = temperature_df_manipulated.groupby(['Country','Month']).AverageTemperatureUncertainty.transform(lambda x: x.rolling(20,min_periods=1).mean())

# temperature_df_manipulated.AverageTemperature.fillna(temperature_df_manipulated.MA, inplace=True)
# temperature_df_manipulated.AverageTemperatureUncertainty.fillna(temperature_df_manipulated.MA2, inplace=True)

# del temperature_df_manipulated['MA']
# del temperature_df_manipulated['MA2']

# temperature_df_manipulated

#Option #2: Fill the missing values with a trailing rolling average over the previous 240 rows
# (240 monthly records = 20 years), computed separately for every country.
# A single rolling window over the whole column ignores 'Country', so a gap at the
# start of one country's rows would be filled from whatever rows precede it in the frame.
_rows_by_country_and_date = temperature_df_manipulated.sort_values(['Country', 'dt'], kind='stable')
for _col in ['AverageTemperature', 'AverageTemperatureUncertainty']:
    _rolling_mean = (
        _rows_by_country_and_date
        .groupby('Country')[_col]
        .transform(lambda s: s.rolling(240, min_periods=1).mean())
    )
    # fillna aligns on the row index, so the sort above does not reorder the frame itself.
    # Values that have no earlier observation within the same country stay missing.
    temperature_df_manipulated[_col] = temperature_df_manipulated[_col].fillna(_rolling_mean)

In [None]:
temperature_df_manipulated.isna().sum()

In [None]:
#Drop null records after imputation
# temperature_df=temperature_df[~temperature_df['AverageTemperature'].isnull()]

# DATA VISUALIZATION

In [None]:
countries = temperature_df_manipulated['Country'].unique().tolist()

In [None]:
# Get the mean temperature for each country, in the same order as `countries`.
# One grouped pass replaces re-filtering the whole frame once per country.
mean_temperature = (
    temperature_df_manipulated
    .groupby('Country')['AverageTemperature']
    .mean()
    .reindex(countries)
    .tolist()
)

In [None]:
# Plot the mean teamperature of countries
data = [ dict(type = 'choropleth', # type of map
              locations = countries, # location names
              z = mean_temperature, # temperature of countries
              locationmode = 'country names')
       ]

layout = dict(title = 'Average Global Land Temperatures',
              geo = dict(showframe = False,
                         showocean = True, # to show the ocean
                         oceancolor = '#3399FF',
                         projection = dict(type = 'orthographic'))) # to get the globe view),


fig = dict(data = data, layout = layout)
py.iplot(fig, validate = False, filename = 'worldmap')

In [None]:
temperature_df_manipulated

Indexing

In [None]:
# Global average per record date: one row per 'dt', averaged over all countries.
# numeric_only=True skips the text column 'Country', which cannot be averaged
# (pandas >= 2 raises on it). The group key 'dt' is already the index of the
# result, so no reset_index()/set_index() round trip is needed.
df_global_monthly = (
    temperature_df_manipulated
    .groupby('dt')
    .mean(numeric_only=True)
    .drop(columns=['Century', 'Year'])
)





In [None]:
# Parse the 'dt' index labels of the monthly frame into timestamps.
date_index = pd.to_datetime(df_global_monthly.index)
# Rebuild the index with an explicit frequency, taken from the spacing pandas infers between the timestamps.
date_index = pd.DatetimeIndex(date_index.values, freq = date_index.inferred_freq)
# Keep the inferred frequency string; later cells pass it to asfreq(). (The name's spelling is kept because other cells use it.)
infered_freq=date_index.inferred_freq
print("freq:",date_index.freq)
print("infered freq:",date_index.inferred_freq)
print(date_index)
infered_freq

In [None]:
# Preview the index converted to timestamps (displayed only; nothing is assigned back,
# so df_global_monthly keeps its text labels).
# astype() expects a dtype; passing the DatetimeIndex object itself is not a
# documented spelling, so do the conversion explicitly.
pd.to_datetime(df_global_monthly.index)


In [None]:
# Univariate series for decomposition / SARIMAX: only 'AverageTemperature' is kept.
df_for_arima = df_global_monthly.drop(['AverageTemperatureUncertainty','Month'],axis=1)
# The index still holds the 'dt' text labels. asfreq(), used by the cells below,
# reindexes onto a range of timestamps, so text labels would match nothing and
# every value would come back as NaN. Convert the labels to real timestamps first.
df_for_arima.index = pd.to_datetime(df_for_arima.index)
df_for_arima

In [None]:
rcParams['figure.figsize'] = 18, 8
decomposition = sm.tsa.seasonal_decompose(df_for_arima.asfreq(infered_freq) ,model='additive') #infered_freq in this case- 'MS' (Month start)
fig = decomposition.plot()
plt.show()


In [None]:
print(decomposition.seasonal)

#Define the DataSet

In [None]:
df_global_monthly

# Stationarity check - preparation before the time-series models:
Stationarity will be checked with the ADF (Augmented Dickey-Fuller) test.

In [None]:
# statistic test (ADF) to check to stationary hypothesis
'''
if The p-value is greater than significance level of 0.05 or 0.01 (what we choose) 
and the ADF statistic is higher than any of the critical values, WE ARE NOT rejecting
the null hypothesis (HO) which means, the time series is in fact non-stationary.

to summarize: 
if ADF statistic > critical value --> we DO NOT reject HO -->The data is not stationary
if ADF statistic < critical value --> we DO reject HO --> The data is stationary

If reject HO=False- The data is not stationary
If reject HO=True- The data is stationary
'''

from statsmodels.tsa.stattools import adfuller
import math

def ADF_Cal(x):
    """Run the Augmented Dickey-Fuller test on `x` and print a short report.

    The ADF statistic is compared with the 1%, 5% and 10% critical values in
    that order; the first level whose critical value lies above the statistic
    is reported as the confidence that the series is stationary.

    Parameters
    ----------
    x : array-like
        The series to test; passed unchanged to `adfuller`.

    Returns
    -------
    bool
        True when H0 (non-stationarity) is rejected, i.e. the p-value is below
        the level at which the comparison stopped (10% if no level passed).
    """
    result = adfuller(x)
    ADF_stat = result[0]
    p = result[1]
    if math.isnan(p):
        # No usable test result: report it as "cannot reject H0". Forcing p to 0
        # made the function return True (stationary) while the printed report
        # said the opposite.
        ADF_stat = 0
        p = 1.0
    print("ADF Statistic: %f" % ADF_stat)
    print("p-value: %f" % p)
    print("Critical Values")
    levels = [.01, .05, .1]
    hyp = False  # defined even if no critical values are returned
    # result[4] maps '1%', '5%', '10%' to their critical values, in the same order as `levels`
    for level, (key, value) in zip(levels, result[4].items()):
        print('\t%s: %.3f' % (key, value))
        hyp = p < level
        if ADF_stat < value:
            cert = (1 - level) * 100
            print("{}% certain this is staionary".format(cert))
            print('Reject H0: {}'.format(hyp))
            break
    else:
        # the statistic is above every critical value
        print("Less than 90% certain that data is stationary")
        print('Reject H0: {}'.format(hyp))
    return hyp

In [None]:
#Need to check only one numerical column
#AverageTemperature column
to_check1=df_global_monthly['AverageTemperature']
#AverageTemperatureUncertainty column
# to_check2=df_global_monthly['AverageTemperatureUncertainty']

print('ADF test for AverageTemperature column: ')
hyp=ADF_Cal(to_check1)
# print('\n\nADF test for AverageTemperatureUncertainty column: ')
# hyp=ADF_Cal(to_check2)

Conclusion: the data is not stationary.
The data needs to be transformed (this can be done with a difference transform).

In [None]:
# Yearly mean of the global monthly temperature.
year_df=df_global_monthly.copy()
year_df['dt']=year_df.index
# the year is the text before the first '-' of the 'YYYY-MM-DD' label
year_df['Year']=year_df['dt'].apply(lambda x: x.split('-')[0])
# Select the column before averaging: the frame also holds the text column 'dt',
# which cannot be averaged (pandas >= 2 raises instead of silently dropping it).
year_df_grouped=year_df.groupby('Year')['AverageTemperature'].mean()
year_df_grouped


In [None]:
plt.plot(year_df_grouped)


# PREPARE THE DATA TO TRAIN THE MODEL




#Train- Test split

In [None]:
train_split=0.9
val_size=0.2

# Chronological split: the first `train_split` share of the rows is the training
# set, the remaining rows are the test set (no shuffling).
_cut_global = int(train_split * len(df_global_monthly))
train = df_global_monthly.iloc[:_cut_global]
test = df_global_monthly.iloc[_cut_global:]

#for Arima model:
_cut_arima = int(train_split * len(df_for_arima))
train_arima = df_for_arima.iloc[:_cut_arima]
test_arima = df_for_arima.iloc[_cut_arima:]


In [None]:
last_train_value=train.iloc[-1,0]
last_train_value

In [None]:
train

In [None]:
a=df_global_monthly.copy()
a['rolling_avg']=a['AverageTemperature'].rolling( 12).mean()

# Plot train and test data.
# Every line carries its own label and the legend is built once at the end:
# a positional list of three labels for two lines labelled the test line
# 'validation set', and the legend was created before the last line was drawn.
fig, ax = plt.subplots(figsize = (18, 12))
ax.plot(train.AverageTemperature, color='purple', label='Train set')
ax.plot(test.AverageTemperature, color='blue', label='Test set')

# plot using rolling average (12-row window)
sns.lineplot( x = a.index,
             y = 'rolling_avg',
             data = a,
             label = 'Rollingavg',
             color='black',
             linewidth=2.5,
             ax = ax)

ax.set_ylabel('Average Temperature')
ax.legend(loc='lower right', fontsize=22)

print('Dimension of train data: ',train.shape)
print('Dimension of test data: ', test.shape)

#Create 'Lagged' Data Frame

In [None]:
# Function that creates the data for training the time series model
def prepare_data(df, num_of_lag_differences, column_name='AverageTemperature'): #,seasonal_cycle
    """Return a copy of `df` with lagged copies of one column and a 'Target' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `column_name`; it is not modified.
    num_of_lag_differences : int
        Number of lags. Columns '<column_name>_t-<k>' are added for
        k = num_of_lag_differences ... 1, each holding the column shifted down
        by k rows (so the first k rows of that column are NaN).
    column_name : str, default 'AverageTemperature'
        Column to lag and to copy into 'Target'.

    Returns
    -------
    pandas.DataFrame
        The copy with the lag columns and 'Target' (the unshifted column) appended.
    """
    # Work on a copy so the caller's frame (often a slice of a bigger frame) is left untouched.
    lagged = df.copy()
    # Largest lag first, so the lag columns read oldest -> newest from left to right.
    for i in range(num_of_lag_differences, 0, -1):
        lagged[column_name + '_t-' + str(i)] = lagged[column_name].shift(i)
    # The target is the current (unshifted) value of the column.
    lagged['Target'] = lagged[column_name]
    return lagged

num_of_lag_differences=12
# Get the training data
train = prepare_data(train, num_of_lag_differences)
train = train.dropna()
train=train.drop(['AverageTemperature'],axis=1)


In [None]:
train


#Scaling

In [None]:
# Scale the data
scaler = MinMaxScaler(feature_range = (0, 1))
train_sc  = scaler.fit_transform(train)
# # val_sc  = scaler.transform(val)
# test   = scaler.transform(test)


In [None]:
# # Split the data into input features and targets
# #All features:
# train_x, train_y = train_sc[:,:-1], train_sc[:,-1] #train data is already scaled- this it's Numpy array and not dataFrame
# # val_x, val_y = val_sc[:,:-1], val_sc[:,-1]  # VAL data is already scaled- this it's Numpy array and not dataFrame
# test_x, test_y = test.iloc[:,1:], test.iloc[:,0] # I'm using iloc because the test data is still un-scaled dataFrame. the Y value in this DF is at location 0


Blocking Time Series Split

In [None]:
#Cross validation method in time series- Blocking Time Series Split
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts the samples into consecutive, non-overlapping blocks.

    The samples are divided into `n_splits` blocks of equal length
    (len(X) // n_splits; leftover samples at the end are unused). Inside each
    block the first `split_ratio` share is the training part and the rest,
    after skipping `margin` samples, is the validation part.

    Parameters
    ----------
    n_splits : int
        Number of blocks / folds.
    split_ratio : float, default 0.8
        Share of every block used for training.
    margin : int, default 0
        Number of samples dropped between the training and validation part.
    """
    def __init__(self, n_splits, split_ratio=0.8, margin=0):
        self.n_splits = n_splits
        self.split_ratio = split_ratio
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but not used."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield (fold_number, train_indices, validation_indices) for every block.

        Only len(X) is used; `y` and `groups` are ignored. Note that three
        values are yielded per fold, not the usual (train, test) pair.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # first index of the validation part inside this block
            split = int(self.split_ratio * (stop - start)) + start
            yield i, indices[start: split], indices[split + self.margin: stop]
        

In [None]:
def plot_cv_indices(cv, X, ax, n_splits, lw=10, y=None, cmap_cv=None, cmap_data=None):
    """Draw one row per cross-validation fold showing which samples are train / validation.

    Parameters
    ----------
    cv : splitter whose split(X=..., groups=...) yields (fold_number, train_idx, val_idx)
    X : sized
        The samples; only len(X) is used here.
    ax : matplotlib Axes to draw on.
    n_splits : int
        Number of folds, used for the y tick labels.
    lw : line width of the markers.
    y : optional array-like
        When given, an extra 'class' row coloured by `y` is drawn below the folds.
    cmap_cv, cmap_data : optional colormaps for the fold rows / the class row.
        `cmap_cv` defaults to plt.cm.coolwarm.

    Returns
    -------
    The same `ax`.
    """
    if cmap_cv is None:
        cmap_cv = plt.cm.coolwarm

    n_rows = 0
    # Generate the training/testing visualizations for each CV split
    for ii, (subset, tr, tt) in enumerate(cv.split(X=X, groups=None)):
        # 0 = training sample, 1 = validation sample, NaN = not part of this fold
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        ax.scatter(range(len(indices)), [ii] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)
        n_rows = ii + 1

    yticklabels = list(range(n_splits))
    # The class row is optional: there is no `y` in the regression setting of this notebook.
    if y is not None:
        ax.scatter(range(len(X)), [n_rows] * len(X),
                   c=y, marker='_', lw=lw, cmap=cmap_data)
        yticklabels.append('class')

    # Formatting: one tick per label, and an x range that covers every sample
    # instead of a fixed 0..100 window.
    ax.set(yticks=np.arange(len(yticklabels)), yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[len(yticklabels) + .2, -.1], xlim=[0, len(X)])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
# Set up the figure for the blocking time-series split illustration.
from matplotlib.patches import Patch
# Colormap for the fold rows; plot_cv_indices reads cmap_cv and cmap_data as globals.
cmap_cv = plt.cm.coolwarm
cmap_data=0

n_splits=5
tscv = BlockingTimeSeriesSplit(n_splits=n_splits)
fig, ax = plt.subplots(figsize=(10, 5))  
# Legend built from two colour patches of the fold colormap.
ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
         ['validation set', 'Training set'], loc=(1.02, .8))
plt.tight_layout()
plt.xlabel('Sample index')
plt.title('Blocking Time Series Split')
# Leave room on the right for the legend placed outside the axes.
fig.subplots_adjust(right=.7)    
# NOTE(review): the call that draws the folds is commented out, so only the legend, labels and title are shown.
# plot_cv_indices(tscv, train, ax, n_splits) 
plt.show()

# BUILD AND TRAIN RNN AND LSTM MODELS FOR PREDICTING GLOBAL TEMPERATURE TREND

**Reshape to a 3D input dataset:**
LSTM, GRU and BiLSTM layers take a 3D input of shape (num_samples, num_timesteps, num_features). I created a helper function, `prepare_data`, and another function, `reshape_data_sets`, to reshape the input.
In this project, I use 12 lags in the **`prepare_data`** function. This means the dataset gets 12 new features holding the average temperature of the previous 12 months.

Because of this pre-processing stage, I do not need to give the tensor any number of time steps other than 1. Therefore, in the **`reshape_data_sets`** function I set the time-step dimension to 1, because the prepared dataset already represents the previous 12 values (as features).

If I had skipped the **`prepare_data`** function, I would have no representation of the last 12 months, so I would have to reshape the data into a tensor of shape:
(number of samples / 12, 12, number of features (1 for a univariate model))



In [None]:
def reshape_data_sets(*args): #must be np.array
  """Reshape 2-D arrays (samples, features) to 3-D (samples, 1, features).

  Every positional argument is reshaped. With one argument the reshaped array
  itself is returned; with several, a tuple of reshaped arrays in the same
  order (with none, an empty tuple). The early `return` inside the loop used
  to hand back only the first array, so unpacking two results failed.
  """
  reshaped = tuple(arr.reshape((arr.shape[0], 1, arr.shape[1])) for arr in args)
  if len(reshaped) == 1:
    return reshaped[0]
  return reshaped

In [None]:
# #reshape: (sample size,time_stamp (see explanation above), features)
# train_x_rs,val_x_rs=reshape_data_sets(train_x,val_x)

In [None]:
# train_x_rs=train_x.reshape((train_x.shape[0], 1, train_x.shape[1]))
# # val_x_rs=val_x.reshape((val_x.shape[0], 1, val_x.shape[1]))

In [None]:
# train_x_rs.shape

In [None]:
def customLoss(y_true, y_pred):
  """Mean absolute percentage error (in percent) used as the training loss.

  Each absolute error is divided by the absolute target; entries whose target
  is 0 contribute 0 (divide_no_nan). Taking the absolute value of the
  denominator keeps every term non-negative, so a negative target can no
  longer pull the loss below zero.
  """
  loss=tf.math.divide_no_nan(tf.abs(y_true-y_pred),tf.abs(y_true))
  return tf.reduce_mean(loss)*100

Create models, train and fit

### ARIMA

In [None]:
train_arima

In [None]:
arima_model = sm.tsa.statespace.SARIMAX(train_arima.asfreq(infered_freq),
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 0, 12),
                                enforce_stationarity=False,
                                enforce_invertibility=False)

arima_results = arima_model.fit()

print(arima_results.summary().tables[1])

In [None]:
arima_results.plot_diagnostics(figsize=(15, 12))
plt.show()

#interpatations: https://www.digitalocean.com/community/tutorials/a-guide-to-time-series-forecasting-with-arima-in-python-3

In [None]:
test_arima

In [None]:
start_test_data=test_arima.index[0]
start_test_data

In [None]:
# Forecast the whole held-out period.
# The model was fitted on the training rows only and `start` is the first test
# date, so `end` is passed explicitly to make the forecast run to the last test date.
pred = arima_results.get_prediction(start=pd.to_datetime(start_test_data),
                                    end=pd.to_datetime(test_arima.index[-1]),
                                    dynamic=False)
pred_ci = pred.conf_int()

In [None]:
df_for_arima_new=df_for_arima.merge(pred_ci,left_index=True, right_index=True)

In [None]:
df_for_arima_new

In [None]:
# Make sure the index holds timestamps so it lines up with the prediction frame.
# astype() expects a dtype, not an index object, so convert explicitly.
df_for_arima.index = pd.to_datetime(df_for_arima.index)

In [None]:
df_for_arima_new=pd.concat([df_for_arima, pred_ci], axis=1)

In [None]:
# Point forecast = midpoint of the lower and upper confidence bounds.
# The bounds are picked by the column names of pred_ci rather than by position,
# so the result does not depend on how the frame's columns happen to be ordered.
df_for_arima_new['Average prediction'] = df_for_arima_new[list(pred_ci.columns)].mean(axis=1)
df_for_arima_new['dt'] = df_for_arima_new.index
# Calculate the error
df_for_arima_new['Error'] = (df_for_arima_new['AverageTemperature'] - df_for_arima_new['Average prediction']).abs()
# Absolute percentage error; the absolute denominator keeps it non-negative for negative temperatures.
df_for_arima_new['APE'] = df_for_arima_new['Error'] / df_for_arima_new['AverageTemperature'].abs() * 100

# Rows without a prediction have a NaN APE and are skipped by mean().
MAPE_arima = df_for_arima_new['APE'].mean()

  # pred['dt'] = df.index[-test_x.shape[0]:]
  # # Calculate the error 
  # pred['Error'] = np.abs(pred['Original'] - pred['Predicted'])
  # pred['APE']=pred['Error']/pred['Original']*100
        
  # # Create dataframe for visualization
  # df_new=df.copy()
  # df_new['dt']=df_new.index #Bring back the 'dt' to the dataframe
  # df_new = df_new[['dt','Target']][:-test_x.shape[0]]
  # df_new.columns = ['dt','Original']
  # original = df_new.append(pred[['dt','Original']])
  # df_new.columns = ['dt','Predicted']
  # predicted = df_new.append(pred[['dt','Predicted']])
  # original = original.merge(predicted, left_on = 'dt',right_on = 'dt')

  # MAPE=np.mean(pred['APE'])

In [None]:
accuracy_arima=100-MAPE_arima
accuracy_arima

In [None]:
df_for_arima_new

#RNN'S

In [None]:
dropout=0.3

In [None]:
def create_shallow_RNN_model(train_x_rs,val_x_rs,hidden_units,optimizer,epochs,batchsize): #dropout,,activation1,activation2
    """Build, compile and fit a single-layer SimpleRNN regressor.

    Architecture: Input -> SimpleRNN(hidden_units) -> Dense(1, linear).
    The model is compiled with `customLoss` and fitted silently (verbose=0).

    The targets are NOT parameters: `train_y` and `val_y` are read from the
    enclosing (notebook) namespace and must be set before calling.

    Parameters
    ----------
    train_x_rs, val_x_rs : 3-D arrays; axes 1 and 2 of `train_x_rs` define the input shape.
    hidden_units : units of the recurrent layer.
    optimizer, epochs, batchsize : passed on to compile() / fit().

    Returns
    -------
    (history, model) : the object returned by fit() and the fitted model.
    """
    n_timesteps, n_features = train_x_rs.shape[1], train_x_rs.shape[2]
    net_input = keras.layers.Input(shape = (n_timesteps, n_features))
    hidden = keras.layers.SimpleRNN(hidden_units)(net_input)
    prediction = keras.layers.Dense(1, activation = 'linear')(hidden)

    model_shallow_rnn = keras.Model(inputs = net_input, outputs = prediction)
    model_shallow_rnn.compile(optimizer=optimizer,
                              loss=customLoss,
                              metrics=['mean_absolute_percentage_error'])

    fit_options = dict(epochs=epochs,
                       verbose=0,
                       validation_data=(val_x_rs, val_y),
                       batch_size=batchsize)
    history = model_shallow_rnn.fit(train_x_rs, train_y, **fit_options)

    return history,model_shallow_rnn

In [None]:
def create_RNN_model(train_x_rs,val_x_rs,hidden_units,optimizer,epochs,batchsize): #dropout,,activation1,activation2
    """Build, compile and fit a three-layer SimpleRNN regressor.

    Architecture: Input -> [SimpleRNN(return_sequences) -> Dropout(0.3)] x 2
    -> SimpleRNN -> Dense(1, linear). The model is compiled with `customLoss`
    and fitted silently (verbose=0).

    The targets are NOT parameters: `train_y` and `val_y` are read from the
    enclosing (notebook) namespace and must be set before calling.

    Parameters
    ----------
    train_x_rs, val_x_rs : 3-D arrays; axes 1 and 2 of `train_x_rs` define the input shape.
    hidden_units : units of every recurrent layer.
    optimizer, epochs, batchsize : passed on to compile() / fit().

    Returns
    -------
    (history, model) : the object returned by fit() and the fitted model.
    """
    n_timesteps, n_features = train_x_rs.shape[1], train_x_rs.shape[2]
    net_input = keras.layers.Input(shape = (n_timesteps, n_features))

    hidden = net_input
    # two sequence-returning recurrent layers, each followed by dropout
    for _ in range(2):
        hidden = keras.layers.SimpleRNN(hidden_units,return_sequences = True)(hidden)
        hidden = keras.layers.Dropout(0.3)(hidden)
    # the last recurrent layer returns only its final state
    hidden = keras.layers.SimpleRNN(hidden_units)(hidden)
    prediction = keras.layers.Dense(1, activation = 'linear')(hidden)

    model_rnn = keras.Model(inputs = net_input, outputs = prediction)
    model_rnn.compile(optimizer=optimizer,
                      loss=customLoss,
                      metrics=['mean_absolute_percentage_error'])

    fit_options = dict(epochs=epochs,
                       verbose=0,
                       validation_data=(val_x_rs, val_y),
                       batch_size=batchsize)
    history = model_rnn.fit(train_x_rs, train_y, **fit_options)

    return history,model_rnn

In [None]:
def create_LSTM_model(train_x_rs,val_x_rs,hidden_units,optimizer,epochs,batchsize): #dropout,,activation1,activation2
    """Build, compile and fit a three-layer LSTM regressor.

    Architecture: Input -> [LSTM(return_sequences) -> Dropout(0.3)] x 2
    -> LSTM -> Dense(1, linear). The model is compiled with `customLoss`
    and fitted silently (verbose=0).

    The targets are NOT parameters: `train_y` and `val_y` are read from the
    enclosing (notebook) namespace and must be set before calling.

    Parameters
    ----------
    train_x_rs, val_x_rs : 3-D arrays; axes 1 and 2 of `train_x_rs` define the input shape.
    hidden_units : units of every LSTM layer.
    optimizer, epochs, batchsize : passed on to compile() / fit().

    Returns
    -------
    (history, model) : the object returned by fit() and the fitted model.
    """
    n_timesteps, n_features = train_x_rs.shape[1], train_x_rs.shape[2]
    net_input = keras.layers.Input(shape = (n_timesteps, n_features))

    hidden = net_input
    # two sequence-returning LSTM layers, each followed by dropout
    for _ in range(2):
        hidden = keras.layers.LSTM(hidden_units,return_sequences = True)(hidden)
        hidden = keras.layers.Dropout(0.3)(hidden)
    # the last LSTM layer returns only its final state
    hidden = keras.layers.LSTM(hidden_units)(hidden)
    prediction = keras.layers.Dense(1, activation = 'linear')(hidden)

    model_lstm = keras.Model(inputs = net_input, outputs = prediction)
    model_lstm.compile(optimizer=optimizer,
                       loss=customLoss,
                       metrics=['mean_absolute_percentage_error'])

    fit_options = dict(epochs=epochs,
                       verbose=0,
                       validation_data=(val_x_rs, val_y),
                       batch_size=batchsize)
    history = model_lstm.fit(train_x_rs, train_y, **fit_options)

    return history,model_lstm

Choose hyperparameters for the RNN and LSTM at random

In [None]:
hidden_units=5
optimizer='adam'
epochs=100
batchsize=10

tscv = BlockingTimeSeriesSplit(n_splits)
scores_shallow_rnn=[]
scores_rnn = []
scores_lstm = []

# One pass over the folds: build the fold's data AND train on it in the same iteration.
# With two separate loops the training loop only ever saw the arrays left over
# from the last fold, so every "fold" scored the same data.
for sub_set,train_index, val_index in tscv.split(train_sc):
  print ("Fold #",sub_set+1)

  # Subset the scaled train set into train and validation rows of this fold.
  # The last column of train_sc is the target, all other columns are features.
  # train_y / val_y are read as globals by the model-building functions.
  train_x = train_sc[train_index,:-1]
  val_x = train_sc[val_index,:-1]
  train_y = train_sc[train_index,-1]
  val_y = train_sc[val_index,-1]
  print (train_x.shape[0],1,train_x.shape[1])
  print (val_x.shape[0],1,val_x.shape[1])

  # reshape: (sample size, time steps = 1, features)
  train_x_rs=train_x.reshape(train_x.shape[0],1,train_x.shape[1])
  val_x_rs=val_x.reshape(val_x.shape[0],1,val_x.shape[1])

  #RNN: score = 100 - validation loss of the last epoch
  history_rnn,model_RNN = create_RNN_model(train_x_rs,val_x_rs,hidden_units=hidden_units,optimizer=optimizer,epochs=epochs,batchsize=batchsize)
  val_rnn_score=100-history_rnn.history['val_loss'][-1]
  scores_rnn.append(val_rnn_score)

  #LSTM: score = 100 - validation loss of the last epoch
  history_lstm,model_LSTM = create_LSTM_model(train_x_rs,val_x_rs,hidden_units=hidden_units,optimizer=optimizer,epochs=epochs,batchsize=batchsize)
  val_lstm_score=100-history_lstm.history['val_loss'][-1]
  scores_lstm.append(val_lstm_score)

print('Scores from each Iteration- Deep RNN: {}\n Scores from each Iteration- Deep LSTM: {}\n\n'.format(scores_rnn,scores_lstm))
print('Average CV Score- Deep RNN :{}\n Average CV Score- Depp LSTM :{}\n' .format(np.mean(scores_rnn),np.mean(scores_lstm) ))



In [None]:
def plot_history(history):
    """Plot the training and validation loss curves stored on `history`.

    `history` must expose a `.history` mapping with 'loss' and 'val_loss'
    entries; each entry is drawn as one line against its position (the epoch).
    The figure is shown immediately and nothing is returned.
    """
    curves = (('loss', 'train'), ('val_loss', 'val'))
    for key, legend_name in curves:
        plt.plot(history.history[key], label=legend_name)

    plt.xlabel('Epochs', fontsize=16)
    plt.ylabel('Loss', fontsize=16)
    plt.legend(fontsize=20)
    plt.show()

In [None]:
# Training vs. validation loss of the last trained RNN and LSTM, side by side.
fig, (ax1, ax2) = plt.subplots(1,2, sharey=True,figsize=(20,10))

# Legend and axis labels are set through each Axes object. The pyplot helpers
# (plt.legend / plt.xlabel / plt.ylabel) only act on the *current* axes, which after
# plt.subplots is the right-hand panel, so the left panel was left without them.
ax1.set_title("Deep RNN model",fontsize=20)
ax1.plot(history_rnn.history['loss'], label='train')
ax1.plot(history_rnn.history['val_loss'], label='val')
ax1.legend(fontsize=20)
ax1.set_xlabel('Epochs', fontsize=16)
ax1.set_ylabel('Loss', fontsize=16)

ax2.set_title("Deep LSTM model",fontsize=20)
ax2.plot(history_lstm.history['loss'], label='train')
ax2.plot(history_lstm.history['val_loss'], label='val')
ax2.legend(fontsize=20)
ax2.set_xlabel('Epochs', fontsize=16)
ax2.set_ylabel('Loss', fontsize=16)

plt.show()


In [None]:
# Validation loss of both networks on one set of axes.
# plt.subplots returns a (figure, axes) pair, so it is unpacked instead of being
# bound as a tuple to a name that reads like a Figure.
fig, ax = plt.subplots(figsize=(20,10))
ax.plot(history_rnn.history['val_loss'], label='Deep RNN')
ax.plot(history_lstm.history['val_loss'], label='Deep LSTM')
ax.set_title("val Loss through epochs",fontsize=20)
ax.set_xlabel('Epochs', fontsize=16)
ax.set_ylabel('Loss', fontsize=16)
ax.legend(fontsize=20)
plt.show()



In [None]:
def prediction_persistence(test, last_value=None):
  """Naive persistence baseline: predict one constant value for every test row.

  Parameters
  ----------
  test : pandas.DataFrame
      Test frame; its FIRST column holds the observed values and its index
      supplies the 'dt' column of the result.
  last_value : scalar, optional
      The constant to predict. When omitted, the notebook-level
      `last_train_value` is used (the original behaviour).

  Returns
  -------
  (pred, MAPE)
      `pred` holds the observed column plus 'Predicted', 'dt', 'Error'
      (absolute error) and 'APE' (absolute percentage error); `MAPE` is the
      mean of 'APE'. `test` itself is not modified.
  """
  if last_value is None:
    last_value = last_train_value

  # Keep the first column as a one-column frame; the copy keeps `test` untouched.
  pred = test.iloc[:, [0]].copy()
  actual = pred.columns[0]

  pred['Predicted'] = last_value
  pred['dt'] = test.index
  pred['Error'] = np.abs(pred[actual] - pred['Predicted'])
  # Divide by the magnitude of the observation so that a negative observation
  # cannot produce a negative "absolute" percentage error.
  pred['APE'] = pred['Error'] / np.abs(pred[actual]) * 100

  MAPE = np.mean(pred['APE'])
  return pred, MAPE



Validation Score:

In [None]:
# Score the naive persistence baseline on `test`; MAPE_base_line is reused by the score tables further down.
pred_base_line, MAPE_base_line=prediction_persistence(test)


In [None]:
# Collapse the per-fold scores into a single validation score per model.
val_rnn_score, val_lstm_score = (
    np.mean(fold_scores) for fold_scores in (scores_rnn, scores_lstm)
)



In [None]:
# Validation score of each network next to the naive baseline, rounded to one decimal.
models=['Deep RNN','Deep LSTM','Naive-Baseline']
scores=[val_rnn_score,val_lstm_score,(100-MAPE_base_line)]

validation_score=pd.DataFrame({
    'Models': list(models),
    # formatted to one decimal as text first, then converted back to float below
    'validation_score': [f"{num:.1f}" for num in scores],
}).astype({'validation_score': float})

validation_score.info()


In [None]:

# Set the global font size BEFORE the figure exists. Axis-label and title text objects
# take their size from rcParams when the axes are created, so changing it after the bars
# are drawn only affects the annotations on a fresh kernel (and everything on a re-run).
matplotlib.rcParams.update({'font.size': 16})

fig, ax = plt.subplots(figsize = (10, 5))
#  Bar plot
ax.bar(validation_score['Models'], validation_score['validation_score'], color ='maroon',
       width = 0.5)

# Write each score just above its bar.
for x,y in zip(validation_score['Models'], validation_score['validation_score']):
    label = "{:.2f}".format(y)
    ax.annotate(label, # this is the text
                (x,y), # these are the coordinates to position the label
                textcoords="offset points", # how to position the text
                xytext=(0,10), # distance from text to points (x,y)
                ha='center') # horizontal alignment can be left, right or center

ax.set_xlabel("Model")
ax.set_ylabel("Model Score")
ax.set_title("Model Score: 100-(Mean Absolute Percentage Error)\n")
plt.show()

# OPTUNA - Hyperparameter tuning

In [None]:
def objective(trial):
  """Optuna objective: mean cross-validation score of an LSTM for one trial.

  Samples hidden units, optimizer, epochs and batch size from the trial, trains
  one LSTM per fold yielded by `tscv.split(train_sc)` and returns the mean of
  `100 - final-epoch val_loss` over the folds (higher is better). The weights
  of the last trained model are stored on the trial under
  "best_model_weights".

  Raises
  ------
  optuna.exceptions.TrialPruned
      When the pruner decides, from the running mean reported after each fold,
      that the trial should stop early.
  """
  hidden_units=trial.suggest_int("hidden_units", 10, 100,step=10, log=False)
  optimizer_name = trial.suggest_categorical("optimizer", ["adam", "SGD", "RMSprop"])
  epochs = trial.suggest_int("epochs", 5, 40,step=5, log=False)
  batchsize = trial.suggest_int("batchsize", 1, 51,step=5, log=False)

  scores_lstm = []
  # NOTE(review): train_x_rs / val_x_rs are notebook-level arrays that this loop does not
  # rebuild from train_index / val_index, so every iteration trains on the same data.
  # Fixing that here needs to know how create_LSTM_model obtains its targets - confirm.
  for fold_number, (sub_set,train_index, val_index) in enumerate(tscv.split(train_sc)):
    #LSTM
    # Pass the optimizer sampled for THIS trial; passing the notebook-level `optimizer`
    # would leave the "optimizer" hyperparameter without any effect on the score.
    history_lstm,model_LSTM = create_LSTM_model(train_x_rs,val_x_rs,hidden_units=hidden_units,optimizer=optimizer_name,epochs=epochs,batchsize=batchsize)
    val_lstm_per_fold=100-history_lstm.history['val_loss'][-1]
    scores_lstm.append(val_lstm_per_fold)

    # should_prune() only has something to judge once an intermediate value has been
    # reported, so report the running mean after each fold and check right away.
    trial.report(float(np.mean(scores_lstm)), step=fold_number)
    if trial.should_prune():
      raise optuna.exceptions.TrialPruned()

  val_mape=np.mean(scores_lstm)
  weights = model_LSTM.get_weights()

  trial.set_user_attr(key="best_model_weights", value=weights)
  return val_mape

def callback(study, trial):
    """Keep the weights of the best trial on the study itself.

    Called with the study and the trial that just finished. When that trial is
    the study's current best one, its "best_model_weights" user attribute is
    copied onto the study under the same key; otherwise nothing happens.
    """
    finished_trial_is_best = study.best_trial.number == trial.number
    if not finished_trial_is_best:
        return

    best_weights = trial.user_attrs["best_model_weights"]
    study.set_user_attr(key="best_model_weights", value=best_weights)

# Maximise the objective's return value over 50 trials; `callback` runs after every
# trial and copies the best trial's weights onto the study.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=None, callbacks=[callback])
print(study.best_trial.value)  # Show the best value.


In [None]:
# Objective value reached by the best trial.
study.best_trial.value


In [None]:
# Hyperparameters sampled by the best trial.
study.best_trial.params

In [None]:
# Objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Hyperparameters used for the post-tuning run. They are typed in by hand rather than
# read from `study.best_trial.params`, so they do not follow a re-run of the study.
hidden_units=80
optimizer='RMSprop'
epochs=100  # NOTE(review): the recorded best trial below has epochs=30 and objective() only searches 5-40 - confirm 100 is intended
batchsize=26

# Best trial recorded from an earlier run of the study:
# {'batchsize': 26, 'epochs': 30, 'hidden_units': 80, 'optimizer': 'RMSprop'}

In [None]:
scores_shallow_rnn=[]
scores_rnn = []
scores_lstm = []

# Cross-validate both networks again, now with the tuned hyperparameters.
# The fold arrays are built right before that fold's models are trained, so every
# score is measured on its own fold. Building them in a separate loop first leaves
# only the LAST fold's arrays behind, and every training run would reuse them.
for sub_set,train_index, val_index in tscv.split(train_sc):

  # The last column of the scaled train set is the target, the rest are features.
  train_x = train_sc[train_index,:-1]
  val_x = train_sc[val_index,:-1]
  # NOTE(review): train_y / val_y are not passed to the create_*_model helpers;
  # presumably the helpers read them from the notebook's global scope - confirm.
  train_y = train_sc[train_index,-1]
  val_y = train_sc[val_index,-1]
  print (train_x.shape[0],1,train_x.shape[1])
  print (val_x.shape[0],1,val_x.shape[1])

  # Reshape to (samples, time steps = 1, features), the 3-D layout passed to the models.
  train_x_rs=train_x.reshape(train_x.shape[0],1,train_x.shape[1])
  val_x_rs=val_x.reshape(val_x.shape[0],1,val_x.shape[1])

  # Score of a fold = 100 - validation loss of the final epoch.
  #RNN
  history_rnn,model_RNN = create_RNN_model(train_x_rs,val_x_rs,hidden_units=hidden_units,optimizer=optimizer,epochs=epochs,batchsize=batchsize)
  val_rnn_score=100-history_rnn.history['val_loss'][-1]
  scores_rnn.append(val_rnn_score)

  #LSTM
  history_lstm,model_LSTM = create_LSTM_model(train_x_rs,val_x_rs,hidden_units=hidden_units,optimizer=optimizer,epochs=epochs,batchsize=batchsize)
  val_lstm_score=100-history_lstm.history['val_loss'][-1]
  scores_lstm.append(val_lstm_score)

print('Scores from each Iteration- Deep RNN: {}\n Scores from each Iteration- Deep LSTM: {}\n\n'.format(scores_rnn,scores_lstm))
print('Average CV Score- Deep RNN :{}\n Average CV Score- Depp LSTM :{}\n' .format(np.mean(scores_rnn),np.mean(scores_lstm)))

Validation Score after OPTUNA:

In [None]:
# Collapse the per-fold scores of the tuned run into a single score per model.
val_rnn_score, val_lstm_score = (
    np.mean(fold_scores) for fold_scores in (scores_rnn, scores_lstm)
)



In [None]:
# Validation score of each tuned network next to the naive baseline, rounded to one decimal.
models=['Deep RNN','Deep LSTM', 'Naive-Baseline',]
scores=[val_rnn_score,val_lstm_score,(100-MAPE_base_line)]

validation_score=pd.DataFrame({
    'Models': list(models),
    # formatted to one decimal as text first, then converted back to float below
    'validation_score': [f"{num:.1f}" for num in scores],
}).astype({'validation_score': float})

validation_score.info()


In [None]:

# Set the global font size BEFORE the figure exists. Axis-label and title text objects
# take their size from rcParams when the axes are created, so the result of this cell
# should not depend on whether an earlier cell already changed the setting.
matplotlib.rcParams.update({'font.size': 16})

fig, ax = plt.subplots(figsize = (10, 5))
#  Bar plot
ax.bar(validation_score['Models'], validation_score['validation_score'], color ='maroon',
       width = 0.5)

# Write each score just above its bar.
for x,y in zip(validation_score['Models'], validation_score['validation_score']):
    label = "{:.2f}".format(y)
    ax.annotate(label, # this is the text
                (x,y), # these are the coordinates to position the label
                textcoords="offset points", # how to position the text
                xytext=(0,10), # distance from text to points (x,y)
                ha='center') # horizontal alignment can be left, right or center

ax.set_xlabel("Model")
ax.set_ylabel("Model Score")
ax.set_title("Model Score After Hyper Parameter tuning:\n100-(Mean Absolute Percentage Error)\n")
plt.show()

# Prediction on Test Set

prediction

In [None]:
def unified_train_and_val(train,train_x,train_y,val,val_x,val_y):
  """Append each validation array to its training counterpart along the first axis.

  Returns the three merged arrays in the order (train, train_x, train_y); the
  inputs are not modified.
  """
  pairs = ((train, val), (train_x, val_x), (train_y, val_y))
  merged, merged_x, merged_y = (np.concatenate(pair, axis=0) for pair in pairs)
  return merged, merged_x, merged_y



In [None]:
# Last row of the training frame; Auto_regressive_prediction reads it to fill the
# lag columns of the first test row.
to_initial_test_row=train.iloc[-1] #train_unified[-1]
to_initial_test_row

In [None]:
#Create frame to test data:
test = prepare_data(test, num_of_lag_differences)
test=test.drop(['AverageTemperature'],axis=1)

In [None]:
def Auto_regressive_prediction(model,test):
  """Forecast the test period row by row, feeding each prediction into the next row.

  For every row of `test`, the columns from position 2 up to (but excluding)
  the last one are overwritten before predicting: the first row takes them
  from the notebook-level `to_initial_test_row` (value at position + 1), and
  every later row takes them from the previous restored row (value at
  position + 1), whose last entry is the previous prediction. The row is then
  scaled with the notebook-level `scaler`, the last column is dropped, and the
  rest is passed to `model.predict` as a (1, 1, n_features) array. The
  prediction is appended to the scaled features and mapped back with
  `scaler.inverse_transform`.

  Parameters
  ----------
  model : object with a `predict` method returning an array of shape (1, 1)
  test : pandas.DataFrame
      Prepared test frame; its last column holds the observed target.
      It is not modified (each row is copied before being overwritten).

  Returns
  -------
  (pred, MAPE)
      `pred` has the columns 'Original', 'Predicted', 'dt' (the index of
      `test`), 'Error' (absolute error) and 'APE' (absolute percentage error);
      `MAPE` is the mean of 'APE'.
  """
  test_y=test.iloc[:,-1]
  n_cols=test.shape[1]
  # Positions that get overwritten: everything between the first two columns and the target.
  # NOTE(review): presumably columns 0-1 are calendar features and the rest are lags - confirm.
  lag_positions=range(2,n_cols-1)

  def _predict_row(row):
    """Scale one raw row, predict its target and return (prediction, row restored to original units)."""
    scaled=scaler.transform(row.to_numpy().reshape(1,-1))
    features=scaled[:,:-1]
    # (samples=1, time steps=1, features); width is taken from the data instead of a fixed 14
    prediction=model.predict(features.reshape(1,1,features.shape[1]))
    # add the prediction before the inverse transform so it is restored with the features
    restored=scaler.inverse_transform(np.append(features,prediction,axis=1))
    return prediction,restored[0]

  '''                                          FIRST TEST RECORD
                                            -----------------------'''
  #fill first row based on known train data
  # .copy() + .iloc: explicit positional writes on an independent row, instead of
  # integer [] access on a label-indexed Series that may write back into `test`.
  first_row=test.iloc[0].copy()
  for position in lag_positions:
    first_row.iloc[position]=to_initial_test_row.iloc[position+1]
  print("new test_x:",first_row)

  #predict of the 1st sample:
  perdict_first,restored_first=_predict_row(first_row)
  print(perdict_first)
  print(perdict_first.shape)
  restored_rows=[restored_first]

  '''                                           ALL OTHER RECORDS
                                            -----------------------'''
  for row_number in range(1,len(test)):
    sample=test.iloc[row_number].copy()
    previous=restored_rows[-1]
    for position in lag_positions:
      sample.iloc[position]=previous[position+1]
    _,restored=_predict_row(sample)
    # collected in a list and stacked once, instead of growing a DataFrame with concat per row
    restored_rows.append(restored)

  pred = pd.DataFrame()
  pred['Original'] = test_y.values
  pred['Predicted'] = np.vstack(restored_rows)[:,-1]
  pred['dt'] = test.index
  # Calculate the error
  pred['Error'] = np.abs(pred['Original'] - pred['Predicted'])
  # divide by the magnitude so a negative observation cannot yield a negative percentage error
  pred['APE']=(pred['Error']/np.abs(pred['Original']))*100

  MAPE=np.mean(pred['APE'])
  return pred,MAPE




In [None]:
# Forecast the test period with the last trained RNN and LSTM models.
pred_rnn, MAPE_rnn = Auto_regressive_prediction(model_RNN,test)
pred_lstm, MAPE_lstm = Auto_regressive_prediction(model_LSTM,test)



In [None]:
def plot_prediction(df1,title1,df2,title2,df3,title3,x_axes='Date'):
  """Show predictions and percentage errors of three models side by side.

  Left panel: df1['Original'] plus the 'Predicted' column of each frame.
  Right panel: the 'APE' column of each frame.
  The x values come from df1 only: its 'dt' column when present, otherwise its
  index. `x_axes` is used as the x-axis title of both panels. The figure is
  shown immediately and nothing is returned.
  """
  fig = make_subplots(rows=1, cols=2,subplot_titles=('Prediction vs. Actual: ','Mean Absolute Precentage Error: '))

  x = df1['dt'] if 'dt' in df1.columns else df1.index

  # (frame, legend title, line colour, extra trace options) - the second model is drawn fully opaque
  series = (
      (df1, title1, 'purple', {'opacity': 0.5}),
      (df2, title2, 'blue', {}),
      (df3, title3, 'orange', {'opacity': 0.5}),
  )

  fig.add_scatter(x = x, y = df1['Original'], name = 'Original', row=1, col=1,mode='lines+markers', line=dict(color="black"),legendgroup = '1')
  for frame, title, colour, extra in series:
    fig.add_scatter(x = x, y = frame['Predicted'], name = 'Predicted: '+title, row=1, col=1,
                    mode='lines+markers', line=dict(color=colour), legendgroup = '1', **extra)

  for frame, title, colour, extra in series:
    fig.add_scatter(x = x, y = frame['APE'], name = 'MAPE-'+title, row=1, col=2,
                    mode='lines+markers', line=dict(color=colour), legendgroup = '2', **extra)

  fig.update_xaxes(title_text=x_axes)
  fig.update_layout(height=500,legend_tracegroupgap = 180)
  fig.show()

# ASSESS MODEL PERFORMANCE (GLOBAL DATA)

In [None]:
# Test-set score (100 - MAPE) of each model, rounded to one decimal.
models=['Deep RNN','Deep LSTM','Naive-Baseline']
scores=[(100-MAPE_rnn),(100-MAPE_lstm),(100-MAPE_base_line)]

test_score=pd.DataFrame({
    'Models': list(models),
    # formatted to one decimal as text first, then converted back to float below
    'test_score': [f"{num:.1f}" for num in scores],
}).astype({'test_score': float})
test_score

In [None]:
# Set the global font size BEFORE the figure exists. Axis-label and title text objects
# take their size from rcParams when the axes are created, so the result of this cell
# should not depend on whether an earlier cell already changed the setting.
matplotlib.rcParams.update({'font.size': 16})

fig, ax = plt.subplots(figsize = (10, 5))
#  Bar plot
ax.bar(test_score['Models'], test_score['test_score'], color ='blue',
       width = 0.5)

# Write each score just above its bar.
for x,y in zip(test_score['Models'], test_score['test_score']):
    label = "{:.2f}".format(y)
    ax.annotate(label, # this is the text
                (x,y), # these are the coordinates to position the label
                textcoords="offset points", # how to position the text
                xytext=(0,10), # distance from text to points (x,y)
                ha='center') # horizontal alignment can be left, right or center

ax.set_xlabel("Model")
ax.set_ylabel("Model Score")
ax.set_title("Final Model Score- test set: 100-(Mean Absolute Percentage Error)\n")
plt.show()

In [None]:
# Test-set MAPE of each model, followed by the matching score (100 - MAPE).
_report_lines = (
    ('MAPE DEEP RNN:', MAPE_rnn),
    ('MAPE DEEP LSTM:', MAPE_lstm),
    ('MAPE base_line:', MAPE_base_line),
    ('\n\nAccuracy RNN:', 100-MAPE_rnn),
    ('Accuracy LSTM:', 100-MAPE_lstm),
    ('Accuracy base_line:', 100-MAPE_base_line),
)
for _caption, _metric in _report_lines:
    print(_caption, _metric)

In [None]:
def PredictionGrouedby_MonthYear(df):
  """Average a prediction frame per calendar year and per calendar month.

  Parameters
  ----------
  df : pandas.DataFrame
      Must have a 'dt' column whose text form starts with 'YYYY-MM'
      (strings such as '2010-01-01'). Integer 'Year' and 'Month' columns
      are added to `df` in place.

  Returns
  -------
  (prediction_per_year, prediction_per_month)
      Means of the numeric columns, indexed by 'Year' (without 'Month') and by
      'Month' (without 'Year') respectively. The monthly frame is also printed.
  """
  # Split the date text once; astype(str) also covers date-like values.
  date_parts=df['dt'].astype(str).str.split('-')
  df['Year']=date_parts.str[0].astype(int)
  df['Month']=date_parts.str[1].astype(int)

  # numeric_only=True keeps the text 'dt' column out of the mean; recent pandas
  # versions raise on non-numeric columns instead of silently dropping them.
  prediction_per_year=df.groupby(['Year']).mean(numeric_only=True)
  prediction_per_year=prediction_per_year.drop(['Month'],axis=1)

  prediction_per_month=df.groupby(['Month']).mean(numeric_only=True)
  prediction_per_month=prediction_per_month.drop(['Year'],axis=1)
  print(prediction_per_month)

  return prediction_per_year,prediction_per_month



In [None]:
# Yearly and monthly means of each model's predictions and errors.
# Note: each call also adds 'Year' and 'Month' columns to the frame passed in.
pred_rnn_by_year,pred_rnn_by_month=PredictionGrouedby_MonthYear(pred_rnn)
pred_lstm_by_year,pred_lstm_by_month=PredictionGrouedby_MonthYear(pred_lstm)
pred_base_line_by_year,pred_base_line_by_month=PredictionGrouedby_MonthYear(pred_base_line)
# pred_rf_by_year,pred_rf_by_month=PredictionGrouedby_MonthYear(pred_rf)

In [None]:
# Monthly means of the RNN predictions and errors.
pred_rnn_by_month

In [None]:
# Overall test-set MAPE per model, then the yearly and the monthly comparison plots.
print('MAPE-RNN model: ',MAPE_rnn,'\nMAPE-LSTM model: ',MAPE_lstm,'\nMAPE-Base-Line- Persistence model ',MAPE_base_line)
plot_prediction(pred_rnn_by_year,"RNN model",pred_lstm_by_year,"LSTM model",pred_base_line_by_year,"Base-Line- Persistence model",x_axes='Year') #,pred_rf_by_year,"RF model"
plot_prediction(pred_rnn_by_month,"RNN model",pred_lstm_by_month,"LSTM model",pred_base_line_by_month,"Base-Line- Persistence model:",x_axes='Month') #,pred_rf_by_month,"RF model"


In [None]:
# Absolute percentage error of every model for each test date, on a single panel.
# subplot_titles takes one entry per subplot, so the title is passed as a one-element
# tuple: ('text') without the trailing comma is just a string, and a string would be
# indexed character by character, leaving only its first character as the title.
fig = make_subplots(rows=1, cols=1,subplot_titles=('Mean Absolute Precentage Error: ',))

# x values: the 'dt' column when present, otherwise the frame's index.
if 'dt' in pred_rnn.columns:
  x = pred_rnn['dt']
else:
  x = pred_rnn.index

fig.add_scatter(x = x, y = pred_base_line['APE'], name = 'MAPE-Base-Line- Persistence model', opacity = 0.5,row=1, col=1,mode='lines+markers', line=dict(color="orange"),legendgroup = '2')
fig.add_scatter(x = x, y = pred_rnn['APE'], name = 'MAPE-RNN', row=1, col=1,mode='lines+markers', line=dict(color='purple'),legendgroup = '2')
fig.add_scatter(x = x, y = pred_lstm['APE'], name = 'MAPE- LSTM', row=1, col=1,mode='lines+markers', line=dict(color='blue'),legendgroup = '2')

fig.update_layout(height=700,legend_tracegroupgap = 180)
fig.show()

