In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import regionmask
import cartopy.crs as ccrs
import cartopy.feature as cfeature

from sklearn import svm
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# %matplotlib inline

# STEP 1: Let's ingest (and inspect) the tourism data

**Source:** https://data.europa.eu/euodp/en/data/dataset/dKveFoy4vFts8fta4Q4SIw

**Description:**
An arrival is defined as a person (tourist) who arrives at a tourist accommodation establishment and checks in. The arrivals of same-day visitors spending only a few hours during the day (no overnight stay, the date of arrival and departure are the same) at the establishment are excluded.

#### In the code below, you can change `country_code1` and `country_code2` to any two European countries available in the dataset.

The country codes are explained here:
https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:Country_codes

If you change them, remember to also update the country names passed to `extract_data_for_country` in STEP 2.

In [None]:
# Load the tab-separated arrivals table; the literal ': ' is read as a missing value
tourism_df = pd.read_csv('./data/tourism/tour_occ_arm.tsv', sep='\t', na_values=[': '])

# Keep the first column label untouched and parse every other label
# (pattern '%YM%m ') into a Timestamp, which is easier to slice and plot
key_label = tourism_df.columns[0]
date_labels = [pd.to_datetime(label, format='%YM%m ') for label in tourism_df.columns[1:]]
new_columns = [key_label] + date_labels
tourism_df.columns = new_columns

# Strip the letters b, c, d and e wherever they appear inside string cells
# NOTE(review): presumably these are data-quality flags attached to the numbers - confirm
flag_letters = ['b', 'c', 'd', 'e']
tourism_df = tourism_df.replace(to_replace=flag_letters, value='', regex=True)
tourism_df.head()

In [None]:
def _arrivals_for_country(tourism_df, country_code):
    """Return the monthly arrivals of one country as a single-column DataFrame.

    Parameters
    ----------
    tourism_df : pandas.DataFrame
        Table whose first column holds keys of the form 'TOTAL,NR,I551,<code>'
        and whose remaining columns are one per month.
    country_code : str
        Country code appended to the key prefix, e.g. 'AT'.

    Returns
    -------
    pandas.DataFrame
        One integer column named 'Arrivals,<code>', indexed by the month labels
        in reversed column order.

    Raises
    ------
    ValueError
        If the table does not contain exactly one row for the requested key.
    """
    row_key = f'TOTAL,NR,I551,{country_code}'
    country_rows = tourism_df[tourism_df.iloc[:, 0] == row_key]
    if len(country_rows) != 1:
        raise ValueError(
            f'expected exactly one row with key {row_key!r}, found {len(country_rows)}'
        )

    # Transposing data so Timestamps go in index:
    #   .T          -> one row per original column
    #   [1:]        -> drop the row that holds the key string
    #   [-1:0:-1]   -> reverse the remaining rows; the slice stops before
    #                  position 0, so the first date column of the file is left out
    # NOTE(review): confirm that leaving out that first date column is intended.
    arrivals = country_rows.T[1:][-1:0:-1].astype(int)
    arrivals.columns = [f'Arrivals,{country_code}']
    return arrivals


# extracting data from a specific country, by country code
country_code1 = 'AT'
country1_tourism_df = _arrivals_for_country(tourism_df, country_code1)

# extracting data from a specific country, by country code
country_code2 = 'PT'
country2_tourism_df = _arrivals_for_country(tourism_df, country_code2)

In [None]:
# Quick sanity check: first rows of the arrivals series extracted for country_code1
country1_tourism_df.head()

In [None]:
# The style is set globally, so it also applies to every figure drawn after this cell
plt.style.use('fivethirtyeight')

# One line per country, both on the shared date axis taken from each frame's index
fig, ax = plt.subplots(figsize=(18, 8))
for arrivals_df, code in ((country1_tourism_df, country_code1),
                          (country2_tourism_df, country_code2)):
    ax.plot(arrivals_df.index, arrivals_df.values, label=code)

ax.legend()
ax.set_title('Arrivals at tourist accommodation establishments - monthly data')
plt.show()

In [None]:
def moving_average(x, w):
    """Unweighted moving average of a 1-D sequence over full windows only.

    Parameters
    ----------
    x : array-like, 1-D
        Input samples.
    w : int
        Window length in samples.

    Returns
    -------
    numpy.ndarray
        ``len(x) - w + 1`` averages; element ``k`` is the mean of ``x[k:k + w]``.
        Empty when ``x`` holds fewer than ``w`` samples (no full window exists).

    Raises
    ------
    ValueError
        Propagated from numpy when ``w`` is smaller than 1.
    """
    x = np.asarray(x)
    # np.convolve(..., 'valid') swaps its operands when the kernel is longer than
    # the data and would return values that are not window means; return nothing instead.
    if w > x.size:
        return np.empty(0, dtype=float)
    return np.convolve(x, np.ones(w), 'valid') / w

# Flatten the single-column frame to a 1-D series and smooth it with two window lengths
country1_arrivals = country1_tourism_df.values.reshape(-1)
country1_dates = country1_tourism_df.index
country1_ave12 = moving_average(country1_arrivals, 12)
country1_ave6 = moving_average(country1_arrivals, 6)

plt.figure(figsize=(18,8))
plt.plot(country1_dates, country1_arrivals, label='monthly data')
# A 'valid' moving average has w-1 fewer points than the input. Each average is
# drawn at the last month of its window (index w-1 onwards) so the smoothed
# curves line up in time with the monthly data instead of starting at x=0.
plt.plot(country1_dates[12 - 1:], country1_ave12, label='moving average 12 months')
plt.plot(country1_dates[6 - 1:], country1_ave6, label='moving average 6 months')

plt.legend()
plt.title(f'Arrivals at tourist accommodation establishments - {country_code1}')
plt.show()

In [None]:
# Flatten the single-column frame to a 1-D series and smooth it with two window lengths
country2_arrivals = country2_tourism_df.values.reshape(-1)
country2_dates = country2_tourism_df.index
country2_ave12 = moving_average(country2_arrivals, 12)
country2_ave6 = moving_average(country2_arrivals, 6)

plt.figure(figsize=(18,8))
plt.plot(country2_dates, country2_arrivals, label='monthly data')
# A 'valid' moving average has w-1 fewer points than the input. Each average is
# drawn at the last month of its window (index w-1 onwards) so the smoothed
# curves line up in time with the monthly data instead of starting at x=0.
plt.plot(country2_dates[12 - 1:], country2_ave12, label='moving average 12 months')
plt.plot(country2_dates[6 - 1:], country2_ave6, label='moving average 6 months')

plt.legend()
plt.title(f'Arrivals at tourist accommodation establishments - {country_code2}')
plt.show()

### *Can you get any interesting insight from the data for the countries in your selection?*

# STEP 2: Let's ingest (and inspect) the weather data

- For the training of the models, we will use ERA5 Reanalysis data. 

- We will use previously downloaded monthly data, so we have the same time granularity as the tourism arrivals data

- We will explore the `t2m` (2 m temperature) and `tp` (total precipitation) variables, but you may have more variables than that, depending on what you downloaded


In [None]:
# Open the monthly weather file; the same object is reused by the cells below
wx_data = xr.open_dataset('./data/weather/era5_EU_monthly.nc')

# The data are given on a plain lon/lat grid, so the same CRS is used both
# for the map projection and as the transform of the plotted field
data_crs = ccrs.PlateCarree()
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection=data_crs)

# Quick look at the t2m field for June 2018
t2m_june_2018 = wx_data.t2m.sel(time='2018-06')
t2m_june_2018.plot(ax=ax, transform=data_crs, cmap='gist_ncar')
plt.show()

In [None]:
# Build a country_mask on the grid of wx_data so that data for any country
# in the region set can be selected later by comparing against its region id.
# this might take some time (~30secs)
countries_50 = regionmask.defined_regions.natural_earth.countries_50
country_mask = countries_50.mask(
    wx_data,
    lon_name='longitude',
    lat_name='latitude',
)

In [None]:
def extract_data_for_country(country_name, country_mask, wx_data):
    """Cut the weather data down to the grid cells of a single country.

    Parameters
    ----------
    country_name
        Key passed to ``countries_50.map_keys`` to look up the region id
        (e.g. 'Austria').
    country_mask
        Mask that is compared against the region id; cells where it equals
        the id are kept.
    wx_data
        xarray object with 'latitude' and 'longitude' dimensions.

    Returns
    -------
    ``wx_data`` with every cell outside the country masked out, and with the
    latitudes and longitudes that contain no remaining data dropped.
    """
    countries = regionmask.defined_regions.natural_earth.countries_50
    inside_country = country_mask == countries.map_keys(country_name)
    masked = wx_data.where(inside_country)
    # Trim the grid to the country's bounding box: drop all-missing rows, then columns
    return masked.dropna('latitude', how='all').dropna('longitude', how='all')
    

In [None]:
# this might also take some time, but less (~8secs)
# NOTE: the country names are written out here independently of
# country_code1 ('AT') / country_code2 ('PT'); keep them in sync by hand
# when changing the country selection.
country1_wx = extract_data_for_country(country_name='Austria',
                                       country_mask=country_mask,
                                       wx_data=wx_data)
country2_wx = extract_data_for_country(country_name='Portugal',
                                       country_mask=country_mask,
                                       wx_data=wx_data)

In [None]:
# Available country names, for reference: every entry of `.names` is a key that
# `map_keys` understands, i.e. a valid `country_name` for extract_data_for_country
regionmask.defined_regions.natural_earth.countries_50.names

In [None]:
# Map of the t2m field of the first country for June 2018, drawn in a Mercator
# projection; the data themselves are on a lon/lat (PlateCarree) grid
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection=ccrs.Mercator())
country1_t2m_june = country1_wx.t2m.sel(time='2018-06')
p = country1_t2m_june.plot(ax=ax, transform=ccrs.PlateCarree(), cmap='Reds')

In [None]:
# Map of the tp field of the second country for June 2018, drawn in a Mercator
# projection; the data themselves are on a lon/lat (PlateCarree) grid
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection=ccrs.Mercator())
country2_tp_june = country2_wx.tp.sel(time='2018-06')
p = country2_tp_june.plot(ax=ax, transform=ccrs.PlateCarree(), cmap='jet')

# Can we find interesting correlations between the weather and arrivals data?

In [None]:
fig = plt.figure(figsize=(18,8))
ax = fig.add_subplot(111)
ax2 = ax.twinx()

# Both series are drawn against their own timestamps, so they stay aligned in
# time even if the two datasets do not start in the same month or differ in length.
ax.plot(country1_tourism_df.index,
        country1_tourism_df.values,
        label='Arrivals',
        color='blue')

# Spatial mean over the country's grid cells (dimensions named explicitly rather
# than by position), scaled by 1000 to match the 'mm' axis label below
country1_tp_mm = country1_wx.tp.mean(dim=['latitude', 'longitude']) * 1000

# On a date axis the bar width is expressed in days; 20 leaves a gap between months
ax2.bar(country1_tp_mm['time'].values,
        country1_tp_mm.values,
        width=20,
        label='Precipitation', color='red')

ax2.set_ylim(0,30)

ax.set_ylabel('Arrivals at tourist accommodations')
ax2.set_ylabel('mm')

ax.grid(False)
ax2.grid(False)

fig.legend(loc="upper right")
plt.title(f'{country_code1} Precipitation x Tourism data')
plt.show()

In [None]:
# Display the arrivals series of the first country for inspection
country1_tourism_df

In [None]:
fig = plt.figure(figsize=(18,8))
ax = fig.add_subplot(111)
ax2 = ax.twinx()

# Both series are drawn against their own timestamps, so they stay aligned in
# time even if the two datasets do not start in the same month or differ in length.
ax.plot(country1_tourism_df.index,
        country1_tourism_df.values,
        label='Arrivals',
        color='blue')

# Spatial mean over the country's grid cells (dimensions named explicitly rather
# than by position); subtracting 273.15 converts Kelvin to the degC of the axis label
country1_t2m_degc = country1_wx.t2m.mean(dim=['latitude', 'longitude']) - 273.15

ax2.plot(country1_t2m_degc['time'].values,
         country1_t2m_degc.values,
         label='Temperature',
         color='red')

ax2.set_ylim(-30,50)
ax.set_ylabel('Arrivals at tourist accommodations')
ax2.set_ylabel('degC')

ax.grid(False)
ax2.grid(False)

fig.legend(loc="upper right")
plt.title(f'{country_code1} Temperature x Tourism data')
plt.show()

# STEP 3: Let's do some modelling

## Important considerations:
- Training/Test split
- Features to be used
- Algorithm to choose
- Always check your data

In [None]:
# (first month, last month) of each period. Both the xarray `slice` selection and
# the pandas label slice that consume these tuples include their end points, so
# the training period must stop one month before the test period starts;
# otherwise January 2015 would be part of both the training and the test set.
train_period = ('2000-1-1','2014-12-1')
test_period = ('2015-1-1','2019-1-1')

In [None]:
# Predictor: per time step, the maximum t2m over the two non-time axes of the
# country grid. Target: the arrivals of the first country, as floats.
country1_t2m_max = country1_wx.t2m.max(axis=(1,2))

# slicing according to train_period
train_start, train_end = train_period
X_train = country1_t2m_max.sel(time=slice(train_start, train_end)).values.reshape(-1, 1)
Y_train = (country1_tourism_df[pd.Timestamp(train_start):pd.Timestamp(train_end)]
           .values.reshape(-1, 1).astype(float))

# slicing according to test_period
test_start, test_end = test_period
X_test = country1_t2m_max.sel(time=slice(test_start, test_end)).values.reshape(-1, 1)
Y_test = (country1_tourism_df[pd.Timestamp(test_start):pd.Timestamp(test_end)]
          .values.reshape(-1, 1).astype(float))

In [None]:
# Always check your data: the X and Y arrays of each split should have the same number of rows
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

In [None]:
# Support Vector Regression with a polynomial kernel, trained on the first feature set
svr = svm.SVR(kernel='poly', gamma='scale')
svr.fit(X_train, Y_train.ravel())  # the estimator expects a 1-D target
Y_pred_svr = svr.predict(X_test)

# Score the predictions on the held-out test period
svr_r2 = metrics.r2_score(Y_test, Y_pred_svr)
svr_mae = metrics.mean_absolute_error(Y_test, Y_pred_svr)
svr_mse = metrics.mean_squared_error(Y_test, Y_pred_svr)
svr_rmse = np.sqrt(svr_mse)

print(f' Support Vector Regression R2   = {svr_r2:17.6f}',
      f' Support Vector Regression MAE  = {svr_mae:17.6f}',
      f' Support Vector Regression RMSE = {svr_rmse:17.6f}',
      sep='\n')

In [None]:
# Random forest trained on the first feature set.
# random_state is fixed so the bootstrap samples - and therefore the scores
# printed below - are the same on every run of the notebook.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

rf.fit(X_train, Y_train.reshape(-1))  # the estimator expects a 1-D target

Y_pred_rf = rf.predict(X_test)

# Score the predictions on the held-out test period
rf_r2 = metrics.r2_score(Y_test, Y_pred_rf)
rf_mae = metrics.mean_absolute_error(Y_test, Y_pred_rf)
rf_rmse = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_rf))

print(f' Random Forest Regression R2   = {rf_r2:17.6f}')
print(f' Random Forest Regression MAE  = {rf_mae:17.6f}')
print(f' Random Forest Regression RMSE = {rf_rmse:17.6f}')

In [None]:
# Gradient boosting trained on the first feature set.
# random_state is fixed so that repeated runs of the notebook give the same model
# and the same scores.
gb = GradientBoostingRegressor(random_state=42)

gb.fit(X_train, Y_train.reshape(-1))  # the estimator expects a 1-D target

Y_pred_gb = gb.predict(X_test)

# Score the predictions on the held-out test period
gb_r2 = metrics.r2_score(Y_test, Y_pred_gb)
gb_mae = metrics.mean_absolute_error(Y_test, Y_pred_gb)
gb_rmse = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_gb))

print(f' Gradient Boosting Regression R2   = {gb_r2:17.6f}')
print(f' Gradient Boosting Regression MAE  = {gb_mae:17.6f}')
print(f' Gradient Boosting Regression RMSE = {gb_rmse:17.6f}')

### Let's visualize these predictions

In [None]:
plt.figure(figsize=(16,10))
# One month-start stamp per sample: 'MS' includes both end points of a period,
# so the date axes have exactly as many entries as the arrays sliced with the
# same (end-inclusive) periods and no sample has to be dropped when plotting.
train_index = pd.date_range(train_period[0], train_period[1], freq='MS')
test_index = pd.date_range(test_period[0], test_period[1], freq='MS')
plt.plot(train_index, Y_train, label='Training', color='black', lw=2)
plt.plot(test_index, Y_test, label='Test', lw=2)
plt.plot(test_index, Y_pred_svr, label= 'SVR', lw=3)
plt.plot(test_index, Y_pred_rf, label= 'RF', lw=3)
plt.plot(test_index, Y_pred_gb, label= 'GB', lw=2)
plt.legend()
plt.title(f'Model Results for {country_code1} - 1st Version')
plt.show()

# Why so bad?

- Our predictor (temperature) only explains the variability of the data on a short timescale (essentially: winter vs. summer)
- We need another feature that adds information about longer timescales, so the model can also learn the long-term trend in arrivals
- The feature that does this may surprise you


In [None]:
# adding month number as a feature
# One month-start stamp per sample; 'MS' includes both end points of a period,
# matching the end-inclusive slices used to build X_train / X_test.
train_months = pd.date_range(train_period[0], train_period[1], freq='MS')
test_months = pd.date_range(test_period[0], test_period[1], freq='MS')

# Month number = calendar months elapsed since the first training month.
# Deriving it from the dates gives every month the same number in both sets,
# whether the two periods touch, overlap or leave a gap.
first_month = train_months[0]
month_train_array = np.asarray(
    (train_months.year - first_month.year) * 12 + (train_months.month - first_month.month)
)
month_test_array = np.asarray(
    (test_months.year - first_month.year) * 12 + (test_months.month - first_month.month)
)

In [None]:
# Second feature set: the weather predictor side by side with the month number,
# one row per sample and one column per feature
X_train_2 = np.stack([X_train[:, 0], month_train_array], axis=1)
X_test_2 = np.stack([X_test[:, 0], month_test_array], axis=1)

# Check the shapes again: X now has two columns, Y is unchanged
X_train_2.shape, X_test_2.shape, Y_train.shape, Y_test.shape

In [None]:
# Support Vector Regression with a polynomial kernel, trained on the second feature set
svr = svm.SVR(kernel='poly', gamma='scale')
svr.fit(X_train_2, Y_train.ravel())  # the estimator expects a 1-D target
Y_pred_svr = svr.predict(X_test_2)

# Score the predictions on the held-out test period
svr_r2 = metrics.r2_score(Y_test, Y_pred_svr)
svr_mae = metrics.mean_absolute_error(Y_test, Y_pred_svr)
svr_mse = metrics.mean_squared_error(Y_test, Y_pred_svr)
svr_rmse = np.sqrt(svr_mse)

print(f' Support Vector Regression R2   = {svr_r2:17.6f}',
      f' Support Vector Regression MAE  = {svr_mae:17.6f}',
      f' Support Vector Regression RMSE = {svr_rmse:17.6f}',
      sep='\n')


In [None]:
# Random forest trained on the second feature set.
# random_state is fixed so the bootstrap samples - and therefore the scores
# printed below - are the same on every run of the notebook.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

rf.fit(X_train_2, Y_train.reshape(-1))  # the estimator expects a 1-D target

Y_pred_rf = rf.predict(X_test_2)

# Score the predictions on the held-out test period
rf_r2 = metrics.r2_score(Y_test, Y_pred_rf)
rf_mae = metrics.mean_absolute_error(Y_test, Y_pred_rf)
rf_rmse = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_rf))

print(f' Random Forest Regression R2   = {rf_r2:17.6f}')
print(f' Random Forest Regression MAE  = {rf_mae:17.6f}')
print(f' Random Forest Regression RMSE = {rf_rmse:17.6f}')

In [None]:
# Gradient boosting trained on the second feature set.
# random_state is fixed so that repeated runs of the notebook give the same model
# and the same scores.
gb = GradientBoostingRegressor(random_state=42)

gb.fit(X_train_2, Y_train.reshape(-1))  # the estimator expects a 1-D target
Y_pred_gb = gb.predict(X_test_2)

# Score the predictions on the held-out test period
gb_r2 = metrics.r2_score(Y_test, Y_pred_gb)
gb_mae = metrics.mean_absolute_error(Y_test, Y_pred_gb)
gb_rmse = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_gb))

print(f' Gradient Boosting Regression R2   = {gb_r2:17.6f}')
print(f' Gradient Boosting Regression MAE  = {gb_mae:17.6f}')
print(f' Gradient Boosting Regression RMSE = {gb_rmse:17.6f}')

## These are much better results, let's visualize them

In [None]:
plt.figure(figsize=(16,10))
# One month-start stamp per sample: 'MS' includes both end points of a period,
# so the date axes have exactly as many entries as the arrays sliced with the
# same (end-inclusive) periods and no sample has to be dropped when plotting.
train_index = pd.date_range(train_period[0], train_period[1], freq='MS')
test_index = pd.date_range(test_period[0], test_period[1], freq='MS')
plt.plot(train_index, Y_train, label='Training', color='black', lw=2)
plt.plot(test_index, Y_test, label='Test', lw=2)
plt.plot(test_index, Y_pred_svr, label='SVR', lw=3)
plt.plot(test_index, Y_pred_rf, label= 'RF', lw=3)
plt.plot(test_index, Y_pred_gb, label= 'GB', lw=2)
plt.title(f'Model Results for {country_code1} - 2nd Version')
plt.legend()
plt.show()