# Prepare labelled input for the neural network (NN)
# (i.e. locations where the moss & lichen fractional cover changed, and the related meteorological parameters from ERA5-Land)

# Copernicus Global Land Cover data from 2015-01-01 to 2019-12-31, already available as a netCDF file stored on EOSC (CESNET)
## Troms og Finnmark
### Mosses and lichens, bare, grass, shrubs and trees

In [None]:
!date

In [None]:
pip install vaex

### Define s3 storage parameters

In [None]:
import s3fs
import xarray as xr
import pandas as pd
import h3
import vaex

In [None]:
# Connection settings of the S3-compatible object store
client_kwargs = dict(endpoint_url='https://object-store.cloud.muni.cz')
# anon=False: connect with credentials rather than anonymously
store = s3fs.S3FileSystem(client_kwargs=client_kwargs, anon=False)
# Show the detailed content of 'Data', asking for a refreshed listing
store.ls('Data', refresh=True, detail=True)

### Define s3 store for the **netCDF file**

In [None]:
s3path = 'Data/C_GlobalLandCover_20150101_20190101_Troms-Finnmark.nc'

In [None]:
GLC_AOI = xr.open_dataset(store.open(s3path))

In [None]:
GLC_AOI

In [None]:
GLC_AOI = GLC_AOI.rename(x='lon', y='lat', t='time')

In [None]:
# Variables of the land-cover dataset that are not directly of interest here
unused_variables = [
    'crs',
    'Crops_CoverFraction_layer',
    'Discrete_Classification_map',
    'Discrete_Classification_proba',
    'Forest_Type_layer',
    'Snow_CoverFraction_layer',
    'BuiltUp_CoverFraction_layer',
    'PermanentWater_CoverFraction_layer',
    'SeasonalWater_CoverFraction_layer',
    'DataDensityIndicator',
    'Change_Confidence_layer',
    'dataMask',
]
GLC_AOI = GLC_AOI.drop_vars(unused_variables)

In [None]:
# Give the remaining cover-fraction layers short names
# ('Lichen' stands for the moss & lichen layer)
short_names = {
    'Bare_CoverFraction_layer': 'Bare',
    'Grass_CoverFraction_layer': 'Grass',
    'MossLichen_CoverFraction_layer': 'Lichen',
    'Shrub_CoverFraction_layer': 'Shrub',
    'Tree_CoverFraction_layer': 'Tree',
}
GLC_AOI = GLC_AOI.rename(short_names)

In [None]:
GLC_AOI

In [None]:
# Troms & Finnmark Global Land Cover area
# min()/max() return 0-d DataArrays; convert them to plain floats so the
# bounds print as numbers and can be handed to non-xarray code as scalars.
GLC_AOI_min_lon = float(GLC_AOI.lon.min())
GLC_AOI_max_lon = float(GLC_AOI.lon.max())
GLC_AOI_min_lat = float(GLC_AOI.lat.min())
GLC_AOI_max_lat = float(GLC_AOI.lat.max())
print(GLC_AOI_min_lon, GLC_AOI_max_lon, GLC_AOI_min_lat, GLC_AOI_max_lat)

### The two cells below define a very small region for testing purposes only - skip them to keep the whole Troms-Finnmark area

In [None]:
# Small region 
Small_AOI_min_lon = 19.65
Small_AOI_max_lon = 19.7
Small_AOI_min_lat = 69.05
Small_AOI_max_lat = 69.1

In [None]:
#df = GLC_AOI.isel(time = 0).sel(lat=slice(Small_AOI_max_lat, Small_AOI_min_lat), lon=slice(Small_AOI_min_lon, Small_AOI_max_lon)).to_dataframe()

In [None]:
from h3 import h3
import folium

def visualize_hexagons(hexagons, color="red", folium_map=None):
    """
    Draw the outline of each H3 hexagon on a folium map.

    hexagons is an iterable of H3 cell indexes, e.g. [hex1, hex2].
    color is the line colour given to folium.PolyLine.
    folium_map is an existing map to draw on; when it is None a new map,
    centred on the mean of all outline vertices, is created.

    Returns the map the outlines were added to.
    Raises ValueError when there is no hexagon to draw and no folium_map is
    given, because a new map cannot be centred without any vertex.
    """
    polylines = []
    lat = []
    lng = []
    for cell in hexagons:
        polygons = h3.h3_set_to_multi_polygon([cell], geo_json=False)
        # flatten polygons into loops.
        outlines = [loop for polygon in polygons for loop in polygon]
        # Only the first loop is drawn; its first vertex is repeated to close it
        first_loop = outlines[0]
        polyline = first_loop + [first_loop[0]]
        lat.extend(vertex[0] for vertex in polyline)
        lng.extend(vertex[1] for vertex in polyline)
        polylines.append(polyline)

    if folium_map is None:
        if not lat:
            raise ValueError("hexagons is empty: cannot centre a new map; pass folium_map or at least one hexagon")
        m = folium.Map(location=[sum(lat)/len(lat), sum(lng)/len(lng)], zoom_start=13, tiles='cartodbpositron')
    else:
        m = folium_map
    for polyline in polylines:
        my_PolyLine = folium.PolyLine(locations=polyline, weight=8, color=color)
        m.add_child(my_PolyLine)
    return m
    

def visualize_polygon(polyline, color):
    """
    Draw a closed polygon on a new folium map centred on the polygon.

    polyline is a sequence of (lat, lng) vertices; it is left unmodified.
    color is the line colour given to folium.PolyLine.

    Returns the new folium map.
    """
    # Close the ring on a copy so that the caller's list is not altered
    closed = list(polyline)
    closed.append(closed[0])
    lat = [p[0] for p in closed]
    lng = [p[1] for p in closed]
    m = folium.Map(location=[sum(lat)/len(lat), sum(lng)/len(lng)], zoom_start=13, tiles='cartodbpositron')
    my_PolyLine = folium.PolyLine(locations=closed, weight=8, color=color)
    m.add_child(my_PolyLine)
    return m

In [None]:
# H3 resolution used for every hexagon of this cell
hex_resolution = 9

# Hexagon containing the centre of the small test region (lat, lng, hex resolution)
Small_AOI_center = h3.geo_to_h3(
    (Small_AOI_min_lat + Small_AOI_max_lat) / 2,
    (Small_AOI_min_lon + Small_AOI_max_lon) / 2,
    hex_resolution,
)

# Outline in red the hexagon found at each corner of the land-cover area;
# the first call creates the map, the following ones draw on it
corners = [
    (GLC_AOI_min_lat, GLC_AOI_min_lon),
    (GLC_AOI_max_lat, GLC_AOI_min_lon),
    (GLC_AOI_max_lat, GLC_AOI_max_lon),
    (GLC_AOI_min_lat, GLC_AOI_max_lon),
]
m = None
for corner_lat, corner_lon in corners:
    corner_hex = h3.geo_to_h3(corner_lat, corner_lon, hex_resolution)
    m = visualize_hexagons([corner_hex], color="red", folium_map=m)
#m = visualize_hexagons([Small_AOI_center], color="green", folium_map=m)
display(m)

In [None]:
GLC_AOI

In [None]:
de = GLC_AOI.to_dataframe()

In [None]:
de = de.reset_index()

In [None]:
de

In [None]:
# Only keep the locations where there is lichen,
# i.e. a 'Lichen' value strictly above 0 and at most 100
has_lichen = de['Lichen'].gt(0) & de['Lichen'].le(100)
dd = de.loc[has_lichen]

In [None]:
dd

## Each year in a separate dataset

In [None]:
# Year of the land-cover input (x); the target (y) is the following year
Year = 2016
# First and last (month, day) of the ERA5-Land period
Month_start, Day_start = 1, 1
Month_end, Day_end = 6, 30
print(f'x = WLC({Year}) joined with ERA5land({Year}-{Month_start}-{Day_start}/{Year}-{Month_end}-{Day_end})')
print(f'y = WLC({Year + 1})')

In [None]:
# Only keep locations where there is some moss & lichen, for the current year (df)
# and for the following year (dg).
# Both masks are built from dd itself, and each selection is copied so that the
# column assignments made in the following cells modify independent DataFrames
# rather than slices of dd (which triggers SettingWithCopyWarning).
df = dd.loc[dd['time'] == str(Year) + '-01-01'].copy()
dg = dd.loc[dd['time'] == str(Year + 1) + '-01-01'].copy()

In [None]:
df

In [None]:
# Replace NaNs by 0 in every cover column, for the current-year and next-year tables
for col in ['Bare', 'Grass', 'Lichen', 'Shrub', 'Tree']:
    print(col)
    for frame in (df, dg):
        frame[col] = frame[col].fillna(0)

In [None]:
# Calculate total fractional coverage of bare, grass, lichen, shrub and tree (should be 100)
df['Total']  = (df['Bare'] + df['Grass'] + df['Lichen'] + df['Shrub'] + df['Tree'])
dg['Total']  = (dg['Bare'] + dg['Grass'] + dg['Lichen'] + dg['Shrub'] + dg['Tree'])

In [None]:
df

In [None]:
# Normalize the fractional cover: divide each cover column by the row 'Total'
for col in ['Bare', 'Grass', 'Lichen', 'Shrub', 'Tree']:
    print(col)
    for frame in (df, dg):
        frame[col] = frame[col] / frame['Total']

In [None]:
# Drop the *Total* column
df = df.drop(['Total'], axis=1)
dg = dg.drop(['Total'], axis=1)

In [None]:
df

In [None]:
# Convert to VAEX
dvx = vaex.from_pandas(df)
dvy = vaex.from_pandas(dg)

In [None]:
dvx

In [None]:
# Find the corresponding ERA5-Land lat-lon indices for every land-cover location
# NOTE(review): 15.59 and 68.35 look like the origin of the ERA5-Land subset, 0.1 its
# grid spacing in degrees and 28 its last latitude index - confirm against the
# coordinates of the ERA5-Land file, these values are hard-coded.
# astype('int') truncates, it does not round to the nearest grid point.
# Careful with the latitude, in reverse order
dvx['ERA5_lon_index'] = ((dvx.lon - 15.59) / 0.1).astype('int').values
dvx['ERA5_lat_index'] = 28 - ((dvx.lat - 68.35) / 0.1).astype('int').values
dvy['ERA5_lon_index'] = ((dvy.lon - 15.59) / 0.1).astype('int').values
dvy['ERA5_lat_index'] = 28 - ((dvy.lat - 68.35) / 0.1).astype('int').values

In [None]:
dvx

# ERA5-land data from 2015-01-01 to 2019-12-31 - already available as a netCDF file stored on EOSC (CESNET)
## 2m Temperature, Snow depth, Total precipitation
## **For now only the 2m temperature (t2m) will be used in the ML algorithm**, although it may be useful to know about rain and snow depth

In [None]:
s3path = 'Data/reanalysis-era5-land_hourly_2015-01-01_2019-12-31_Troms-Finnmark_T2m-SD-TP.nc'

In [None]:
ERA5land = xr.open_dataset(store.open(s3path))

In [None]:
ERA5land

# Adding columns with the ERA5-Land longitude and latitude to dvx and dvy

In [None]:
# The longitude and latitude coordinates do not depend on time, so read them
# once as plain arrays instead of selecting a hard-coded day for every lookup.
ERA5_longitudes = ERA5land['longitude'].values
ERA5_latitudes = ERA5land['latitude'].values
# Translate the grid indices of each location into ERA5-Land coordinates
dvx['ERA5_lon'] = ERA5_longitudes[dvx['ERA5_lon_index'].values]
dvx['ERA5_lat'] = ERA5_latitudes[dvx['ERA5_lat_index'].values]
dvy['ERA5_lon'] = ERA5_longitudes[dvy['ERA5_lon_index'].values]
dvy['ERA5_lat'] = ERA5_latitudes[dvy['ERA5_lat_index'].values]

In [None]:
dvx

In [None]:
import numpy as np

In [None]:
#ERA5land.sel(time="2015-01-01").where(ERA5land["latitude"].isin(dv['ERA5_lat'].values) & ERA5land["longitude"].isin(dv['ERA5_lon'].values))["t2m"].isel(time=0).plot()

In [None]:
#ERA5land.sel(time="2015-03-01").where(ERA5land["latitude"].isin(dv['ERA5_lat'].values) & ERA5land["longitude"].isin(dv['ERA5_lon'].values))["t2m"].isel(time=0).plot()

## Extract ERA5 data for the selected period of the year (when RoS events mostly occur)

In [None]:
# Restrict ERA5-Land to the selected period.
# NOTE(review): the period is taken in Year + 1, whereas the summary printed
# when Year is set describes ERA5land(Year-...) - confirm which one is intended.
period_start = f'{Year + 1}-{Month_start}-{Day_start}'
period_end = f'{Year + 1}-{Month_end}-{Day_end}'
ERA5land = ERA5land.sel(time=slice(period_start, period_end))

In [None]:
ERA5land

In [None]:
# Extract the ERA5 't2m' field, masked outside the grid cells of interest.
# The two tests are combined, so a cell is kept when its latitude AND its
# longitude each appear somewhere in dvx (not only the exact pairs).
lat_is_used = ERA5land['latitude'].isin(dvx['ERA5_lat'].values)
lon_is_used = ERA5land['longitude'].isin(dvx['ERA5_lon'].values)
ERA5 = ERA5land.where(lat_is_used & lon_is_used)['t2m']

In [None]:
# Calculate the first time using the 2015 values
# The mean and standard deviation computed here are only printed: both
# variables are overwritten by the constants at the end of this cell.
t2m_mean = ERA5.mean(skipna=True).values
print('Mean of the ERA5-Land 2m temperature: ', t2m_mean)
t2m_std = ERA5.std(skipna=True).values
print('Standard deviation of the ERA5-Land 2m temperature: ', t2m_std)
# Set once and for all
# (presumably the values obtained above for 2015, kept fixed so that every year
# is normalized the same way - TODO confirm)
t2m_mean = 267.1025
t2m_std = 14.740288734436035

In [None]:
# Normalize temperature values
ERA5 = (ERA5 -t2m_mean) / t2m_std

In [None]:
ERA5.shape[0]

In [None]:
df = ERA5.stack(z=['latitude', 'longitude']).to_pandas().transpose().reset_index()

In [None]:
df

In [None]:
# Add combined lon_lat column to df
# The key packs both coordinates into one float: int(longitude * 100) is the
# integer part and int(latitude * 100) / 100000 the fractional part.
# NOTE(review): this must stay identical to the 'lon_lat' expression applied to
# dvx, since the two columns are matched in a join; astype('int') truncates.
df['lon_lat'] = (df['longitude'] * 100).astype('int') + (df['latitude'] * 100).astype('int') / 100000

In [None]:
# Drop latitude and longitude columns which are not used anymore in df
df = df.drop(columns=['latitude', 'longitude'])

In [None]:
df

In [None]:
# Add combined lon_lat column to dvx only (dvy gets no such column here)
# Built from the ERA5-Land coordinates with the same encoding as the 'lon_lat'
# column of df: int(lon * 100) + int(lat * 100) / 100000
dvx['lon_lat'] = (dvx['ERA5_lon'] * 100).astype('int') + (dvx['ERA5_lat'] * 100).astype('int') / 100000

In [None]:
dvx

In [None]:
# Drop unused columns in dv x & y
dwx = dvx.drop(columns=['time', 'ERA5_lon_index', 'ERA5_lat_index', 'ERA5_lon', 'ERA5_lat'])
dwy = dvy.drop(columns=['time', 'ERA5_lon_index', 'ERA5_lat_index', 'ERA5_lon', 'ERA5_lat'])

In [None]:
dwx

In [None]:
# Convert to panda dw x & y
dwx_pandas = dwx.to_pandas_df()
dwy_pandas = dwy.to_pandas_df()

## Join dwx (WLC) with df (ERA5 t2m)

In [None]:
dwx_pandas

In [None]:
# With ERA5_t2m
# Both tables are indexed by the float 'lon_lat' key before joining, so each
# land-cover row receives the t2m columns of the ERA5 row with the same key;
# rows without a match are filled with NaN (DataFrame.join defaults to a left join).
dx = dwx_pandas.set_index('lon_lat').join(df.set_index('lon_lat'), on='lon_lat')

In [None]:
dx

In [None]:
# Drop the Rows with NaN Values
dx = dx.dropna()

In [None]:
dx = dx.reset_index()

In [None]:
dx = dx.drop(columns=['lon_lat'])

In [None]:
dx

## Save into **local** CSV file with header and indices

In [None]:
# Write (not append) the file: with mode='a' every re-run of the notebook added
# a second header line and duplicate rows to an existing x_<Year>.csv
dx.to_csv(r'/home/jovyan/Arctic/Vegetation_in_Troms_and_Finnmark/x_' + str(Year) + '.csv', header=True, index=True, sep=',', mode='w')

# Local .CSV files with header & index - t2m normalized
## x_2015.csv **139765** rows and **6G** 
## x_2016.csv *212459* rows and *(size not recorded)*
## x_2017.csv *227807* rows and *11G*
## x_2018.csv *211791* rows and *9.5G*
## x_2019.csv *289371* rows and *13G*

## Find locations with lichen in Year+1 corresponding to those in Year

In [None]:
# Coordinates of the locations kept in x; copied because a 'lon_lat' column is
# added to this table later, which must not be done on a slice of dx
dwx_pandas = dx[['lon', 'lat']].copy()

In [None]:
dwx_pandas

In [None]:
dwy_pandas

In [None]:
# Add combined lat-lon column to dw x & y
# The key packs both coordinates into one float: int(lon * 100000) is the
# integer part and int(lat * 100000) / 10000000 the fractional part.
# The same expression is applied to both tables so that their keys can be matched;
# astype('int') truncates rather than rounds.
dwx_pandas['lon_lat'] = (dwx_pandas['lon'] * 100000).astype('int') + (dwx_pandas['lat'] * 100000).astype('int') / 10000000
dwy_pandas['lon_lat'] = (dwy_pandas['lon'] * 100000).astype('int') + (dwy_pandas['lat'] * 100000).astype('int') / 10000000

In [None]:
dwx_pandas = dwx_pandas.drop(columns=['lon', 'lat'])
dwy_pandas = dwy_pandas.drop(columns=['lon', 'lat'])

In [None]:
dwx_pandas

In [None]:
dwy_pandas

In [None]:
## Join dwx with dwy
dy = dwx_pandas.set_index('lon_lat').join(dwy_pandas.set_index('lon_lat'), on='lon_lat')

In [None]:
dy

In [None]:
# Replace NaNs by 0
for col in ['Bare', 'Grass', 'Lichen', 'Shrub', 'Tree']:
    print(col)
    dy[col] = dy[col].fillna(0)

In [None]:
dy = dy.reset_index().drop(columns=['lon_lat'])

In [None]:
dy

In [None]:
# Write (not append) the file: with mode='a' every re-run of the notebook added
# a second header line and duplicate rows to an existing y_<Year>.csv
dy.to_csv(r'/home/jovyan/Arctic/Vegetation_in_Troms_and_Finnmark/y_' + str(Year) + '.csv', header=True, index=True, sep=',', mode='w')