# Prepare labelled input for the Machine Learning algorithm
# (i.e. locations where moss&lichen fractional cover changed and related meteorological parameters from ERA5-Land)

In [1]:
!date

Fri Mar 10 12:55:07 UTC 2023


In [2]:
pip install vaex

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
import s3fs
import xarray as xr
import vaex

### Define s3 storage parameters

In [4]:
client_kwargs={'endpoint_url': 'https://object-store.cloud.muni.cz'}
store = s3fs.S3FileSystem(anon=False, client_kwargs=client_kwargs)
store.ls('Data', detail=True, refresh=True)

[{'Key': 'Data/C_GlobalLandCover_20150101_20190101_Troms-Finnmark.nc',
  'LastModified': datetime.datetime(2023, 2, 1, 12, 44, 42, 95000, tzinfo=tzlocal()),
  'ETag': '"70e848cfeba6b4e2db997b6efb0ad947-8"',
  'Size': 397191332,
  'StorageClass': 'STANDARD',
  'type': 'file',
  'size': 397191332,
  'name': 'Data/C_GlobalLandCover_20150101_20190101_Troms-Finnmark.nc'},
 {'Key': 'Data/reanalysis-era5-land_hourly_2015-01-01_2019-12-31_Troms-Finnmark_T2m-SD-TP.nc',
  'LastModified': datetime.datetime(2023, 2, 1, 9, 53, 15, 635000, tzinfo=tzlocal()),
  'ETag': '"e48be2b71e08b38d296a0ea6db979b09-23"',
  'Size': 1182124070,
  'StorageClass': 'STANDARD',
  'type': 'file',
  'size': 1182124070,
  'name': 'Data/reanalysis-era5-land_hourly_2015-01-01_2019-12-31_Troms-Finnmark_T2m-SD-TP.nc'},
 {'Key': 'Data/reanalysis-era5-land_hourly_2015-01-01_2022-12-31_Troms-Finnmark_T2m-SD-TP.nc',
  'LastModified': datetime.datetime(2023, 3, 10, 11, 50, 32, 549000, tzinfo=tzlocal()),
  'ETag': '"d082ee07a1ab33

# Copernicus Global Land Cover data  from 2015-01-01 to 2019-12-31 already available as a netCDF file stored on EOSC (CESNET)
## Troms og Finnmark
### Mosses and lichens, bare, grass, shrubs and trees

In [5]:
s3path = 'Data/C_GlobalLandCover_20150101_20190101_Troms-Finnmark.nc'

In [6]:
GLC_AOI = xr.open_dataset(store.open(s3path))

In [7]:
GLC_AOI

# ERA5-land data from 2015-01-01 to 2019-12-31 - already available as a netCDF file stored on EOSC (CESNET)
## 2m Temperature, Snow depth, Total precipitation

In [8]:
s3path = 'Data/reanalysis-era5-land_hourly_2015-01-01_2022-12-31_Troms-Finnmark_T2m-SD-TP.nc'

In [9]:
ERA5land = xr.open_dataset(store.open(s3path))

In [10]:
ERA5land

In [11]:
GLC_AOI = GLC_AOI.rename(x='lon', y='lat', t='time')

In [12]:
# Drop variables not directly of interest here
GLC_AOI = GLC_AOI.drop_vars(['crs',
                             'Crops_CoverFraction_layer',
                             'Discrete_Classification_map', 
                             'Discrete_Classification_proba',
                             'Forest_Type_layer',
                             'Snow_CoverFraction_layer',
                             'BuiltUp_CoverFraction_layer',
                             'PermanentWater_CoverFraction_layer',
                             'SeasonalWater_CoverFraction_layer',
                             'DataDensityIndicator',
                             'Change_Confidence_layer',
                             'dataMask'])

In [13]:
GLC_AOI = GLC_AOI.rename(Bare_CoverFraction_layer = 'Bare',
                         Grass_CoverFraction_layer = 'Grass',
                         MossLichen_CoverFraction_layer = 'Lichen',
                         Shrub_CoverFraction_layer = 'Shrub',
                         Tree_CoverFraction_layer = 'Tree')

In [14]:
GLC_AOI

In [15]:
# Troms & Finnmark Global Land Cover area
GLC_AOI_min_lon = GLC_AOI.lon.min()
GLC_AOI_max_lon = GLC_AOI.lon.max()
GLC_AOI_min_lat = GLC_AOI.lat.min()
GLC_AOI_max_lat = GLC_AOI.lat.max()
print(GLC_AOI_min_lon.values, GLC_AOI_max_lon.values, GLC_AOI_min_lat.values, GLC_AOI_max_lat.values)

15.595313502963002 31.06568387333461 68.35153627792579 71.18764738903712


In [16]:
de = GLC_AOI.to_dataframe()

In [17]:
de = de.reset_index()

In [18]:
de

Unnamed: 0,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree
0,2015-01-01,15.595314,71.187647,255.0,255.0,255.0,255.0,255.0
1,2015-01-01,15.595314,71.186721,255.0,255.0,255.0,255.0,255.0
2,2015-01-01,15.595314,71.185796,255.0,255.0,255.0,255.0,255.0
3,2015-01-01,15.595314,71.184870,255.0,255.0,255.0,255.0,255.0
4,2015-01-01,15.595314,71.183944,255.0,255.0,255.0,255.0,255.0
...,...,...,...,...,...,...,...,...
255981875,2019-01-01,31.065684,68.355240,,,,,
255981876,2019-01-01,31.065684,68.354314,,,,,
255981877,2019-01-01,31.065684,68.353388,,,,,
255981878,2019-01-01,31.065684,68.352462,,,,,


In [19]:
# Only keep the locations where there is lichen during the current year
dd = de[(de['Lichen'] > 0) & (de['Lichen'] <= 100)]

In [20]:
dd

Unnamed: 0,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree
2520,2015-01-01,15.595314,68.854314,,41.0,1.0,29.0,29.0
2823,2015-01-01,15.595314,68.573759,20.0,48.0,24.0,8.0,
2826,2015-01-01,15.595314,68.570981,23.0,58.0,19.0,,
2840,2015-01-01,15.595314,68.558018,7.0,58.0,25.0,10.0,
2885,2015-01-01,15.595314,68.516351,35.0,34.0,31.0,,
...,...,...,...,...,...,...,...,...
255977331,2019-01-01,31.064758,69.725610,,49.0,18.0,25.0,8.0
255977332,2019-01-01,31.064758,69.724684,,48.0,17.0,27.0,8.0
255977333,2019-01-01,31.064758,69.723759,,49.0,19.0,21.0,11.0
255977335,2019-01-01,31.064758,69.721907,,44.0,38.0,18.0,


## Each year in a separate dataset and keep only the first 183 days

In [21]:
Year = 2015
Number_of_days = 183
print('x = WLC(' + str(Year)+ ') joined with ERA5land(' + str(Year + 1) + ')')
print('y = WLC(' + str(Year + 1) + ')')

x = WLC(2015) joined with ERA5land(2016)
y = WLC(2016)


In [22]:
# Only keep locations for the current year
df = dd.loc[de['time'] == str(Year) + '-01-01']
dg = dd.loc[dd['time'] == str(Year + 1) + '-01-01']

In [23]:
df

Unnamed: 0,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree
2520,2015-01-01,15.595314,68.854314,,41.0,1.0,29.0,29.0
2823,2015-01-01,15.595314,68.573759,20.0,48.0,24.0,8.0,
2826,2015-01-01,15.595314,68.570981,23.0,58.0,19.0,,
2840,2015-01-01,15.595314,68.558018,7.0,58.0,25.0,10.0,
2885,2015-01-01,15.595314,68.516351,35.0,34.0,31.0,,
...,...,...,...,...,...,...,...,...
51191825,2015-01-01,31.064758,69.727462,,62.0,23.0,15.0,
51191827,2015-01-01,31.064758,69.725610,,78.0,4.0,18.0,
51191828,2015-01-01,31.064758,69.724684,,82.0,3.0,15.0,
51191829,2015-01-01,31.064758,69.723759,,97.0,3.0,,


In [24]:
# Replace NaNs by 0
for col in ['Bare', 'Grass', 'Lichen', 'Shrub', 'Tree']:
    print(col)
    df[col] = df[col].fillna(0)
    dg[col] = dg[col].fillna(0)

Bare
Grass
Lichen
Shrub
Tree


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dg[col] = dg[col].fillna(0)


In [25]:
# Calculate total fractional coverage of bare, grass, lichen, shrub and tree (should be 100)
df['Total']  = (df['Bare'] + df['Grass'] + df['Lichen'] + df['Shrub'] + df['Tree'])
dg['Total']  = (dg['Bare'] + dg['Grass'] + dg['Lichen'] + dg['Shrub'] + dg['Tree'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Total']  = (df['Bare'] + df['Grass'] + df['Lichen'] + df['Shrub'] + df['Tree'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dg['Total']  = (dg['Bare'] + dg['Grass'] + dg['Lichen'] + dg['Shrub'] + dg['Tree'])


In [26]:
df

Unnamed: 0,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree,Total
2520,2015-01-01,15.595314,68.854314,0.0,41.0,1.0,29.0,29.0,100.0
2823,2015-01-01,15.595314,68.573759,20.0,48.0,24.0,8.0,0.0,100.0
2826,2015-01-01,15.595314,68.570981,23.0,58.0,19.0,0.0,0.0,100.0
2840,2015-01-01,15.595314,68.558018,7.0,58.0,25.0,10.0,0.0,100.0
2885,2015-01-01,15.595314,68.516351,35.0,34.0,31.0,0.0,0.0,100.0
...,...,...,...,...,...,...,...,...,...
51191825,2015-01-01,31.064758,69.727462,0.0,62.0,23.0,15.0,0.0,100.0
51191827,2015-01-01,31.064758,69.725610,0.0,78.0,4.0,18.0,0.0,100.0
51191828,2015-01-01,31.064758,69.724684,0.0,82.0,3.0,15.0,0.0,100.0
51191829,2015-01-01,31.064758,69.723759,0.0,97.0,3.0,0.0,0.0,100.0


In [27]:
# Normalize the fractional cover
for col in ['Bare', 'Grass', 'Lichen', 'Shrub', 'Tree']:
    print(col)
    df[col] = df[col] / df['Total']
    dg[col] = dg[col] / dg['Total']

Bare
Grass
Lichen
Shrub
Tree


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col] / df['Total']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dg[col] = dg[col] / dg['Total']


In [28]:
# Drop the *Total* column
df = df.drop(['Total'], axis=1)
dg = dg.drop(['Total'], axis=1)

In [29]:
df

Unnamed: 0,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree
2520,2015-01-01,15.595314,68.854314,0.00,0.41,0.01,0.29,0.29
2823,2015-01-01,15.595314,68.573759,0.20,0.48,0.24,0.08,0.00
2826,2015-01-01,15.595314,68.570981,0.23,0.58,0.19,0.00,0.00
2840,2015-01-01,15.595314,68.558018,0.07,0.58,0.25,0.10,0.00
2885,2015-01-01,15.595314,68.516351,0.35,0.34,0.31,0.00,0.00
...,...,...,...,...,...,...,...,...
51191825,2015-01-01,31.064758,69.727462,0.00,0.62,0.23,0.15,0.00
51191827,2015-01-01,31.064758,69.725610,0.00,0.78,0.04,0.18,0.00
51191828,2015-01-01,31.064758,69.724684,0.00,0.82,0.03,0.15,0.00
51191829,2015-01-01,31.064758,69.723759,0.00,0.97,0.03,0.00,0.00


In [30]:
# Convert to VAEX
dvx = vaex.from_pandas(df)
dvy = vaex.from_pandas(dg)

In [31]:
dvx

#,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree
0,2015-01-01 00:00:00.000000000,15.595313502963002,68.8543140557036,0.0,0.41,0.01,0.29,0.29
1,2015-01-01 00:00:00.000000000,15.595313502963002,68.57375850014802,0.2,0.48,0.24,0.08,0.0
2,2015-01-01 00:00:00.000000000,15.595313502963002,68.57098072237025,0.23,0.58,0.19,0.0,0.0
3,2015-01-01 00:00:00.000000000,15.595313502963002,68.55801775940728,0.07,0.58,0.25,0.1,0.0
4,2015-01-01 00:00:00.000000000,15.595313502963002,68.5163510927406,0.35,0.34,0.31,0.0,0.0
...,...,...,...,...,...,...,...,...
335764,2015-01-01 00:00:00.000000000,31.064757947408683,69.72746220385181,0.0,0.62,0.23,0.15,0.0
335765,2015-01-01 00:00:00.000000000,31.064757947408683,69.72561035199996,0.0,0.78,0.04,0.18,0.0
335766,2015-01-01 00:00:00.000000000,31.064757947408683,69.72468442607405,0.0,0.82,0.03,0.15,0.0
335767,2015-01-01 00:00:00.000000000,31.064757947408683,69.72375850014812,0.0,0.97,0.03,0.0,0.0


In [32]:
# Find the correspondind ERA5-land lat-lon
# Careful with the latitude, in reverse order
dvx['ERA5_lon_index'] = ((dvx.lon - 15.59) / 0.1).astype('int').values
dvx['ERA5_lat_index'] = 28 - ((dvx.lat - 68.35) / 0.1).astype('int').values
dvy['ERA5_lon_index'] = ((dvy.lon - 15.59) / 0.1).astype('int').values
dvy['ERA5_lat_index'] = 28 - ((dvy.lat - 68.35) / 0.1).astype('int').values

In [33]:
dvx

#,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree,ERA5_lon_index,ERA5_lat_index
0,2015-01-01 00:00:00.000000000,15.595313502963002,68.8543140557036,0.0,0.41,0.01,0.29,0.29,0,23
1,2015-01-01 00:00:00.000000000,15.595313502963002,68.57375850014802,0.2,0.48,0.24,0.08,0.0,0,26
2,2015-01-01 00:00:00.000000000,15.595313502963002,68.57098072237025,0.23,0.58,0.19,0.0,0.0,0,26
3,2015-01-01 00:00:00.000000000,15.595313502963002,68.55801775940728,0.07,0.58,0.25,0.1,0.0,0,26
4,2015-01-01 00:00:00.000000000,15.595313502963002,68.5163510927406,0.35,0.34,0.31,0.0,0.0,0,27
...,...,...,...,...,...,...,...,...,...,...
335764,2015-01-01 00:00:00.000000000,31.064757947408683,69.72746220385181,0.0,0.62,0.23,0.15,0.0,154,15
335765,2015-01-01 00:00:00.000000000,31.064757947408683,69.72561035199996,0.0,0.78,0.04,0.18,0.0,154,15
335766,2015-01-01 00:00:00.000000000,31.064757947408683,69.72468442607405,0.0,0.82,0.03,0.15,0.0,154,15
335767,2015-01-01 00:00:00.000000000,31.064757947408683,69.72375850014812,0.0,0.97,0.03,0.0,0.0,154,15


# Adding columns with the ERA5-land longitude and latitude to dv

In [34]:
dvx['ERA5_lon'] = ERA5land.sel(time="2015-01-01").longitude[dvx['ERA5_lon_index'].values].values
dvx['ERA5_lat'] = ERA5land.sel(time="2015-01-01").latitude[dvx['ERA5_lat_index'].values].values
dvy['ERA5_lon'] = ERA5land.sel(time="2015-01-01").longitude[dvy['ERA5_lon_index'].values].values
dvy['ERA5_lat'] = ERA5land.sel(time="2015-01-01").latitude[dvy['ERA5_lat_index'].values].values

In [35]:
dvx

#,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree,ERA5_lon_index,ERA5_lat_index,ERA5_lon,ERA5_lat
0,2015-01-01 00:00:00.000000000,15.595313502963002,68.8543140557036,0.0,0.41,0.01,0.29,0.29,0,23,15.59,68.85
1,2015-01-01 00:00:00.000000000,15.595313502963002,68.57375850014802,0.2,0.48,0.24,0.08,0.0,0,26,15.59,68.55
2,2015-01-01 00:00:00.000000000,15.595313502963002,68.57098072237025,0.23,0.58,0.19,0.0,0.0,0,26,15.59,68.55
3,2015-01-01 00:00:00.000000000,15.595313502963002,68.55801775940728,0.07,0.58,0.25,0.1,0.0,0,26,15.59,68.55
4,2015-01-01 00:00:00.000000000,15.595313502963002,68.5163510927406,0.35,0.34,0.31,0.0,0.0,0,27,15.59,68.45
...,...,...,...,...,...,...,...,...,...,...,...,...
335764,2015-01-01 00:00:00.000000000,31.064757947408683,69.72746220385181,0.0,0.62,0.23,0.15,0.0,154,15,30.99,69.65
335765,2015-01-01 00:00:00.000000000,31.064757947408683,69.72561035199996,0.0,0.78,0.04,0.18,0.0,154,15,30.99,69.65
335766,2015-01-01 00:00:00.000000000,31.064757947408683,69.72468442607405,0.0,0.82,0.03,0.15,0.0,154,15,30.99,69.65
335767,2015-01-01 00:00:00.000000000,31.064757947408683,69.72375850014812,0.0,0.97,0.03,0.0,0.0,154,15,30.99,69.65


## Extract ERA5 data for  the selected period of the year (when RoS events mostly occur)

In [36]:
ERA5 = ERA5land.sel(time=slice(str(Year + 1) + '-01-01', str(Year + 1)  + '-12-31'))

In [37]:
ERA5 = ERA5.isel(time=range(Number_of_days * 24))

In [38]:
ERA5 = ERA5.isel(expver = 0)

In [39]:
ERA5

In [40]:
# Extract ERA5 t2m, tp and sd fields 
ERA5_t2m = ERA5.where(ERA5['latitude'].isin(dvx['ERA5_lat'].values) & ERA5['longitude'].isin(dvx['ERA5_lon'].values))['t2m']
ERA5_tp = ERA5.where(ERA5['latitude'].isin(dvx['ERA5_lat'].values) & ERA5['longitude'].isin(dvx['ERA5_lon'].values))['tp']
ERA5_sd = ERA5.where(ERA5['latitude'].isin(dvx['ERA5_lat'].values) & ERA5['longitude'].isin(dvx['ERA5_lon'].values))['sd']

# Rain on Snow criteria (according to https://www.hydrol-earth-syst-sci.net/23/2983/2019/hess-23-2983-2019.pdf)
#
# total rainfall volume of at least 20 mm within 12 h
# or
# air temperatures above 0C (273.15K)
# and¶
# initial snowpack depth of at least 10 cm

In [41]:
# Normalizing temperature, total precipitation and snow depth values accordidng to these criteria
ERA5_t2m = ERA5_t2m / 273.15
ERA5_tp = ERA5_tp / 0.02 * 12.
ERA5_sd = ERA5_sd / 0.1

In [42]:
df_t2m = ERA5_t2m.stack(z=['latitude', 'longitude']).to_pandas().transpose().reset_index()
df_tp = ERA5_tp.stack(z=['latitude', 'longitude']).to_pandas().transpose().reset_index()
df_sd = ERA5_sd.stack(z=['latitude', 'longitude']).to_pandas().transpose().reset_index()

In [43]:
df_tp = df_tp.drop(columns=['latitude', 'longitude'])
df_sd = df_sd.drop(columns=['latitude', 'longitude'])

In [44]:
# Create labels for ERA5-land variables to replace the dates
label_t2m = ['latitude', 'longitude']
label_tp = list()
label_sd = list()
for i in range(Number_of_days * 24):
    label_t2m.append('t2m_'+ str(i))
    label_tp.append('tp_'+ str(i))
    label_sd.append('sd_'+ str(i))

In [45]:
df_t2m.set_axis(label_t2m, axis="columns", inplace=True)
df_tp.set_axis(label_tp, axis="columns", inplace=True)
df_sd.set_axis(label_sd, axis="columns", inplace=True)

In [46]:
##  Glue together df_t2m, df_tp and df_sd
df = pd.concat([df_t2m, df_tp, df_sd], axis = 1)

In [47]:
df

Unnamed: 0,latitude,longitude,t2m_0,t2m_1,t2m_2,t2m_3,t2m_4,t2m_5,t2m_6,t2m_7,...,sd_4382,sd_4383,sd_4384,sd_4385,sd_4386,sd_4387,sd_4388,sd_4389,sd_4390,sd_4391
0,71.150002,15.590000,,,,,,,,,...,,,,,,,,,,
1,71.150002,15.690000,,,,,,,,,...,,,,,,,,,,
2,71.150002,15.790000,,,,,,,,,...,,,,,,,,,,
3,71.150002,15.890000,,,,,,,,,...,,,,,,,,,,
4,71.150002,15.990000,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4490,68.349998,30.590000,0.986783,0.985853,0.984817,0.983796,0.982589,0.981640,0.980055,0.980477,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
4491,68.349998,30.690001,0.986696,0.985762,0.984746,0.983745,0.982549,0.981626,0.980062,0.980548,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
4492,68.349998,30.790001,0.986581,0.985651,0.984662,0.983674,0.982492,0.981604,0.980067,0.980610,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
4493,68.349998,30.889999,0.986530,0.985611,0.984657,0.983692,0.982534,0.981686,0.980202,0.980768,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001


In [48]:
# Add combined lon_lat column to df
df['lon_lat'] = (df['longitude'] * 100).astype('int') + (df['latitude'] * 100).astype('int') / 100000

In [49]:
# Drop latitude and longitude columns which are not used anymore in df
df = df.drop(columns=['latitude', 'longitude'])

In [50]:
df

Unnamed: 0,t2m_0,t2m_1,t2m_2,t2m_3,t2m_4,t2m_5,t2m_6,t2m_7,t2m_8,t2m_9,...,sd_4383,sd_4384,sd_4385,sd_4386,sd_4387,sd_4388,sd_4389,sd_4390,sd_4391,lon_lat
0,,,,,,,,,,,...,,,,,,,,,,1559.07115
1,,,,,,,,,,,...,,,,,,,,,,1568.07115
2,,,,,,,,,,,...,,,,,,,,,,1578.07115
3,,,,,,,,,,,...,,,,,,,,,,1589.07115
4,,,,,,,,,,,...,,,,,,,,,,1598.07115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4490,0.986783,0.985853,0.984817,0.983796,0.982589,0.981640,0.980055,0.980477,0.979683,0.979024,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,3059.06834
4491,0.986696,0.985762,0.984746,0.983745,0.982549,0.981626,0.980062,0.980548,0.979769,0.979130,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,3069.06834
4492,0.986581,0.985651,0.984662,0.983674,0.982492,0.981604,0.980067,0.980610,0.979840,0.979221,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,3079.06834
4493,0.986530,0.985611,0.984657,0.983692,0.982534,0.981686,0.980202,0.980768,0.980007,0.979407,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,3088.06834


In [51]:
# Add combined lon_lat column to dv x & y
dvx['lon_lat'] = (dvx['ERA5_lon'] * 100).astype('int') + (dvx['ERA5_lat'] * 100).astype('int') / 100000

In [52]:
dvx

#,time,lon,lat,Bare,Grass,Lichen,Shrub,Tree,ERA5_lon_index,ERA5_lat_index,ERA5_lon,ERA5_lat,lon_lat
0,2015-01-01 00:00:00.000000000,15.595313502963002,68.8543140557036,0.0,0.41,0.01,0.29,0.29,0,23,15.59,68.85,1559.06885
1,2015-01-01 00:00:00.000000000,15.595313502963002,68.57375850014802,0.2,0.48,0.24,0.08,0.0,0,26,15.59,68.55,1559.06855
2,2015-01-01 00:00:00.000000000,15.595313502963002,68.57098072237025,0.23,0.58,0.19,0.0,0.0,0,26,15.59,68.55,1559.06855
3,2015-01-01 00:00:00.000000000,15.595313502963002,68.55801775940728,0.07,0.58,0.25,0.1,0.0,0,26,15.59,68.55,1559.06855
4,2015-01-01 00:00:00.000000000,15.595313502963002,68.5163510927406,0.35,0.34,0.31,0.0,0.0,0,27,15.59,68.45,1559.06844
...,...,...,...,...,...,...,...,...,...,...,...,...,...
335764,2015-01-01 00:00:00.000000000,31.064757947408683,69.72746220385181,0.0,0.62,0.23,0.15,0.0,154,15,30.99,69.65,3099.06965
335765,2015-01-01 00:00:00.000000000,31.064757947408683,69.72561035199996,0.0,0.78,0.04,0.18,0.0,154,15,30.99,69.65,3099.06965
335766,2015-01-01 00:00:00.000000000,31.064757947408683,69.72468442607405,0.0,0.82,0.03,0.15,0.0,154,15,30.99,69.65,3099.06965
335767,2015-01-01 00:00:00.000000000,31.064757947408683,69.72375850014812,0.0,0.97,0.03,0.0,0.0,154,15,30.99,69.65,3099.06965


In [53]:
# Drop unused columns in dv x & y
dwx = dvx.drop(columns=['time', 'ERA5_lon_index', 'ERA5_lat_index', 'ERA5_lon', 'ERA5_lat'])
dwy = dvy.drop(columns=['time', 'ERA5_lon_index', 'ERA5_lat_index', 'ERA5_lon', 'ERA5_lat'])

In [54]:
dwx

#,lon,lat,Bare,Grass,Lichen,Shrub,Tree,lon_lat
0,15.595313502963002,68.8543140557036,0.0,0.41,0.01,0.29,0.29,1559.06885
1,15.595313502963002,68.57375850014802,0.2,0.48,0.24,0.08,0.0,1559.06855
2,15.595313502963002,68.57098072237025,0.23,0.58,0.19,0.0,0.0,1559.06855
3,15.595313502963002,68.55801775940728,0.07,0.58,0.25,0.1,0.0,1559.06855
4,15.595313502963002,68.5163510927406,0.35,0.34,0.31,0.0,0.0,1559.06844
...,...,...,...,...,...,...,...,...
335764,31.064757947408683,69.72746220385181,0.0,0.62,0.23,0.15,0.0,3099.06965
335765,31.064757947408683,69.72561035199996,0.0,0.78,0.04,0.18,0.0,3099.06965
335766,31.064757947408683,69.72468442607405,0.0,0.82,0.03,0.15,0.0,3099.06965
335767,31.064757947408683,69.72375850014812,0.0,0.97,0.03,0.0,0.0,3099.06965


In [55]:
# Convert to panda dw x & y
dwx_pandas = dwx.to_pandas_df()
dwy_pandas = dwy.to_pandas_df()

## Join dwx (WLC) with df (ERA5 t2m-tp-sd)

In [56]:
dwx_pandas

Unnamed: 0,lon,lat,Bare,Grass,Lichen,Shrub,Tree,lon_lat
0,15.595314,68.854314,0.00,0.41,0.01,0.29,0.29,1559.06885
1,15.595314,68.573759,0.20,0.48,0.24,0.08,0.00,1559.06855
2,15.595314,68.570981,0.23,0.58,0.19,0.00,0.00,1559.06855
3,15.595314,68.558018,0.07,0.58,0.25,0.10,0.00,1559.06855
4,15.595314,68.516351,0.35,0.34,0.31,0.00,0.00,1559.06844
...,...,...,...,...,...,...,...,...
335764,31.064758,69.727462,0.00,0.62,0.23,0.15,0.00,3099.06965
335765,31.064758,69.725610,0.00,0.78,0.04,0.18,0.00,3099.06965
335766,31.064758,69.724684,0.00,0.82,0.03,0.15,0.00,3099.06965
335767,31.064758,69.723759,0.00,0.97,0.03,0.00,0.00,3099.06965


In [57]:
# Join dwx (WLC) with df (ERA5 t2m-tp-sd)
dx = dwx_pandas.set_index('lon_lat').join(df.set_index('lon_lat'), on='lon_lat')

In [58]:
dx

Unnamed: 0_level_0,lon,lat,Bare,Grass,Lichen,Shrub,Tree,t2m_0,t2m_1,t2m_2,...,sd_4382,sd_4383,sd_4384,sd_4385,sd_4386,sd_4387,sd_4388,sd_4389,sd_4390,sd_4391
lon_lat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1559.06885,15.595314,68.854314,0.00,0.41,0.01,0.29,0.29,,,,...,,,,,,,,,,
1559.06855,15.595314,68.573759,0.20,0.48,0.24,0.08,0.00,0.989355,0.988203,0.990508,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
1559.06855,15.595314,68.570981,0.23,0.58,0.19,0.00,0.00,0.989355,0.988203,0.990508,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
1559.06855,15.595314,68.558018,0.07,0.58,0.25,0.10,0.00,0.989355,0.988203,0.990508,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
1559.06844,15.595314,68.516351,0.35,0.34,0.31,0.00,0.00,0.996191,0.994671,0.995452,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3099.06965,31.064758,69.727462,0.00,0.62,0.23,0.15,0.00,,,,...,,,,,,,,,,
3099.06965,31.064758,69.725610,0.00,0.78,0.04,0.18,0.00,,,,...,,,,,,,,,,
3099.06965,31.064758,69.724684,0.00,0.82,0.03,0.15,0.00,,,,...,,,,,,,,,,
3099.06965,31.064758,69.723759,0.00,0.97,0.03,0.00,0.00,,,,...,,,,,,,,,,


In [59]:
# Drop the Rows with NaN Values
dx = dx.dropna()

In [60]:
dx = dx.reset_index()

In [61]:
dx = dx.drop(columns=['lon_lat'])

In [62]:
dx

Unnamed: 0,lon,lat,Bare,Grass,Lichen,Shrub,Tree,t2m_0,t2m_1,t2m_2,...,sd_4382,sd_4383,sd_4384,sd_4385,sd_4386,sd_4387,sd_4388,sd_4389,sd_4390,sd_4391
0,15.595314,68.573759,0.20,0.48,0.24,0.08,0.00,0.989355,0.988203,0.990508,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
1,15.595314,68.570981,0.23,0.58,0.19,0.00,0.00,0.989355,0.988203,0.990508,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
2,15.595314,68.558018,0.07,0.58,0.25,0.10,0.00,0.989355,0.988203,0.990508,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
3,15.595314,68.516351,0.35,0.34,0.31,0.00,0.00,0.996191,0.994671,0.995452,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
4,15.595314,68.504314,0.00,0.31,0.01,0.21,0.47,0.996191,0.994671,0.995452,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139743,30.888832,69.174684,0.00,0.67,0.24,0.09,0.00,0.987300,0.986834,0.985864,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
139744,30.889758,69.711721,0.00,0.46,0.34,0.11,0.09,0.993679,0.993266,0.992661,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
139745,30.889758,69.710796,0.00,0.32,0.25,0.11,0.32,0.993679,0.993266,0.992661,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001
139746,30.889758,69.709870,0.00,0.34,0.21,0.12,0.33,0.993679,0.993266,0.992661,...,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001,-0.000001


In [66]:
## Save into **local** HDF5 file without index

import os
path = '/home/jovyan/Arctic/Vegetation_in_Troms_and_Finnmark/data/'
x_filename = os.path.join(path, 'x_tps_' + str(Year) + '.hdf')
print(x_filename)
dx.to_hdf(x_filename, key='df', mode="w", index=False)

/home/jovyan/Arctic/Vegetation_in_Troms_and_Finnmark/data/x_tps_2015.hdf


## Find locations with lichen in the following corresponding to those in current year

In [67]:
dwx_pandas = dx[['lon', 'lat']]

In [68]:
dwx_pandas

Unnamed: 0,lon,lat
0,15.595314,68.573759
1,15.595314,68.570981
2,15.595314,68.558018
3,15.595314,68.516351
4,15.595314,68.504314
...,...,...
139743,30.888832,69.174684
139744,30.889758,69.711721
139745,30.889758,69.710796
139746,30.889758,69.709870


In [69]:
dwy_pandas

Unnamed: 0,lon,lat,Bare,Grass,Lichen,Shrub,Tree
0,15.595314,68.855240,0.000000,0.590000,0.010000,0.200000,0.200000
1,15.595314,68.854314,0.000000,0.430000,0.030000,0.270000,0.270000
2,15.595314,68.853388,0.000000,0.410000,0.010000,0.230000,0.350000
3,15.595314,68.770981,0.090000,0.630000,0.210000,0.070000,0.000000
4,15.595314,68.713573,0.102041,0.602041,0.183673,0.112245,0.000000
...,...,...,...,...,...,...,...
504256,31.064758,69.722833,0.000000,0.430000,0.350000,0.200000,0.020000
504257,31.064758,69.721907,0.000000,0.560000,0.260000,0.180000,0.000000
504258,31.064758,69.720981,0.000000,0.510000,0.370000,0.120000,0.000000
504259,31.064758,69.720055,0.090000,0.440000,0.360000,0.110000,0.000000


In [70]:
# Add combined lat-lon column to dv x & y
dwx_pandas['lon_lat'] = (dwx_pandas['lon'] * 100000).astype('int') + (dwx_pandas['lat'] * 100000).astype('int') / 10000000
dwy_pandas['lon_lat'] = (dwy_pandas['lon'] * 100000).astype('int') + (dwy_pandas['lat'] * 100000).astype('int') / 10000000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dwx_pandas['lon_lat'] = (dwx_pandas['lon'] * 100000).astype('int') + (dwx_pandas['lat'] * 100000).astype('int') / 10000000


In [71]:
dwx_pandas = dwx_pandas.drop(columns=['lon', 'lat'])
dwy_pandas = dwy_pandas.drop(columns=['lon', 'lat'])

In [72]:
dwx_pandas

Unnamed: 0,lon_lat
0,1.559532e+06
1,1.559532e+06
2,1.559532e+06
3,1.559532e+06
4,1.559532e+06
...,...
139743,3.088884e+06
139744,3.088976e+06
139745,3.088976e+06
139746,3.088976e+06


In [73]:
dwy_pandas

Unnamed: 0,Bare,Grass,Lichen,Shrub,Tree,lon_lat
0,0.000000,0.590000,0.010000,0.200000,0.200000,1.559532e+06
1,0.000000,0.430000,0.030000,0.270000,0.270000,1.559532e+06
2,0.000000,0.410000,0.010000,0.230000,0.350000,1.559532e+06
3,0.090000,0.630000,0.210000,0.070000,0.000000,1.559532e+06
4,0.102041,0.602041,0.183673,0.112245,0.000000,1.559532e+06
...,...,...,...,...,...,...
504256,0.000000,0.430000,0.350000,0.200000,0.020000,3.106476e+06
504257,0.000000,0.560000,0.260000,0.180000,0.000000,3.106476e+06
504258,0.000000,0.510000,0.370000,0.120000,0.000000,3.106476e+06
504259,0.090000,0.440000,0.360000,0.110000,0.000000,3.106476e+06


In [74]:
## Join dwx with dwy
dy = dwx_pandas.set_index('lon_lat').join(dwy_pandas.set_index('lon_lat'), on='lon_lat')

In [75]:
dy

Unnamed: 0_level_0,Bare,Grass,Lichen,Shrub,Tree
lon_lat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.559532e+06,,,,,
1.559532e+06,,,,,
1.559532e+06,,,,,
1.559532e+06,,,,,
1.559532e+06,0.0,0.34,0.01,0.22,0.43
...,...,...,...,...,...
3.088884e+06,,,,,
3.088976e+06,0.0,0.47,0.28,0.14,0.11
3.088976e+06,0.0,0.33,0.29,0.19,0.19
3.088976e+06,0.0,0.33,0.31,0.13,0.23


In [76]:
# Replace NaNs by 0
for col in ['Bare', 'Grass', 'Lichen', 'Shrub', 'Tree']:
    print(col)
    dy[col] = dy[col].fillna(0)

Bare
Grass
Lichen
Shrub
Tree


In [77]:
dy = dy.reset_index().drop(columns=['lon_lat'])

In [78]:
dy

Unnamed: 0,Bare,Grass,Lichen,Shrub,Tree
0,0.0,0.00,0.00,0.00,0.00
1,0.0,0.00,0.00,0.00,0.00
2,0.0,0.00,0.00,0.00,0.00
3,0.0,0.00,0.00,0.00,0.00
4,0.0,0.34,0.01,0.22,0.43
...,...,...,...,...,...
139743,0.0,0.00,0.00,0.00,0.00
139744,0.0,0.47,0.28,0.14,0.11
139745,0.0,0.33,0.29,0.19,0.19
139746,0.0,0.33,0.31,0.13,0.23


In [79]:
## Save into **local** HDF5 file without index
y_filename = os.path.join(path, 'y_' + str(Year) + '.hdf')
print(y_filename)
dy.to_hdf(y_filename, key='dg', mode="w", index=False)

/home/jovyan/Arctic/Vegetation_in_Troms_and_Finnmark/data/y_2015.hdf


In [80]:
print('Finished!')

Finished!
