# Prepare labelled input for the Machine Learning algorithm
# (i.e. locations where moss&lichen fractional cover changes can be related to meteorological parameters from ERA5-Land)

# Copernicus Global Land Cover
## Percentage of 100m pixel that is covered by a specific class of land cover
## Valid values 0-100, 200 = masked sea, 255 = missing

In [1]:
!date

Mon Mar 13 16:39:50 UTC 2023


In [2]:
pip install vaex

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import os
import pandas as pd
import s3fs
import xarray as xr
import vaex

# Input datasets, either from s3 storage or local files

# If data is available locally then jump the following cells

### Define s3 storage parameters

In [None]:
client_kwargs={'endpoint_url': 'https://object-store.cloud.muni.cz'}
store = s3fs.S3FileSystem(anon=False, client_kwargs=client_kwargs)
store.ls('Data', detail=True, refresh=True)

## Copernicus Global Land Cover data  from 2015-01-01 to 2019-12-31 already available as a netCDF file stored on EOSC (CESNET)
## Troms og Finnmark
### Mosses and lichens, bare, grass, shrubs and trees

In [None]:
s3path = 'Data/C_GlobalLandCover_20150101_20190101_Troms-Finnmark.nc'

In [None]:
GLC_AOI = xr.open_dataset(store.open(s3path))

## ERA5-land data from 2015-01-01 to 2019-12-31 - already available as a netCDF file stored on EOSC (CESNET)
## 2m Temperature, Snow depth, Total precipitation

In [None]:
s3path = 'Data/reanalysis-era5-land_hourly_2015-01-01_2022-12-31_Troms-Finnmark_T2m-SD-TP.nc'

In [None]:
ERA5land = xr.open_dataset(store.open(s3path))

In [None]:
ERA5land

# Datasets from **local** files

In [4]:
path = '/home/jovyan/Arctic/Vegetation_in_Troms_and_Finnmark/data/'

In [5]:
# World Land cover data from 2015-01-01 to 2019-12-31- already available as a netCDF file stored locally
GLC_filename = os.path.join(path, 'C_GlobalLandCover_20150101_20190101_Troms-Finnmark.nc')
GLC_AOI = xr.open_dataset(GLC_filename, engine = 'netcdf4')

In [6]:
# ERA5-land data already available as a netCDF file stored locally
ERA5_filename = os.path.join(path, 'reanalysis-era5-land_hourly_2015-01-01_2022-12-31_Troms-Finnmark_T2m-SD-TP.nc')
ERA5land = xr.open_dataset(ERA5_filename, engine = 'netcdf4')

In [7]:
GLC_AOI = GLC_AOI.rename(x='lon', y='lat', t='time')

### The two cells below redefine a very small region for testing purposes only - skip them to keep the whole Troms-finnmark area

In [8]:
# Small region 
AOI_min_lon = 19.61
AOI_max_lon = 19.6185
AOI_min_lat = 69.04
AOI_max_lat = 69.049

In [9]:
GLC_AOI = GLC_AOI.sel(lat=slice(AOI_max_lat, AOI_min_lat), lon=slice(AOI_min_lon, AOI_max_lon))

In [10]:
GLC_AOI

In [11]:
# Drop variables not directly of interest here
GLC_AOI = GLC_AOI.drop_vars(['crs',
                             'Bare_CoverFraction_layer',
                             'Crops_CoverFraction_layer',
                             'Grass_CoverFraction_layer',
                             'Discrete_Classification_map', 
                             'Discrete_Classification_proba',
                             'Forest_Type_layer',
                             'Shrub_CoverFraction_layer',
                             'Snow_CoverFraction_layer',
                             'Tree_CoverFraction_layer',
                             'BuiltUp_CoverFraction_layer',
                             'PermanentWater_CoverFraction_layer',
                             'SeasonalWater_CoverFraction_layer',
                             'DataDensityIndicator',
                             'Change_Confidence_layer',
                             'dataMask'])

In [12]:
GLC_AOI = GLC_AOI.rename(MossLichen_CoverFraction_layer = 'Lichen')

In [13]:
GLC_AOI

In [14]:
# Troms & Finnmark Global Land Cover area
GLC_AOI_min_lon = GLC_AOI.lon.min()
GLC_AOI_max_lon = GLC_AOI.lon.max()
GLC_AOI_min_lat = GLC_AOI.lat.min()
GLC_AOI_max_lat = GLC_AOI.lat.max()
print(GLC_AOI_min_lon.values, GLC_AOI_max_lon.values, GLC_AOI_min_lat.values, GLC_AOI_max_lat.values)

19.610128317778138 19.618461651111474 69.04042516681473 69.04875850014807


In [15]:
mask = GLC_AOI['Lichen'].where(GLC_AOI['Lichen'] <= 100)

In [16]:
mask = xr.where(mask > 0, 1, 0)

In [17]:
mask = mask.sum(dim = 'time', min_count = 5, skipna=True)

In [18]:
mask = xr.where(mask == 5, 1, 0)

## Use the mask to only keep pixels with lichen every year

In [20]:
de = GLC_AOI.where(mask == 1)

In [21]:
de

In [22]:
de = de.to_dataframe()

In [23]:
de

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Lichen
time,lat,lon,Unnamed: 3_level_1
2015-01-01,69.048759,19.610128,
2015-01-01,69.048759,19.611054,
2015-01-01,69.048759,19.611980,
2015-01-01,69.048759,19.612906,
2015-01-01,69.048759,19.613832,
...,...,...,...
2019-01-01,69.040425,19.614758,
2019-01-01,69.040425,19.615684,
2019-01-01,69.040425,19.616610,
2019-01-01,69.040425,19.617536,


In [24]:
# Drop the Rows with NaN Values
de = de.dropna()

In [25]:
de = de.reset_index()

In [26]:
de

Unnamed: 0,time,lat,lon,Lichen
0,2015-01-01,69.046907,19.610128,14.0
1,2015-01-01,69.046907,19.611054,14.0
2,2015-01-01,69.046907,19.61198,14.0
3,2015-01-01,69.045981,19.610128,2.0
4,2015-01-01,69.045981,19.611054,2.0
5,2015-01-01,69.045981,19.61198,2.0
6,2015-01-01,69.045981,19.614758,20.0
7,2015-01-01,69.045981,19.615684,20.0
8,2015-01-01,69.045981,19.61661,20.0
9,2015-01-01,69.045055,19.617536,21.0


## Each year in a separate dataset and keep only the first 183 days

In [262]:
Year = 2019
Number_of_days = 183
print('x = WLC(' + str(Year)+ ') joined with ERA5land(' + str(Year + 1) + ')')
print('y = WLC(' + str(Year + 1) + ')')

x = WLC(2019) joined with ERA5land(2020)
y = WLC(2020)


In [263]:
# Only keep locations with lichen for the current year
df = de.loc[de['time'] == str(Year) + '-01-01']
dg = de.loc[de['time'] == str(Year + 1) + '-01-01']

In [264]:
df

Unnamed: 0,time,lat,lon,Lichen
44,2019-01-01,69.046907,19.610128,4.0
45,2019-01-01,69.046907,19.611054,4.0
46,2019-01-01,69.046907,19.61198,4.0
47,2019-01-01,69.045981,19.610128,3.0
48,2019-01-01,69.045981,19.611054,3.0
49,2019-01-01,69.045981,19.61198,3.0
50,2019-01-01,69.045981,19.614758,13.0
51,2019-01-01,69.045981,19.615684,13.0
52,2019-01-01,69.045981,19.61661,13.0
53,2019-01-01,69.045055,19.617536,11.0


In [265]:
dg

Unnamed: 0,time,lat,lon,Lichen


In [266]:
# Normalize the fractional cover
df['Lichen'] = df['Lichen'].div(100)
dg['Lichen'] = dg['Lichen'].div(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Lichen'] = df['Lichen'].div(100)


In [267]:
df

Unnamed: 0,time,lat,lon,Lichen
44,2019-01-01,69.046907,19.610128,0.04
45,2019-01-01,69.046907,19.611054,0.04
46,2019-01-01,69.046907,19.61198,0.04
47,2019-01-01,69.045981,19.610128,0.03
48,2019-01-01,69.045981,19.611054,0.03
49,2019-01-01,69.045981,19.61198,0.03
50,2019-01-01,69.045981,19.614758,0.13
51,2019-01-01,69.045981,19.615684,0.13
52,2019-01-01,69.045981,19.61661,0.13
53,2019-01-01,69.045055,19.617536,0.11


In [268]:
dg

Unnamed: 0,time,lat,lon,Lichen


In [269]:
# Convert to VAEX
dvx = vaex.from_pandas(df)
dvy = vaex.from_pandas(dg)

In [270]:
dvx

#,time,lat,lon,Lichen
0,2019-01-01 00:00:00.000000000,69.04690664829621,19.610128317778138,0.04
1,2019-01-01 00:00:00.000000000,69.04690664829621,19.611054243704064,0.04
2,2019-01-01 00:00:00.000000000,69.04690664829621,19.61198016962999,0.04
3,2019-01-01 00:00:00.000000000,69.04598072237029,19.610128317778138,0.03
4,2019-01-01 00:00:00.000000000,69.04598072237029,19.611054243704064,0.03
...,...,...,...,...
6,2019-01-01 00:00:00.000000000,69.04598072237029,19.614757947407767,0.13
7,2019-01-01 00:00:00.000000000,69.04598072237029,19.615683873333694,0.13
8,2019-01-01 00:00:00.000000000,69.04598072237029,19.61660979925962,0.13
9,2019-01-01 00:00:00.000000000,69.04505479644436,19.617535725185547,0.11


In [271]:
dvy

#,time,lat,lon,Lichen
,,,,


In [272]:
# Find the correspondind ERA5-land lat-lon
# Careful with the latitude, in reverse order
dvx['ERA5_lon_index'] = ((dvx.lon - 15.59) / 0.1).astype('int').values
dvx['ERA5_lat_index'] = 28 - ((dvx.lat - 68.35) / 0.1).astype('int').values
dvy['ERA5_lon_index'] = ((dvy.lon - 15.59) / 0.1).astype('int').values
dvy['ERA5_lat_index'] = 28 - ((dvy.lat - 68.35) / 0.1).astype('int').values

In [273]:
dvx

#,time,lat,lon,Lichen,ERA5_lon_index,ERA5_lat_index
0,2019-01-01 00:00:00.000000000,69.04690664829621,19.610128317778138,0.04,40,22
1,2019-01-01 00:00:00.000000000,69.04690664829621,19.611054243704064,0.04,40,22
2,2019-01-01 00:00:00.000000000,69.04690664829621,19.61198016962999,0.04,40,22
3,2019-01-01 00:00:00.000000000,69.04598072237029,19.610128317778138,0.03,40,22
4,2019-01-01 00:00:00.000000000,69.04598072237029,19.611054243704064,0.03,40,22
...,...,...,...,...,...,...
6,2019-01-01 00:00:00.000000000,69.04598072237029,19.614757947407767,0.13,40,22
7,2019-01-01 00:00:00.000000000,69.04598072237029,19.615683873333694,0.13,40,22
8,2019-01-01 00:00:00.000000000,69.04598072237029,19.61660979925962,0.13,40,22
9,2019-01-01 00:00:00.000000000,69.04505479644436,19.617535725185547,0.11,40,22


# Adding columns with the ERA5-land longitude and latitude to dv

In [274]:
dvx['ERA5_lon'] = ERA5land.sel(time="2015-01-01").longitude[dvx['ERA5_lon_index'].values].values
dvx['ERA5_lat'] = ERA5land.sel(time="2015-01-01").latitude[dvx['ERA5_lat_index'].values].values
dvy['ERA5_lon'] = ERA5land.sel(time="2015-01-01").longitude[dvy['ERA5_lon_index'].values].values
dvy['ERA5_lat'] = ERA5land.sel(time="2015-01-01").latitude[dvy['ERA5_lat_index'].values].values

In [275]:
dvx

#,time,lat,lon,Lichen,ERA5_lon_index,ERA5_lat_index,ERA5_lon,ERA5_lat
0,2019-01-01 00:00:00.000000000,69.04690664829621,19.610128317778138,0.04,40,22,19.59,68.95
1,2019-01-01 00:00:00.000000000,69.04690664829621,19.611054243704064,0.04,40,22,19.59,68.95
2,2019-01-01 00:00:00.000000000,69.04690664829621,19.61198016962999,0.04,40,22,19.59,68.95
3,2019-01-01 00:00:00.000000000,69.04598072237029,19.610128317778138,0.03,40,22,19.59,68.95
4,2019-01-01 00:00:00.000000000,69.04598072237029,19.611054243704064,0.03,40,22,19.59,68.95
...,...,...,...,...,...,...,...,...
6,2019-01-01 00:00:00.000000000,69.04598072237029,19.614757947407767,0.13,40,22,19.59,68.95
7,2019-01-01 00:00:00.000000000,69.04598072237029,19.615683873333694,0.13,40,22,19.59,68.95
8,2019-01-01 00:00:00.000000000,69.04598072237029,19.61660979925962,0.13,40,22,19.59,68.95
9,2019-01-01 00:00:00.000000000,69.04505479644436,19.617535725185547,0.11,40,22,19.59,68.95


## Extract ERA5 data for  the selected period of the year (when RoS events mostly occur)

In [276]:
ERA5 = ERA5land.sel(time=slice(str(Year + 1) + '-01-01', str(Year + 1)  + '-12-31'))

In [277]:
ERA5 = ERA5.isel(time=range(Number_of_days * 24))

In [278]:
ERA5 = ERA5.isel(expver = 0)

In [279]:
ERA5

In [280]:
# Extract ERA5 t2m, tp and sd fields 
ERA5_t2m = ERA5.where(ERA5['latitude'].isin(dvx['ERA5_lat'].values) & ERA5['longitude'].isin(dvx['ERA5_lon'].values))['t2m']
ERA5_tp = ERA5.where(ERA5['latitude'].isin(dvx['ERA5_lat'].values) & ERA5['longitude'].isin(dvx['ERA5_lon'].values))['tp']
ERA5_sd = ERA5.where(ERA5['latitude'].isin(dvx['ERA5_lat'].values) & ERA5['longitude'].isin(dvx['ERA5_lon'].values))['sd']

## Rain on Snow criteria (according to https://www.hydrol-earth-syst-sci.net/23/2983/2019/hess-23-2983-2019.pdf)
 * total rainfall volume of at least 20 mm within 12 h
### or 
 * air temperatures above 0C (273.15K)
 * and initial snowpack depth of at least 10 cm

In [281]:
# Normalizing temperature, total precipitation and snow depth values accordidng to these criteria
ERA5_t2m = ERA5_t2m / 273.15
ERA5_tp = ERA5_tp / 0.02 * 12.
ERA5_sd = ERA5_sd / 0.1

In [282]:
df_t2m = ERA5_t2m.stack(z=['latitude', 'longitude']).to_pandas().transpose().reset_index()
df_tp = ERA5_tp.stack(z=['latitude', 'longitude']).to_pandas().transpose().reset_index()
df_sd = ERA5_sd.stack(z=['latitude', 'longitude']).to_pandas().transpose().reset_index()

In [283]:
df_tp = df_tp.drop(columns=['latitude', 'longitude'])
df_sd = df_sd.drop(columns=['latitude', 'longitude'])

In [284]:
# Create labels for ERA5-land variables to replace the dates
label_t2m = ['latitude', 'longitude']
label_tp = list()
label_sd = list()
for i in range(Number_of_days * 24):
    label_t2m.append('t2m_'+ str(i))
    label_tp.append('tp_'+ str(i))
    label_sd.append('sd_'+ str(i))

In [285]:
df_t2m.set_axis(label_t2m, axis="columns", inplace=True)
df_tp.set_axis(label_tp, axis="columns", inplace=True)
df_sd.set_axis(label_sd, axis="columns", inplace=True)

In [286]:
##  Glue together df_t2m, df_tp and df_sd
df = pd.concat([df_t2m, df_tp, df_sd], axis = 1)

In [287]:
df

Unnamed: 0,latitude,longitude,t2m_0,t2m_1,t2m_2,t2m_3,t2m_4,t2m_5,t2m_6,t2m_7,...,sd_4382,sd_4383,sd_4384,sd_4385,sd_4386,sd_4387,sd_4388,sd_4389,sd_4390,sd_4391
0,71.150002,15.590000,,,,,,,,,...,,,,,,,,,,
1,71.150002,15.690000,,,,,,,,,...,,,,,,,,,,
2,71.150002,15.790000,,,,,,,,,...,,,,,,,,,,
3,71.150002,15.890000,,,,,,,,,...,,,,,,,,,,
4,71.150002,15.990000,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4490,68.349998,30.590000,,,,,,,,,...,,,,,,,,,,
4491,68.349998,30.690001,,,,,,,,,...,,,,,,,,,,
4492,68.349998,30.790001,,,,,,,,,...,,,,,,,,,,
4493,68.349998,30.889999,,,,,,,,,...,,,,,,,,,,


In [288]:
# Add combined lon_lat column to df
df['lon_lat'] = (df['longitude'] * 100).astype('int') + (df['latitude'] * 100).astype('int') / 100000

In [289]:
# Drop latitude and longitude columns which are not used anymore in df
df = df.drop(columns=['latitude', 'longitude'])

In [290]:
df

Unnamed: 0,t2m_0,t2m_1,t2m_2,t2m_3,t2m_4,t2m_5,t2m_6,t2m_7,t2m_8,t2m_9,...,sd_4383,sd_4384,sd_4385,sd_4386,sd_4387,sd_4388,sd_4389,sd_4390,sd_4391,lon_lat
0,,,,,,,,,,,...,,,,,,,,,,1559.07115
1,,,,,,,,,,,...,,,,,,,,,,1568.07115
2,,,,,,,,,,,...,,,,,,,,,,1578.07115
3,,,,,,,,,,,...,,,,,,,,,,1589.07115
4,,,,,,,,,,,...,,,,,,,,,,1598.07115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4490,,,,,,,,,,,...,,,,,,,,,,3059.06834
4491,,,,,,,,,,,...,,,,,,,,,,3069.06834
4492,,,,,,,,,,,...,,,,,,,,,,3079.06834
4493,,,,,,,,,,,...,,,,,,,,,,3088.06834


In [291]:
# Add combined lon_lat column to dv x & y
dvx['lon_lat'] = (dvx['ERA5_lon'] * 100).astype('int') + (dvx['ERA5_lat'] * 100).astype('int') / 100000

In [292]:
dvx

#,time,lat,lon,Lichen,ERA5_lon_index,ERA5_lat_index,ERA5_lon,ERA5_lat,lon_lat
0,2019-01-01 00:00:00.000000000,69.04690664829621,19.610128317778138,0.04,40,22,19.59,68.95,1959.06894
1,2019-01-01 00:00:00.000000000,69.04690664829621,19.611054243704064,0.04,40,22,19.59,68.95,1959.06894
2,2019-01-01 00:00:00.000000000,69.04690664829621,19.61198016962999,0.04,40,22,19.59,68.95,1959.06894
3,2019-01-01 00:00:00.000000000,69.04598072237029,19.610128317778138,0.03,40,22,19.59,68.95,1959.06894
4,2019-01-01 00:00:00.000000000,69.04598072237029,19.611054243704064,0.03,40,22,19.59,68.95,1959.06894
...,...,...,...,...,...,...,...,...,...
6,2019-01-01 00:00:00.000000000,69.04598072237029,19.614757947407767,0.13,40,22,19.59,68.95,1959.06894
7,2019-01-01 00:00:00.000000000,69.04598072237029,19.615683873333694,0.13,40,22,19.59,68.95,1959.06894
8,2019-01-01 00:00:00.000000000,69.04598072237029,19.61660979925962,0.13,40,22,19.59,68.95,1959.06894
9,2019-01-01 00:00:00.000000000,69.04505479644436,19.617535725185547,0.11,40,22,19.59,68.95,1959.06894


In [293]:
# Drop unused columns in dv x & y
dwx = dvx.drop(columns=['time', 'ERA5_lon_index', 'ERA5_lat_index', 'ERA5_lon', 'ERA5_lat'])
dwy = dvy.drop(columns=['time', 'ERA5_lon_index', 'ERA5_lat_index', 'ERA5_lon', 'ERA5_lat'])

In [294]:
dwx

#,lat,lon,Lichen,lon_lat
0,69.04690664829621,19.610128317778138,0.04,1959.06894
1,69.04690664829621,19.611054243704064,0.04,1959.06894
2,69.04690664829621,19.61198016962999,0.04,1959.06894
3,69.04598072237029,19.610128317778138,0.03,1959.06894
4,69.04598072237029,19.611054243704064,0.03,1959.06894
...,...,...,...,...
6,69.04598072237029,19.614757947407767,0.13,1959.06894
7,69.04598072237029,19.615683873333694,0.13,1959.06894
8,69.04598072237029,19.61660979925962,0.13,1959.06894
9,69.04505479644436,19.617535725185547,0.11,1959.06894


In [295]:
dwy

#,lat,lon,Lichen
,,,


In [296]:
# Convert to panda dw x & y
dwx_pandas = dwx.to_pandas_df()
dwy_pandas = dwy.to_pandas_df()

## Join dwx (WLC) with df (ERA5 t2m-tp-sd)

In [297]:
dwx_pandas

Unnamed: 0,lat,lon,Lichen,lon_lat
0,69.046907,19.610128,0.04,1959.06894
1,69.046907,19.611054,0.04,1959.06894
2,69.046907,19.61198,0.04,1959.06894
3,69.045981,19.610128,0.03,1959.06894
4,69.045981,19.611054,0.03,1959.06894
5,69.045981,19.61198,0.03,1959.06894
6,69.045981,19.614758,0.13,1959.06894
7,69.045981,19.615684,0.13,1959.06894
8,69.045981,19.61661,0.13,1959.06894
9,69.045055,19.617536,0.11,1959.06894


In [298]:
# Join dwx (WLC) with df (ERA5 t2m-tp-sd)
dx = dwx_pandas.set_index('lon_lat').join(df.set_index('lon_lat'), on='lon_lat')

In [299]:
dx

Unnamed: 0_level_0,lat,lon,Lichen,t2m_0,t2m_1,t2m_2,t2m_3,t2m_4,t2m_5,t2m_6,...,sd_4382,sd_4383,sd_4384,sd_4385,sd_4386,sd_4387,sd_4388,sd_4389,sd_4390,sd_4391
lon_lat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1959.06894,69.046907,19.610128,0.04,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.046907,19.611054,0.04,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.046907,19.61198,0.04,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.045981,19.610128,0.03,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.045981,19.611054,0.03,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.045981,19.61198,0.03,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.045981,19.614758,0.13,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.045981,19.615684,0.13,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.045981,19.61661,0.13,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1959.06894,69.045055,19.617536,0.11,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739


In [300]:
# Drop the Rows with NaN Values
dx = dx.dropna()

In [301]:
dx = dx.reset_index()

In [302]:
dx = dx.drop(columns=['lon_lat'])

In [303]:
dx

Unnamed: 0,lat,lon,Lichen,t2m_0,t2m_1,t2m_2,t2m_3,t2m_4,t2m_5,t2m_6,...,sd_4382,sd_4383,sd_4384,sd_4385,sd_4386,sd_4387,sd_4388,sd_4389,sd_4390,sd_4391
0,69.046907,19.610128,0.04,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
1,69.046907,19.611054,0.04,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
2,69.046907,19.61198,0.04,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
3,69.045981,19.610128,0.03,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
4,69.045981,19.611054,0.03,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
5,69.045981,19.61198,0.03,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
6,69.045981,19.614758,0.13,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
7,69.045981,19.615684,0.13,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
8,69.045981,19.61661,0.13,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739
9,69.045055,19.617536,0.11,0.978251,0.977996,0.975853,0.970898,0.965369,0.965886,0.968435,...,0.005791,0.005791,0.005265,0.005265,0.005265,0.005265,0.005265,0.004739,0.004739,0.004739


In [304]:
## Save into **local** HDF5 file without index
x_filename = os.path.join(path, 'x_tps_' + str(Year) + '.hdf')
print(x_filename)
dx.to_hdf(x_filename, key='df', mode="w", index=False)

/home/jovyan/Arctic/Vegetation_in_Troms_and_Finnmark/data/x_tps_2019.hdf


## Find locations with lichen in the following year corresponding to those in current year

In [305]:
dwx_pandas = dx[['lon', 'lat']]

In [306]:
dwx_pandas

Unnamed: 0,lon,lat
0,19.610128,69.046907
1,19.611054,69.046907
2,19.61198,69.046907
3,19.610128,69.045981
4,19.611054,69.045981
5,19.61198,69.045981
6,19.614758,69.045981
7,19.615684,69.045981
8,19.61661,69.045981
9,19.617536,69.045055


In [307]:
dwy_pandas

Unnamed: 0,lat,lon,Lichen


In [308]:
# Add combined lat-lon column to dv x & y
dwx_pandas['lon_lat'] = (dwx_pandas['lon'] * 100000).astype('int') + (dwx_pandas['lat'] * 100000).astype('int') / 10000000
dwy_pandas['lon_lat'] = (dwy_pandas['lon'] * 100000).astype('int') + (dwy_pandas['lat'] * 100000).astype('int') / 10000000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dwx_pandas['lon_lat'] = (dwx_pandas['lon'] * 100000).astype('int') + (dwx_pandas['lat'] * 100000).astype('int') / 10000000


In [309]:
dwx_pandas = dwx_pandas.drop(columns=['lon', 'lat'])
dwy_pandas = dwy_pandas.drop(columns=['lon', 'lat'])

In [310]:
dwx_pandas

Unnamed: 0,lon_lat
0,1961013.0
1,1961106.0
2,1961199.0
3,1961013.0
4,1961106.0
5,1961199.0
6,1961476.0
7,1961569.0
8,1961661.0
9,1961754.0


In [311]:
dwy_pandas

Unnamed: 0,Lichen,lon_lat


In [312]:
## Join dwx with dwy
dy = dwx_pandas.set_index('lon_lat').join(dwy_pandas.set_index('lon_lat'), on='lon_lat')

In [313]:
dy

Unnamed: 0_level_0,Lichen
lon_lat,Unnamed: 1_level_1
1961013.0,
1961106.0,
1961199.0,
1961013.0,
1961106.0,
1961199.0,
1961476.0,
1961569.0,
1961661.0,
1961754.0,


In [314]:
dy = dy.reset_index().drop(columns=['lon_lat'])

In [315]:
dy.rename(columns = {'Lichen' : 'new_Lichen'}, inplace = True)

In [316]:
dy

Unnamed: 0,new_Lichen
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


In [317]:
## Save into **local** HDF5 file without index
y_filename = os.path.join(path, 'y_' + str(Year) + '.hdf')
print(y_filename)
dy.to_hdf(y_filename, key='dg', mode="w", index=False)

/home/jovyan/Arctic/Vegetation_in_Troms_and_Finnmark/data/y_2019.hdf


In [318]:
print('Finished!')

Finished!
