In [None]:
import sys, os
cwd=os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, cwd)
import utils.s3_utils as s3
import pandas as pd
import shapefile as shp
from shapely.geometry import Point
from shapely.geometry.polygon import Point, Polygon
import geopandas as gpd
import re
import matplotlib.pyplot as plt
import rioxarray as rxr
import xarray as xr
from scipy.spatial.distance import cdist
import folium
import utils.processing as pr
import os
import rasterio
pd.set_option('display.max_columns', None)
import datetime

# Lookup from the short climate-variable codes (the values used for
# `data_col` in this notebook) to descriptive feature labels.
label_dict = dict(
    fg='windSpeed',
    tg='meanTemp',
    tn='minTemp',
    tx='maxTemp',
    rr='Precip',
    hu='relHumidity',
    qq='meanRadiation',
)


### TG - Daily Avg Temperature

In [None]:
# Yearly mean of the daily average temperature (tg) for every grid cell.
data_col = 'tg'
aggtype = 'yearly-mean'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # One mean per (longitude, latitude, yearly bucket); "AS-DEC" anchors the
    # yearly buckets on 1 December.
    # NOTE(review): if each file holds a single calendar year, that anchor splits
    # it into a Jan-Nov row and a Dec row, and the concat below does not
    # re-combine them across files -- confirm this is intended.
    yearly = (
        main_df
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])[data_col]
        .mean()
        .reset_index()
    )
    yearly.columns = ['longitude', 'latitude', 'time', aggtype]
    agg_df.append(yearly)

# Stack the per-file results and persist them as a single feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")

# Release the large frames before the next cell runs.
del df, agg_df

In [None]:
# June mean temperature (tg) per grid cell and year.
aggtype = 'june-mean'
agg_df = []
data_col = 'tg'
for year in range(2000, 2023):
    weather_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # Monthly mean temperature; freq="M" labels each group with its month-end date.
    df = main_df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="M")]).agg({data_col: ['mean']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', data_col]

    # Keep only June (month 6). The previous filter used month == 1, which
    # silently wrote January means into the 'june-mean' feature.
    june_df = df[df.time.dt.month == 6]

    # Collapse onto the Dec-anchored yearly bucket used by the other features.
    df = june_df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")]).agg({data_col: ['mean']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


In [None]:
# Proportion of days in a month with mean temperature > 5 (Celsius, per the
# feature name), averaged over each yearly bucket.

aggtype = 'days-above-5degC-monthly-ratio'
agg_df = []
data_col = 'tg'
for year in range(2000, 2023):
    weather_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # df1: number of days per month above the threshold. Days at or below it
    # are flagged 0 (not dropped) so they still count towards the denominator.
    # Vectorised comparison replaces the row-wise apply; NaN compares False -> 0,
    # exactly as the lambda did.
    df1 = main_df.copy()
    df1[data_col] = (df1[data_col] > 5).astype(int)
    df1 = df1.groupby(by=['longitude', 'latitude', pd.Grouper(key='time', freq="M")]).sum().reset_index()

    # df2: number of days per month that have a (non-null) value.
    df2 = main_df.groupby(by=['longitude', 'latitude', pd.Grouper(key='time', freq="M")]).count().reset_index()

    # Merge numerator and denominator; pandas suffixes the clashing value
    # column with _x (left, df1) and _y (right, df2).
    above_col, total_col = f'{data_col}_x', f'{data_col}_y'
    df = df1.merge(df2, on=['latitude', 'longitude', 'time'], how='inner')
    df[data_col] = df[above_col] / df[total_col]
    df = df.drop([above_col, total_col], axis=1)

    # Average the monthly ratios within each Dec-anchored yearly bucket.
    df = df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")]).agg({data_col: ['mean']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)

df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


In [None]:
# Mean temperature of the driest quarter, per grid cell and year.
#
# Fix: the loop used to overwrite `data_col` with 'rr', so from the second
# year onward the "temperature" frame was read from the precipitation file
# and averaged on 'rr'. The two variables now have their own names.

aggtype = 'mean-temp-driest-qtr'
agg_df = []
data_col = 'tg'      # temperature variable; also names the output file
precip_col = 'rr'    # precipitation variable used to pick the driest quarter
for year in range(2000, 2023):
    # df1: mean temperature per quarter. "QS-DEC" starts quarters on
    # 1 Dec / 1 Mar / 1 Jun / 1 Sep.
    # NOTE(review): if each file holds one calendar year, the Dec-Feb quarters
    # are built from partial data (Jan-Feb only, or Dec only) -- confirm.
    weather_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')
    df1 = main_df.groupby(by=['longitude', 'latitude', pd.Grouper(key='time', freq="QS-DEC")]).agg({data_col: ['mean']}).reset_index()
    df1.columns = ['longitude', 'latitude', 'time', data_col]

    # df2: total precipitation per quarter, on the same quarterly grid.
    precip_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{precip_col}/{precip_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    df2 = pd.read_parquet(precip_filename, engine='pyarrow')
    df2 = df2.groupby(by=['longitude', 'latitude', pd.Grouper(key='time', freq="QS-DEC")]).agg({precip_col: ['sum']}).reset_index()
    df2.columns = ['longitude', 'latitude', 'time', precip_col]

    # A quarter starting in December is assigned to the following year.
    df2['yr'] = df2['time'].dt.year + (df2['time'].dt.month == 12).astype(int)

    # Keep, for every cell and year, the row of the quarter with the lowest
    # precipitation total (first one on ties).
    driest_idx = df2.groupby(by=['longitude', 'latitude', 'yr'])[precip_col].idxmin()
    df2 = df2.loc[driest_idx]

    # Right-merge so only the driest quarters keep their mean temperature.
    df = df1.merge(df2, how='right', on=['latitude', 'longitude', 'time'])
    df = df.drop([precip_col, 'yr'], axis=1)
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)

df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


### TX - Daily Max Temp

In [None]:
# Monthly maximum of the daily max temperature (tx), averaged over each
# yearly bucket.

data_col = 'tx'
aggtype = 'monthly-max'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # Step 1: highest value observed in each month, per grid cell.
    monthly = (
        main_df
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="M")])[data_col]
        .max()
        .reset_index()
    )

    # Step 2: mean of those monthly maxima per Dec-anchored yearly bucket.
    yearly = (
        monthly
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])[data_col]
        .mean()
        .reset_index()
    )
    yearly.columns = ['longitude', 'latitude', 'time', aggtype]
    agg_df.append(yearly)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


In [None]:
# Total warm "spring" days per year: days in months 1-4 whose daily max
# temperature (tx) exceeds 40 F.
# NOTE(review): the feature is called "spring" but selects January-April -- confirm.

aggtype = 'spring-days-above-40f'
agg_df = []
data_col = 'tx'
# Threshold converted once up front instead of once per row.
# Presumably pr.fToC converts Fahrenheit to Celsius -- see utils.processing.
warm_threshold = pr.fToC(40)
for year in range(2000, 2023):
    weather_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # .copy() gives an independent frame, so the flag assignment below does not
    # write onto a slice of main_df (avoids SettingWithCopyWarning).
    df = main_df[main_df['time'].dt.month.isin([1, 2, 3, 4])].copy()

    # Flag qualifying days with 1 and the rest with 0 (rather than dropping
    # them) so every grid cell keeps a row. Vectorised equivalent of the
    # former row-wise apply; NaN compares False -> 0.
    df[data_col] = (df[data_col] > warm_threshold).astype(int)

    # Sum the flags per Dec-anchored yearly bucket.
    df = df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")]).agg({data_col: ['sum']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)

df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


In [None]:
# Total hot summer days per year: days in months 6-8 whose daily max
# temperature (tx) exceeds 85 F.

aggtype = 'summer-days-above-85f'
agg_df = []
data_col = 'tx'
# Threshold converted once up front instead of once per row.
# Presumably pr.fToC converts Fahrenheit to Celsius -- see utils.processing.
hot_threshold = pr.fToC(85)
for year in range(2000, 2023):
    weather_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # Restrict to the summer months; .copy() so the flag assignment below
    # targets an independent frame (avoids SettingWithCopyWarning).
    df = main_df[main_df['time'].dt.month.isin([6, 7, 8])].copy()

    # 1 if the day is above the threshold, else 0. Vectorised equivalent of the
    # former row-wise apply; NaN compares False -> 0.
    df[data_col] = (df[data_col] > hot_threshold).astype(int)

    # Sum the flags per Dec-anchored yearly bucket.
    df = df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")]).agg({data_col: ['sum']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)

df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


### TN - Daily Min Temp

In [None]:
# Monthly minimum of the daily min temperature (tn), averaged over each
# yearly bucket.

data_col = 'tn'
aggtype = 'monthly-min'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # Lowest value seen in each month, per grid cell.
    by_month = main_df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="M")])
    monthly = by_month[data_col].min().reset_index()

    # Mean of the monthly minima per Dec-anchored yearly bucket.
    by_year = monthly.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])
    yearly = by_year[data_col].mean().reset_index()
    yearly.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(yearly)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


In [None]:
# Minimum December temperature: keep month 12 only, then take the lowest
# daily min temperature (tn) per grid cell.

data_col = 'tn'
aggtype = 'dec-min'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # December rows only.
    is_december = main_df['time'].dt.month.isin([12])
    december = main_df[is_december]

    # Lowest December value per Dec-anchored yearly bucket.
    dec_min = (
        december
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])[data_col]
        .min()
        .reset_index()
    )
    dec_min.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(dec_min)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


### Temperature ranges

In [None]:
# Mean annual temperature range: daily (tx - tn), averaged per grid cell over
# each yearly bucket.
#
# Cleanup: the daily range used to be computed twice per iteration and
# `data_col` was recycled for three different meanings; the inputs now have
# their own loop variable and the range is computed once.

aggtype = 'range-annual-mean'
data_col = 'tn-tx'   # name of the derived range column; also names the output file
agg_df = []
for year in range(2000, 2023):
    print(year)

    # Load the daily max (tx) and daily min (tn) files for this year.
    frames = {}
    for source_col in ('tx', 'tn'):
        weather_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{source_col}/{source_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
        frames[source_col] = pd.read_parquet(weather_filename, engine='pyarrow')

    # Inner merge on cell and day so only days present in both inputs remain.
    merged = frames['tn'].merge(frames['tx'], how='inner', on=['latitude', 'longitude', 'time'])
    del frames  # free the two large inputs as soon as they are merged

    # Daily temperature range.
    merged[data_col] = merged['tx'] - merged['tn']

    # Mean of the daily range per Dec-anchored yearly bucket.
    df = merged.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")]).agg({data_col: ['mean']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)

df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


In [None]:
# Monthly average of the daily temperature variation: daily (tx - tn),
# averaged by month, then averaged over each yearly bucket.
#
# Cleanup: `data_col = 'tn-tx'` and the range computation were each repeated
# inside the loop; both now happen once.

aggtype = 'daily-temp-range-monthly-mean'
data_col = 'tn-tx'   # name of the derived range column; also names the output file
agg_df = []
for year in range(2000, 2023):
    print(year)

    # Load the daily max (tx) and daily min (tn) files for this year.
    frames = {}
    for source_col in ('tx', 'tn'):
        weather_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{source_col}/{source_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
        frames[source_col] = pd.read_parquet(weather_filename, engine='pyarrow')

    # Inner merge on cell and day so only days present in both inputs remain.
    merged = frames['tn'].merge(frames['tx'], how='inner', on=['latitude', 'longitude', 'time'])
    del frames  # free the two large inputs as soon as they are merged

    # Daily temperature range.
    merged[data_col] = merged['tx'] - merged['tn']

    # Mean daily range within each month...
    df = merged.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="M")]).agg({data_col: ['mean']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', data_col]

    # ...then the mean of those monthly values per Dec-anchored yearly bucket.
    df = df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")]).agg({data_col: ['mean']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)

df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


### RR - Precipitation

In [None]:
# Total precipitation (rr) per grid cell and yearly bucket.

data_col = 'rr'
aggtype = 'yearly-sum'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # Sum of the daily values per Dec-anchored yearly bucket.
    by_year = main_df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])
    yearly_total = by_year[data_col].sum().reset_index()
    yearly_total.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(yearly_total)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")


In [None]:
# Mean daily precipitation (rr) per grid cell and yearly bucket.

data_col = 'rr'
aggtype = 'yearly-mean'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow')

    # Average of the daily values per Dec-anchored yearly bucket.
    yearly_mean = (
        main_df
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])[data_col]
        .mean()
        .reset_index()
    )
    yearly_mean.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(yearly_mean)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")

In [None]:
# Total January precipitation (rr) per grid cell and year.

data_col = 'rr'
aggtype = 'january-sum'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow').reset_index(drop=True)
    print(year)

    # Restrict to January before aggregating.
    january = main_df[main_df['time'].dt.month.isin([1])]

    # Sum the January values per Dec-anchored yearly bucket.
    df = (
        january
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])[data_col]
        .sum()
        .reset_index()
    )
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)
    del df

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")

In [None]:
# Total wet days per year: discard days whose rainfall is exactly 0, then
# count the remaining (non-null) values.

data_col = 'rr'
aggtype = 'total-wet-days'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow').reset_index(drop=True)
    print(year)

    # Days with any recorded rainfall.
    wet_days = main_df[main_df['rr'] != 0]

    # Number of such days per Dec-anchored yearly bucket.
    wet_day_count = (
        wet_days
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])[data_col]
        .count()
        .reset_index()
    )
    wet_day_count.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(wet_day_count)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")

In [None]:
# Summer wet days: days with precipitation above 0.05 inch and up to 1 inch.

# Select the summer months (6, 7, 8), flag each day 1 if
# 0.05 in < precip <= 1 in else 0, then sum the flags per year.
aggtype = 'wet-summer-days'
agg_df = []
data_col = 'rr'
# Bounds converted once up front instead of once per row.
# Presumably pr.inch2mm converts inches to millimetres -- see utils.processing.
lower_bound = pr.inch2mm(.05)
upper_bound = pr.inch2mm(1)
for year in range(2000, 2023):
    weather_filename = f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/' + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    main_df = pd.read_parquet(weather_filename, engine='pyarrow').reset_index(drop=True)
    print(year)

    # .copy() so the flag assignment below targets an independent frame rather
    # than a slice of main_df (avoids SettingWithCopyWarning).
    df = main_df[main_df['time'].dt.month.isin([6, 7, 8])].copy()

    # Vectorised equivalent of the former row-wise apply; NaN fails both
    # comparisons and is flagged 0, as before.
    in_range = (df[data_col] > lower_bound) & (df[data_col] <= upper_bound)
    df[data_col] = in_range.astype(int)

    # Sum the flags per Dec-anchored yearly bucket.
    df = df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")]).agg({data_col: ['sum']}).reset_index()
    df.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(df)

df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")

### HU - Relative Humidity

In [None]:
# Monthly minimum relative humidity (hu), averaged over each yearly bucket.

data_col = 'hu'
aggtype = 'monthly-min'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow').reset_index(drop=True)
    print(year)

    # Lowest value in each month, per grid cell.
    monthly = (
        main_df
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="M")])[data_col]
        .min()
        .reset_index()
    )

    # Mean of the monthly minima per Dec-anchored yearly bucket.
    yearly = (
        monthly
        .groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])[data_col]
        .mean()
        .reset_index()
    )
    yearly.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(yearly)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")

In [None]:
# Monthly mean relative humidity (hu), averaged over each yearly bucket.

data_col = 'hu'
aggtype = 'monthly-mean'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow').reset_index(drop=True)
    print(year)

    # Average value within each month, per grid cell.
    by_month = main_df.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="M")])
    monthly = by_month[data_col].mean().reset_index()

    # Mean of the monthly means per Dec-anchored yearly bucket.
    by_year = monthly.groupby(['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")])
    yearly = by_year[data_col].mean().reset_index()
    yearly.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(yearly)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")

In [None]:
# Monthly maximum relative humidity (hu), averaged over each yearly bucket.

data_col = 'hu'
aggtype = 'monthly-max'
agg_df = []
for year in range(2000, 2023):
    weather_filename = (
        f'../../data/climate/0.1deg/eobs-eu-climate-parquet-2000-2023/'
        + f'{data_col}/{data_col}_ens_mean_0.1deg_reg_{year}_v29.0e.parquet'
    )
    main_df = pd.read_parquet(weather_filename, engine='pyarrow').reset_index(drop=True)
    print(year)

    # Highest value in each month, per grid cell.
    month_keys = ['longitude', 'latitude', pd.Grouper(key='time', freq="M")]
    monthly = main_df.groupby(month_keys)[data_col].max().reset_index()

    # Mean of the monthly maxima per Dec-anchored yearly bucket.
    year_keys = ['longitude', 'latitude', pd.Grouper(key='time', freq="AS-DEC")]
    yearly = monthly.groupby(year_keys)[data_col].mean().reset_index()
    yearly.columns = ['longitude', 'latitude', 'time', aggtype]

    agg_df.append(yearly)

# Stack the per-file results and write the feature file.
df = pd.concat(agg_df).reset_index(drop=True)
write_filename = f'../../data/feature_engineering/0.10deg/{data_col}-{aggtype}-by-year.parquet'
df.to_parquet(write_filename, compression='snappy')
print(f"{data_col}, {aggtype} written to local disk at: {write_filename}")