# Append History Data
In this notebook, the output of the `pseudo-absence generation` step is further processed: for every temporal variable, the values of the observation day and of the 95 preceding days are appended to each record.

To achieve this, the NASA dataset is written to a database, then queries are made to fetch data of interest.

### Create Database

In [None]:
import pandas as pd
import sqlalchemy as db
from dateutil.relativedelta import relativedelta 
import glob
import xarray
import multiprocessing as mp
from functools import partial
import os

import warnings
warnings.filterwarnings("ignore")

In [None]:
# notebook-wide settings

# root directory that holds the NASA GLDAS files
NASA_basePath = "/mnt/disks/nasa/NASA"

# time-dependent variables taken from the NASA files; every one of them
# later receives its own set of per-day history columns
temporal_variables = [
    "AvgSurfT_inst",
    "Albedo_inst",
    "SoilMoi0_10cm_inst",
    "SoilMoi10_40cm_inst",
    "SoilTMP0_10cm_inst",
    "SoilTMP10_40cm_inst",
    "Tveg_tavg",
    "Wind_f_inst",
    "Rainf_f_tavg",
    "Tair_f_inst",
    "Qair_f_inst",
    "Psurf_f_inst",
]

In [None]:
# geo location gridding

# size of one grid cell in degrees (longitude step, latitude step)
resx, resy = (0.25, 0.25)


def lat_to_bucket_id(x):
    """Return the integer grid row of latitude ``x`` (degrees).

    Row 0 starts at -90. ``int()`` truncates toward zero, so the result is
    only a proper floor for ``x >= -90``.
    """
    return int((x + 90) / resy)


def lon_to_bucket_id(x):
    """Return the integer grid column of longitude ``x`` (degrees).

    Column 0 starts at -180. ``int()`` truncates toward zero, so the result
    is only a proper floor for ``x >= -180``.
    """
    return int((x + 180) / resx)


def bucket_id_to_lat(x):
    """Return the latitude of the lower edge of grid row ``x``."""
    return (x * resy) - 90


def bucket_id_to_lon(x):
    """Return the longitude of the lower edge of grid column ``x``."""
    return (x * resx) - 180

# date arithmetic

def add_days(current_index, days):
    """Shift the date part of an index entry by ``days`` days.

    ``current_index`` is indexed positionally: element 0 is converted with
    ``pd.to_datetime`` and moved by ``days`` (negative values go back in
    time); elements 1 and 2 are passed through unchanged.

    Returns a 3-tuple ``(shifted_date, current_index[1], current_index[2])``.
    """
    origin = pd.to_datetime(current_index[0])
    shifted = origin + relativedelta(days=days)
    return shifted, current_index[1], current_index[2]

In [None]:
# setting up database and table

table_name = "nasa_noah_data"
# three slashes: the SQLite file path is relative to the working directory
engine = db.create_engine('sqlite:///NASA_GLDAS_NOAH025_3H.db')
connection = engine.connect()
metadata = db.MetaData()
# Reflect the column definitions from the database. `autoload_with` alone
# triggers reflection; the separate `autoload=True` flag is deprecated.
# NOTE: reflection needs the table to exist already, so on a fresh database
# the "writing all NASA data to database" cell has to be run first and this
# cell re-run afterwards.
nasa_noah_data = db.Table(table_name, metadata, autoload_with=engine)

In [None]:
# Do THIS ONLY ONCE
# writing all NASA data to database
# (rows are appended with if_exists='append', so running this cell a second
# time stores every record twice)

db_start_date = pd.to_datetime("2000-01-01")
db_end_date = pd.to_datetime("2021-12-31")

current_date = db_start_date
j = 0  # index value of the next row to write; keeps the stored index unique
while current_date <= db_end_date:
    if current_date.is_year_start:
        print(current_date)  # coarse progress indicator, once per year
    year, month, day = current_date.year, current_date.month, current_date.day
    base_name = f"{NASA_basePath}/GLDAS_NOAH025_3H.A{year}{month:02d}"
    # matches every file of that day
    files_pattern = f"{base_name}{day:02d}*.nc4"
    try:
        # the context manager closes the opened files again; the dataframe is
        # built inside the block so the values are loaded before that happens
        with xarray.open_mfdataset(files_pattern, parallel=True) as dataset:
            # collapse the time steps of the day into one daily mean
            daily_mean = dataset.mean(dim="time", skipna=True)
            data = daily_mean[temporal_variables].to_dataframe().dropna(axis=0, how='all').reset_index()
        data['lat_bucket_id'] = data['lat'].apply(lat_to_bucket_id)
        data['lon_bucket_id'] = data['lon'].apply(lon_to_bucket_id)
        data['year'] = year
        data['month'] = month
        data['day'] = day
        data["date"] = pd.to_datetime(data[['month', 'day', 'year']])
        data.index += j
        data.to_sql(table_name, engine, if_exists='append')
        # advance by the number of rows written (also correct for an empty day)
        j += len(data)
    except Exception as error:
        # Best effort: report the failing day together with its cause and go
        # on with the next day. KeyboardInterrupt is no longer swallowed.
        print(f"Cannot read {current_date} data: {error!r}")
    current_date += relativedelta(days=1)

### Append History 

In [None]:
# load the records written by the pseudo-absence generation notebook
csv_filepath = "../data/train_val_random_geo.csv"
data = pd.read_csv(csv_filepath)

In [None]:
# February has <= 28 days: any later day number in February is set to 28
late_february = (data['month'] == 2) & (data['day'] > 28)
data.loc[late_february, 'day'] = 28

# build the date from its parts and keep a second copy of it under
# 'observation_date'
data["date"] = pd.to_datetime(data[['month', 'day', 'year']])
data["observation_date"] = data["date"]

# grid cell of every record: 'y' feeds the latitude bucket, 'x' the longitude bucket
data['lat_bucket_id'] = data['y'].apply(lat_to_bucket_id)
data['lon_bucket_id'] = data['x'].apply(lon_to_bucket_id)

In [None]:
# summary statistics of the bucket ids; the 'min' and 'max' rows give the
# extent of the grid cells covered by the records
bucket_columns = ['lat_bucket_id', 'lon_bucket_id']
stats = data[bucket_columns].describe()
lat_min, lon_min = stats.loc['min']
lat_max, lon_max = stats.loc['max']
stats

In [None]:
# keep the same columns that add_history_parallel keeps (including 'method'),
# so this preview shows what is actually processed
data = data[['date', 'lat_bucket_id', 'lon_bucket_id', 'x', 'y', 'presence', 'method', 'year', 'month', 'day', 'clay_0.5cm_mean', 'clay_5.15cm_mean', 'sand_0.5cm_mean', 'sand_5.15cm_mean', 'silt_0.5cm_mean', 'silt_5.15cm_mean', 'observation_date']]
# index by (date, grid row, grid column)
data = data.set_index(['date', 'lat_bucket_id', 'lon_bucket_id'])
data

In [None]:
def add_history_parallel(year, data, history_days=95):
    """Append the per-day history of every temporal variable for one year.

    The rows of ``data`` whose ``year`` column equals ``year`` are indexed by
    ``(date, lat_bucket_id, lon_bucket_id)``. For each of them the values of
    every entry of ``temporal_variables`` are looked up in the
    ``nasa_noah_data`` table for the observation day and each of the
    ``history_days`` preceding days. The values are stored in columns named
    ``"<variable>_<days back>"``; a missing database row yields NaN.

    Relies on the notebook globals ``nasa_noah_data``, ``engine``,
    ``temporal_variables``, ``lat_to_bucket_id`` and ``lon_to_bucket_id``.

    Parameters
    ----------
    year : int
        Year whose records are processed.
    data : pandas.DataFrame
        Records with at least the columns selected below. The frame is not
        modified.
    history_days : int, default 95
        Number of days before the observation day to append.

    Returns
    -------
    pandas.DataFrame
        The selected records of ``year`` with the history columns appended.
    """
    index_columns = ['date', 'lat_bucket_id', 'lon_bucket_id']

    # Work on a copy of this year's rows only: the caller's frame stays
    # untouched and other years are not preprocessed needlessly.
    subset = data[data['year'] == year].copy()

    # February has <= 28 days
    subset.loc[((subset['month'] == 2) & (subset['day'] > 28)), 'day'] = 28
    subset["date"] = pd.to_datetime(subset[['month', 'day', 'year']])
    subset["observation_date"] = subset["date"]
    subset['lat_bucket_id'] = subset['y'].apply(lat_to_bucket_id)
    subset['lon_bucket_id'] = subset['x'].apply(lon_to_bucket_id)
    subset = subset[['date', 'lat_bucket_id', 'lon_bucket_id', 'x', 'y', 'presence', 'method', 'year', 'month', 'day', 'clay_0.5cm_mean', 'clay_5.15cm_mean', 'sand_0.5cm_mean', 'sand_5.15cm_mean', 'silt_0.5cm_mean', 'silt_5.15cm_mean', 'observation_date']]
    subset = subset.set_index(index_columns)
    if subset.empty:
        # nothing to look up, and min()/max() below would be undefined
        return subset

    obs_dates = subset.index.get_level_values('date')
    lat_ids = subset.index.get_level_values('lat_bucket_id')
    lon_ids = subset.index.get_level_values('lon_bucket_id')

    # Date window: from `history_days` before January 1st up to and including
    # December 31st. The bounds are datetime objects rather than
    # 'YYYY-MM-DD' strings: comparing the stored values against a bare date
    # string is a text comparison in SQLite and can exclude the boundary day.
    window_start = (pd.to_datetime(f"{year}-01-01") - pd.Timedelta(days=history_days)).to_pydatetime()
    window_end = pd.to_datetime(f"{year}-12-31").to_pydatetime()
    print(f"Year -> From: {window_start.date()}, To: {window_end.date()}")

    # Spatial window: bounding box of the grid cells that actually occur in
    # the records. Cast to int because the DB driver does not accept numpy
    # integers as parameters.
    columns = nasa_noah_data.columns
    query = nasa_noah_data.select().where(db.and_(
        columns.date >= window_start,
        columns.date <= window_end,
        columns.lat_bucket_id >= int(lat_ids.min()),
        columns.lat_bucket_id <= int(lat_ids.max()),
        columns.lon_bucket_id >= int(lon_ids.min()),
        columns.lon_bucket_id <= int(lon_ids.max()),
    ))
    query_result = pd.read_sql_query(query, engine).set_index(index_columns)
    # reindex() needs unique labels; keep the first row if the database was
    # filled more than once
    query_result = query_result[~query_result.index.duplicated()]

    history = {}
    for days in range(history_days + 1):
        # index of the same grid cells, `days` days before each observation
        shifted = pd.MultiIndex.from_arrays(
            [obs_dates - pd.Timedelta(days=days), lat_ids, lon_ids],
            names=index_columns,
        )
        subset_day_x = query_result.reindex(shifted)
        for variable in temporal_variables:
            # positional values: row i belongs to record i of `subset`
            history[f"{variable}_{days}"] = subset_day_x[variable].to_numpy()

    # attach all history columns at once instead of one insert per column
    history = pd.DataFrame(history, index=subset.index)
    return pd.concat([subset, history], axis=1)

In [None]:
# files written by the pseudo-absence generation notebook; each of them is
# processed and saved next to the input as "<name>_full.csv"
filepaths = [
    '../data/train_val_random_geo.csv',
]

for filepath in filepaths:
    data = pd.read_csv(filepath)
    unique_years = data['year'].unique()

    # one task per year, spread over 3 worker processes; every task gets the
    # complete frame and picks its own year
    worker = partial(add_history_parallel, data=data)
    with mp.Pool(3) as p:
        results = p.map(worker, unique_years)

    # stack the per-year frames and replace the index by a running number
    output = pd.concat(results).reset_index(drop=True)
    stem = os.path.splitext(filepath)[0]
    output.to_csv(f"{stem}_full.csv")