# Combine and clean up 33 years of Landsat NIRv data over the entire spatiotemporal domain
Last updated: Kevin Varga, 11/19/2024

**Inputs:**
* WRF land mask
* Yearly Landsat-derived NIRv GeoTIFFs at semi-monthly temporal frequency and 1 km spatial resolution <br>

**Outputs:**
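* /home/sbarc/students/varga/nasa/ch1/data/predictors/nirv.nc - NIRv
* /home/sbarc/students/varga/nasa/ch1/data/predictors/daily/nirv.nc - NIRv (same data array written a second time)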

In [1]:
import numpy as np
from pathlib import Path
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

In [2]:
def xr_landsat(da, lat, lon):
    """Clean up a NIRv GeoTIFF downloaded from the GEE algorithm and regrid it.

    Parameters
    ----------
    da : xarray.DataArray
        Raster with dimensions ``y``, ``x`` and ``band`` and a ``spatial_ref``
        coordinate. Its ``long_name`` attribute is passed to
        ``pd.to_datetime`` to build the time axis.
        NOTE(review): presumably one date string per band - confirm against
        the GEE export.
    lat, lon : array-like
        Target latitude / longitude values the raster is interpolated onto.

    Returns
    -------
    xarray.DataArray
        Array named ``'nirv'`` with dimensions ``time``, ``latitude`` and
        ``longitude``, masked to land and restricted to ``0 < nirv < 0.5``
        (everything else is NaN).

    Notes
    -----
    The land mask is read from the notebook-level ``land_mask`` variable,
    which must already be defined on the same latitude/longitude grid.
    """
    da = da.rename({'y':'latitude', 'x':'longitude', 'band':'time'})
    # Create time variable from the NIRv data array attributes; this has to
    # happen before the attributes are replaced below
    da['time'] = list(pd.to_datetime(da.attrs['long_name']))
    da.name = 'nirv'
    # Move spatial reference from variable to attributes
    da.attrs = da['spatial_ref'].attrs
    # drop_vars is the non-deprecated way to remove a coordinate variable
    da = da.drop_vars('spatial_ref')
    da.attrs.pop('long_name', None)
    # Convert NIRv grid to the target grid passed in by the caller
    # (previously the arguments were ignored in favour of notebook globals)
    da = da.interp(latitude=lat, longitude=lon)
    # Apply land mask, trimming rows/columns that are entirely masked out
    da = da.where(land_mask, drop=True)
    # Keep only 0 < NIRv < 0.5: values equal to or below zero signify water,
    # and 0.5 is the identified area threshold maximum. Everything else
    # (including existing NaN) becomes NaN.
    da = da.where((da > 0) & (da < 0.5))
    return da

In [3]:
# Function to describe an xarray data array
def describe_da(da):
    """Generate descriptive statistics for an xarray DataArray."""
    stats = {}
    #stats['count'] = da.notnull().sum().item()
    stats['mean'] = da.mean().item()
    stats['std'] = da.std().item()
    stats['min'] = da.min().item()
    stats['25%'] = da.quantile(0.25).item()
    stats['50%'] = da.quantile(0.5).item()
    stats['75%'] = da.quantile(0.75).item()
    stats['max'] = da.max().item()
    
    return stats

In [4]:
# Define paths for landsat and wrf data
# ls_path: directory that is searched for the Landsat '*.tif' files
# wrf_path: directory prefix used to open the WRF 'land_mask.nc' file
# Both end with '/' because wrf_path is joined to a file name by string concatenation
ls_path = '/home/sbarc/students/varga/nasa/ch1/data/landsat/grid/'
wrf_path = '/home/sbarc/students/varga/nasa/ch1/data/wrf/vars/'

In [5]:
# Create a sorted list of all Landsat GeoTIFF paths in ls_path.
# sorted() already returns a list, so no intermediate list() is needed.
# NOTE(review): presumably the file names sort chronologically - confirm.
ls_list = sorted(Path(ls_path).glob('*.tif'))

In [7]:
# Open WRF land-sea mask file for water grid point masking and conversion of NIRv grid to WRF grid
land_mask = xr.open_dataarray(wrf_path + 'land_mask.nc')

# Extract 1-D lat/lon vectors from the first column of XLAT and first row of XLONG
# NOTE(review): this presumes a regular grid where XLAT varies only along
# south_north and XLONG only along west_east - confirm for this WRF domain
wrf_lats = land_mask['XLAT'].values[:,0]
wrf_lons = land_mask['XLONG'].values[0,:]
# Assign lat/lon coordinates as spatial dimensions and clean up
land_mask['south_north'] = wrf_lats
land_mask['west_east'] = wrf_lons
land_mask = land_mask.rename({'south_north':'latitude', 'west_east':'longitude'})
ex_coords = ['XLAT','XLONG','XTIME']
# drop_vars replaces the deprecated DataArray.drop for removing coordinates
land_mask = land_mask.drop_vars(ex_coords)

# Create boolean mask for land (True where the mask value is >= 1)
land_mask = (land_mask >= 1)

In [7]:
# Open every NIRv file, clean it up / regrid it with xr_landsat, and collect
# the results in a list (one data array per file, in ls_list order)
data_arrays = [
    xr_landsat(xr.open_dataarray(tif_path), wrf_lats, wrf_lons)
    for tif_path in ls_list
]

# Join the per-file data arrays end to end along the time dimension
cat_da = xr.concat(data_arrays, dim='time')

In [9]:
# Regularize, crop and gap-fill the time axis in one chain; the order of the
# three steps matters and matches the original cell.
cat_da = (
    cat_da
    # 1) Put the data on a regular semi-monthly ('SMS') time axis. ffill is
    #    called with an argument of 0 so that missing semi-monthly steps are
    #    filled with nan rather than with earlier values
    .resample(time='SMS').ffill(0)
    # 2) Crop time domain to match WRF time domain
    .sel(time=slice('1987-07-01','2019-06-15'))
    # 3) Linearly interpolate nan values along time, limited by a 90 day
    #    max_gap; extrapolation (fill_value='extrapolate') is not enabled
    .interpolate_na(dim='time', method='linear', max_gap=pd.Timedelta(90, 'd'))
)

In [10]:
# Summarize the final data array: descriptive statistics from describe_da
# plus the total number of nan values, shown as the cell output
stats = {**describe_da(cat_da), 'n_nan': cat_da.isnull().sum().item()}
stats

{'mean': 0.08111506997979254,
 'std': 0.04067716991175863,
 'min': 5.78068304931122e-09,
 '25%': 0.05136389581511788,
 '50%': 0.07295980978304087,
 '75%': 0.10302659459301938,
 'max': 0.49542743660988126,
 'n_nan': 11982205}

In [11]:
# Write the same data array to netcdf in both predictor locations
for out_path in (
    '/home/sbarc/students/varga/nasa/ch1/data/predictors/nirv.nc',
    '/home/sbarc/students/varga/nasa/ch1/data/predictors/daily/nirv.nc',
):
    cat_da.to_netcdf(out_path)