In [None]:
import my_funs 
import numpy as np 
import xarray as xr 
import dask
from dask.diagnostics import ProgressBar
import glob
import os

In [None]:
from bs4 import BeautifulSoup
import requests
import re
def get_filenames(host, name_key, timeout=60):
    """ Get all the link names of files from a NOAA repo directory listing
    Argument:
    host:: URL of the NOAA repo page to scrape
    name_key:: a regular-expression keyword common in all file names (e.g. AVHRR)
    timeout:: seconds to wait for the server before giving up (default 60)
    Returns:
    list of the href strings, in page order, of every <a> tag whose href
    matches name_key
    Raises:
    requests.exceptions.RequestException:: if the request times out, the
    connection fails, or the server answers with an HTTP error status
    """
    # Without a timeout a stalled server would block the notebook indefinitely
    req = requests.get(host, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into a bogus list
    req.raise_for_status()
    soup = BeautifulSoup(req.text, features="html.parser")
    pattern = re.compile(name_key)
    return [link.get('href') for link in soup.find_all("a", href=pattern)]

In [None]:
# Download data LAI/FPAR

# Names of the files already present in the clipped folder. The first 8
# characters of every base name are dropped (presumably a "clipped_" prefix
# -- TODO confirm) so the names can be compared with the links on the server.
# A set gives constant-time membership tests inside the loop below.
f_exist = glob.glob("/home/hamid/NASA_ABoVE/greeness/data/raw_data/noaa_cdr/lai_fpar/clipped/*")
already_downloaded = {os.path.basename(f)[8:] for f in f_exist}
print(len(already_downloaded))

data_dir = "/home/hamid/NASA_ABoVE/greeness/working/data/"
# np.arange excludes the upper bound, so this covers 1984 through 2013
years = np.arange(1984,2014)
shp_file = data_dir + 'shp_files/CoreDomain_geographic.shp'
tasks = []
for year in years:

    # host of the data
    host = 'https://www.ncei.noaa.gov/data/avhrr-land-leaf-area-index-and-fapar/access/' + str(
        year) + '/'
    # Get all file names in that year directory
    filenames = get_filenames(host=host, name_key="AVHRR")

    for file_name in filenames:
        # Skip files that were fetched in a previous run (same rule as the NDVI cell)
        if file_name in already_downloaded:
            continue
        # NOTE(review): clip_noaa_parallel presumably returns a lazy dask task
        # (the results are later passed to dask.compute) -- confirm in my_funs
        tmp = my_funs.clip_noaa_parallel(file_name, host, shp_file, data_dir+'raw_data/noaa_cdr/lai_fpar/','lai')
        tasks.append(tmp)
print(len(tasks))

# Uncomment to run the queued tasks and merge the results into one zarr store
# with ProgressBar():
#     dask.compute(*tasks)
# ds = xr.open_mfdataset(data_dir+'raw_data/noaa_cdr/lai_fpar/*.nc')
# ds.to_zarr(data_dir+'processed_data/noaa_nc/lai_fpar/noaa_lia_fpar_clipped_raw.zarr')

In [None]:
# Download data NDVI

# Base names of the files found in the clipped folder, with their first 8
# characters removed so they can be compared with the server's link names
# (presumably this strips a "clipped_" prefix -- TODO confirm).
f_exist = glob.glob("/home/hamid/NASA_ABoVE/greeness/data/raw_data/noaa_cdr/ndvi/clipped/*")
already_downloaded = [os.path.basename(path)[8:] for path in f_exist]
print(len(already_downloaded))

data_dir = "/home/hamid/NASA_ABoVE/greeness/working/data/"
shp_file = data_dir + 'shp_files/CoreDomain_geographic.shp'
# np.arange excludes the upper bound, so this covers 1984 through 2013
years = np.arange(1984,2014)
tasks = []
for year in years:
    # Yearly directory listing on the NOAA server
    host = 'https://www.ncei.noaa.gov/data/land-normalized-difference-vegetation-index/access/' + str(year) + '/'
    filenames = my_funs.get_filenames(host=host, name_key="AVHRR")

    # Queue one download-and-clip task per listed file that is not on disk yet
    for file_name in filenames:
        if file_name not in already_downloaded:
            tasks.append(
                my_funs.clip_noaa_parallel(file_name, host, shp_file, data_dir+'raw_data/noaa_cdr/ndvi/','ndvi')
            )

print(len(tasks))
# Uncomment to compute the lazy objects and merge the results into one zarr store
# with ProgressBar():
#     dask.compute(*tasks)
# ds = xr.open_mfdataset(data_dir+'raw_data/noaa_cdr/ndvi/*.nc')
# ds.to_zarr(data_dir+'processed_data/noaa_nc/ndvi/noaa_ndvi_clipped_raw.zarr')

In [None]:
# Download data Reflectance

# Names of the files already present in the output folder. The first 8
# characters of every base name are dropped (presumably a "clipped_" prefix
# -- TODO confirm) so the names can be compared with the links on the server.
# A set gives constant-time membership tests inside the loop below.
f_exist = glob.glob("/home/hamid/NASA_ABoVE/greeness/working/data/raw_data/noaa_cdr/reflectance/*")
already_downloaded = {os.path.basename(f)[8:] for f in f_exist}
print(len(already_downloaded))

data_dir = "/home/hamid/NASA_ABoVE/greeness/working/data/"
# np.arange excludes the upper bound, so this covers 1984 through 2013
years = np.arange(1984,2014)
shp_file = data_dir + 'shp_files/CoreDomain_geographic.shp'
tasks = []
for year in years:
    # host of the data
    host = 'https://www.ncei.noaa.gov/data/land-surface-reflectance/access/' + str(
        year) + '/'
    # Get all file names in that year directory
    filenames = my_funs.get_filenames(host=host, name_key="AVHRR")
    # Lazy download and clip each file and then remove the original global file
    for file_name in filenames:
        # Skip files that were fetched in a previous run (same rule as the NDVI cell)
        if file_name in already_downloaded:
            continue
        tmp = my_funs.clip_noaa_parallel(file_name, host, shp_file, data_dir+'raw_data/noaa_cdr/reflectance/','reflectance')
        tasks.append(tmp)
print(len(tasks))
# Compute the lazy objects, showing a progress bar while they run
with ProgressBar():
    dask.compute(*tasks)
# Open every clipped NetCDF file as one dataset and write it to a zarr store
ds = xr.open_mfdataset(data_dir+'raw_data/noaa_cdr/reflectance/*.nc')
ds.to_zarr(data_dir+'processed_data/noaa_nc/reflectance/noaa_reflectance_clipped_raw.zarr')