# Get cell processing info

In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
import pyproj
import pickle
from shapely.geometry import box
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
sys.path.append(r"../LUCinSA_helpers")
from file_checks import *

In [None]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# `%store -r NAME` (IPython storemagic) restores a variable that was saved with `%store NAME`,
# so each dict below only exists after the notebook_params notebook has been run.
%store -r basic_config
# Echo the restored values so the active configuration is visible in the cell output.
print("basic parameters: \n brdf_dir = {} \n grid_cell = {} \n index_dir = {} \n home_dir = {}"
      .format(basic_config['brdf_dir'],basic_config['grid_cell'],basic_config['index_dir'],basic_config['home_dir']))
%store -r single_output_params
print("single_output_params: \n map_years = {}".format(single_output_params['map_years']))
%store -r single_plot_params
print("single_plot_params: \n image_type = {}".format(single_plot_params['image_type']))

In [None]:
def _list_tifs(folder):
    """Return the names of the '.tif' files in ``folder`` ([] if it does not exist)."""
    if not os.path.exists(folder):
        return []
    return [fi for fi in os.listdir(folder) if fi.endswith('.tif')]


def _download_records(folder, filenames, skip_ids=()):
    """Build ``{image_id: record}`` for downloaded files, leaving out ids in ``skip_ids``.

    The image id is the file name without its extension.
    """
    records = {}
    for fname in filenames:
        image_id = os.path.splitext(fname)[0]
        if image_id in skip_ids:
            continue
        records[image_id] = {'dl': '{}/{}'.format(folder, fname), 'beforeDB': True}
    return records


def _find_brdf_match(image_id, nc_files):
    """Return the last name in ``nc_files`` that matches ``image_id``, or None.

    Names are compared on their '_'-separated fields. Files with fewer than four
    fields (and ids with too few fields) cannot match and are skipped.
    """
    id_parts = image_id.split('_')
    match = None
    for fi in nc_files:
        fi_parts = fi.split('_')
        if len(fi_parts) < 4:
            continue
        if image_id.startswith('S'):
            if len(id_parts) >= 3 and id_parts[1] in fi_parts[2] and id_parts[2] == fi_parts[3]:
                match = fi
        elif image_id.startswith('L'):
            if (len(id_parts) >= 4 and id_parts[0] == fi_parts[1]
                    and id_parts[2] in fi_parts[2] and id_parts[3] == fi_parts[3]):
                match = fi
    return match


def reconstruct_db(processing_info_path,landsat_path,sentinel2_path,brdf_path,modified):
    '''
    This checks for an existing processing database and creates one if needed from download and brdf folders.
    All downloaded images are assumed to be in download folders (landsat and sentinel2 still). If cleaning has
    already occurred, this won't work. TODO: make option to construct from brdf folder only if cleaning has already occurred.
    Note: this will not create the numpix and coreg shift_x and shift_y values of the original db, nor any error notes,
    so best to use original database whenever possible.

    Parameters:
        processing_info_path: path of the pickled processing database (read, and written when changed)
        landsat_path: folder scanned for downloaded landsat '.tif' files
        sentinel2_path: folder scanned for downloaded sentinel2 '.tif' files
        brdf_path: folder scanned for '.nc' brdf outputs ('coreg.nc' suffix = coregistered)
        modified: pass True to force the database to be re-saved even if nothing was added here

    Returns None. If there is no database and no downloads to build one from, a message is
    printed and nothing is written.
    '''
    landsat_files = _list_tifs(landsat_path)
    sentinel2_files = _list_tifs(sentinel2_path)
    num_landsat_files = len(landsat_files)
    num_sentinel2_files = len(sentinel2_files)

    if num_landsat_files + num_sentinel2_files == 0:
        print('no (new)images have been downloaded')
    elif not os.path.isfile(processing_info_path):
        ## Make new processing db if it does not already exist:
        processing_dict = _download_records(landsat_path, landsat_files)
        processing_dict.update(_download_records(sentinel2_path, sentinel2_files))
        new_processing_info = pd.DataFrame.from_dict(processing_dict,orient='index')
        # NOTE(review): axis=1 labels the *columns* axis 'id'; the index (which holds the image ids)
        # stays unnamed. Left as is because other cells rely on reset_index() producing an 'index' column.
        new_processing_info.rename_axis('id', axis=1, inplace=True)
        pd.to_pickle(new_processing_info, processing_info_path)
        print(f'{len(new_processing_info)} images downloaded and added to database.')

    if not os.path.isfile(processing_info_path):
        # Nothing on disk and nothing to build from: stop instead of failing in read_pickle.
        print('no processing database exists and there are no downloads to build one from')
        return

    # NOTE: read_pickle executes pickled code; only open databases written by this pipeline.
    processing_db = pd.read_pickle(processing_info_path)
    ## to fix issues from older version of db already created for some cells:
    if 'id' not in processing_db:
        processing_db.rename_axis('id', axis=1, inplace=True)

    print(f'{len(processing_db)} records in db. {num_landsat_files} landsat and {num_sentinel2_files} sentinel images in downloads.')

    if len(processing_db) >= num_landsat_files + num_sentinel2_files:
        print('all downloaded images have probably been added to db already')
    else:
        print('adding images to db...')
        # Image ids live in the index (older dbs may also carry them in an 'id' column).
        known_ids = set(processing_db.index)
        if 'id' in processing_db:
            known_ids.update(processing_db['id'])
        new_dls = _download_records(landsat_path, landsat_files, known_ids)
        new_dls.update(_download_records(sentinel2_path, sentinel2_files, known_ids))

        if new_dls:
            new_dl_db = pd.DataFrame.from_dict(new_dls,orient='index')
            new_dl_db.rename_axis('id', axis=1, inplace=True)
            # concat returns a new frame; the result must be kept for the rows to be added.
            processing_db = pd.concat([processing_db, new_dl_db])
            modified = True

    if os.path.exists(brdf_path):
        # List the folder once; every row is matched against the same set of files.
        nc_files = [fi for fi in os.listdir(brdf_path) if fi.endswith('.nc')]
        num_rows = len(processing_db)

        if 'brdf' in processing_db:
            print('brdf data already in database')
        else:
            print('adding brdf info to db...')
            # Columns that receive strings/booleans start as object dtype so the
            # per-row assignments below do not have to change a float column's dtype.
            processing_db['brdf_id'] = np.full(num_rows, np.nan, dtype=object)
            processing_db['brdf_error'] = np.full(num_rows, np.nan, dtype=object)
            processing_db['brdf'] = np.nan
            processing_db['bandpass'] = np.full(num_rows, np.nan, dtype=object)
            for idx in processing_db.index:
                match = _find_brdf_match(idx, nc_files)
                if match is None:
                    continue
                processing_db.at[idx,'brdf_id'] = match
                level = match.split('_')[0]
                if level == 'L3B':
                    processing_db.at[idx,'bandpass'] = True
                elif level == 'L3A':
                    processing_db.at[idx,'bandpass'] = False

            modified = True

        num_coreged_files = len([fi for fi in nc_files if fi.endswith('coreg.nc')])
        print(f'{num_coreged_files} images have been coreged')
        if num_coreged_files == 0:
            print('coregistration has not yet occured. Processing database is up to date')
        elif 'shift_x' in processing_db:
            print('coreg data has already been added to database')
        else:
            print('adding coreg info to db...')
            processing_db['coreg'] = np.full(num_rows, np.nan, dtype=object)
            processing_db['shift_x'] = np.nan
            processing_db['shift_y'] = np.nan
            processing_db['coreg_error'] = np.full(num_rows, np.nan, dtype=object)
            for idx in processing_db.index:
                ## right now we are only coreging Sentinel, so landsat rows always end up coreg=False
                match = _find_brdf_match(idx, nc_files)
                if match is None:
                    continue
                if 'coreg' in match:
                    processing_db.at[idx,'coreg'] = True
                else:
                    processing_db.at[idx,'coreg'] = False
                    if idx.startswith('S'):
                        processing_db.at[idx,'coreg_error'] = 'unknown'

            modified = True

    else:
        print('brdfs have not yet been created. Processing database is up to date')

    if modified:
        pd.to_pickle(processing_db, processing_info_path)
        print('saving new database')

In [None]:
# This whole cell is a string literal, i.e. the batch-rebuild loop below is disabled.
# NOTE(review) before re-enabling:
#   - it reads `basicConfig[...]`, while the rest of this notebook uses `basic_config`;
#   - the second branch tests for a 'bdrf_id' column but then reads 'brdf_id';
#   - `reconstructed_dbs` / `deleted_dbs` are (re)created inside the loop and only when a db
#     file exists, so the final prints can fail or show only the last cell.
'''
for cellid in range(3871,3872):
    processing_info_path = Path('{}/{:06d}/processing.info'.format(basicConfig['raw_dir'],cellid))
    landsat_path = Path('{}/{:06d}/landsat'.format(basicConfig['raw_dir'],cellid))
    sentinel2_path = Path('{}/{:06d}/sentinel2'.format(basicConfig['raw_dir'],cellid))
    brdf_path = Path('{}/{:06d}/brdf'.format(basicConfig['raw_dir'],cellid))

    print('processing {}...'.format(cellid))
    if not os.path.exists(landsat_path):
        continue
    if processing_info_path.is_file():
        reconstructed_dbs = []
        deleted_dbs = []
        processing_db = pd.read_pickle(processing_info_path)
        if 'shift_x' in processing_db:
            print ('already has db with shift x')
            if len(processing_db['brdf_id'].unique()) < 10:
                print('this db was created without unique brdf ids')
                processing_db.drop(['brdf','bandpass','brdf_error','brdf_id','coreg','shift_x','shift_y','coreg_error'], axis=1, inplace=True)
                pd.to_pickle(processing_db, processing_info_path)
                reconstructed_dbs.append(cellid)
        elif 'numpix' in processing_db and 'bdrf_id' in processing_db:
            if len(processing_db['brdf_id'].unique()) < 10:
                print('this db was created without unique brdf ids')
                processing_db.drop(['brdf','bandpass','brdf_error','brdf_id'], axis=1, inplace=True)
                pd.to_pickle(processing_db, processing_info_path)
                reconstructed_dbs.append(cellid)
        elif 'numpix' not in processing_db:
            print('deleting existing db')
            processing_info_path.unlink()
            deleted_dbs.append(cellid)
    else:
        print('no existing database. making new database')
        
    reconstruct_db(processing_info_path,landsat_path,sentinel2_path,brdf_path,modified=False)
print('restructured dbs:{}'.format(reconstructed_dbs))
print('deleted dbs:{}'.format(deleted_dbs))
'''

In [None]:
# Folder holding this grid cell's raw downloads and its processing database.
cell_dir = Path('{}/{:06d}'.format(basic_config['raw_dir'], int(basic_config['grid_cell'])))
processing_info_path = cell_dir / 'processing.info'
landsat_path = cell_dir / 'landsat'
sentinel2_path = cell_dir / 'sentinel2'
brdf_path = Path(basic_config['brdf_dir'])
modified = False

# Create or update the database on disk, then reload it to inspect the latest records.
reconstruct_db(processing_info_path, landsat_path, sentinel2_path, brdf_path, modified=False)
processing_db = pd.read_pickle(processing_info_path)
#processing_db.set_index('id', drop=True, inplace=True)
#processing_db.drop(['brdf','bandpass','brdf_id','coreg','brdf_error','shift_x','shift_y','coreg_error'], axis=1, inplace=True)
#pd.to_pickle(processing_db, processing_info_path)
processing_db.tail(n=10)

In [None]:
# Load the pickled scene.info table stored in the brdf folder and show its last rows.
scene_info_path = brdf_path / 'scene.info'
brdf_db = pd.read_pickle(scene_info_path)
brdf_db.tail(n=10)

In [None]:
##View processing errors
# Two kinds of problem rows: images flagged for redownload, and images with a recorded brdf error.
redownload_mask = processing_db['redownload']==True
brdf_error_mask = processing_db['brdf_error'].notnull()
processing_errors1 = processing_db[redownload_mask]
processing_errors2 = processing_db[brdf_error_mask]
# Select with the combined mask (instead of concatenating the two subsets) so an image
# that is in both groups is listed and counted only once.
processing_errors = processing_db[redownload_mask | brdf_error_mask]
print('of the {} images available, {} were not processed due to errors'.format(processing_db.shape[0],processing_errors.shape[0]))
processing_errors

In [None]:
##View brdf status
processed0 = processing_db[processing_db['skip']!=True]
processed = processed0[processed0['redownload']!=True]
no_brdf = processed[processed['brdf']==False | processed['brdf'].isnull()]
print('of the {} images processed, {} do not have brdf calculations'.format(processed.shape[0],no_brdf.shape[0]))

In [None]:
##View coreg status:
processed_sentinel = processed[processed.index.str.startswith('S')]
creg_sentinel = processed_sentinel[processed_sentinel['coreg']==True]
print('of the {} Sentinel images, {} were coreged'.format(processed_sentinel.shape[0],creg_sentinel.shape[0]))
avg_x_shift = creg_sentinel['shift_x'].mean()
avg_y_shift = creg_sentinel['shift_y'].mean()
med_x_shift = creg_sentinel['shift_x'].median()
med_y_shift = creg_sentinel['shift_y'].median()
print ('shift x: avg:{}, med:{}. shift y: avg:{}, med:{}'.format(avg_x_shift, avg_y_shift, med_x_shift, med_y_shift))

In [None]:
###To get all images in brdf directory:
# List the '.nc' files in the brdf directory (print_files_in_directory comes from file_checks).
all_images = print_files_in_directory(basic_config['brdf_dir'],'.nc',print_list=basic_config['print_list'],out_dir=basic_config['home_dir'],data_source='stac')

if basic_config['print_list'] == True:
    # The placeholder is positional, so the value must be passed positionally
    # (a keyword argument here raises IndexError).
    print('full dataframe is printed as FileList.txt in {}'.format(basic_config['home_dir']))
else:
    print('sample of dataframe: (Not printed to file. Can print by setting print_list=True in notebook_params)')
all_images.head(n=5)

## Read processing.info file and compute coreg shift per image

In [None]:
import math

# Reload this cell's processing database with the image id exposed as the 'index' column.
p_df = pd.read_pickle(
    Path('{}/{:06d}/processing.info'.format(basic_config['raw_dir'], int(basic_config['grid_cell'])))
).reset_index()

# Sensor code = first '_'-separated field of the image id.
p_df['sensor'] = [image_id.split('_')[0] for image_id in p_df['index']]
# Magnitude of the (shift_x, shift_y) offset for each image.
p_df['shift'] = [
    math.sqrt(math.pow(dx, 2) + math.pow(dy, 2))
    for dx, dy in zip(p_df['shift_x'], p_df['shift_y'])
]
p_df.set_index('index', inplace=True, drop=True)

#p_df5 = p_df[p_df['sensor']=='LT05']
p_df7 = p_df[p_df['sensor'] == 'LE07']
p_df7.head(n=50)

# Get cell status from new db

In [None]:
##for all years:
# Positional arguments shared by both queries: raw directory, grid cell, image type.
img_query_args = (basic_config['raw_dir'], basic_config['grid_cell'], single_plot_params['image_type'])

##for all years:
df_all = get_img_list_from_db(*img_query_args, yrs=None, data_source='stac')
##for selection of years:
df_slice = get_img_list_from_db(*img_query_args, yrs=single_output_params['map_years'], data_source='stac')

df_slice.head(n=5)

In [None]:
# Second positional argument passed to get_cell_status (a hard-coded grids folder).
grid_dir = '/home/downspout-cel/paraguay_lc/stac/grids'
status = get_cell_status(
    basic_config['raw_dir'],
    grid_dir,
    basic_config['grid_cell'],
    yrs=None,
    data_source='stac',
)
print(status)

In [None]:
# Presumably the CSV that tracks per-cell download status — TODO confirm.
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
dl_status_db_path = '/home/downspout-cel/paraguay_lc/cell_processing_dl.csv'

In [None]:
# CSV path handed to update_cell_status_db in the call below, which is currently commented out.
status_db_path = '/home/downspout-cel/paraguay_lc/cell_processing_post.csv'
#update_cell_status_db(status_db_path, range(4050,4101), basic_config['raw_dir'], '/home/downspout-cel/paraguay_lc/stac/grids', yrs=None,data_source='stac')

## To save an html copy of this notebook with all outputs:

In [None]:
### Run to print output as html
outName = str(basic_config['country']+'0_check_and_fill_db_'+str(basic_config['grid_cell']))
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --output=$outName 0_check_and_fill_db.ipynb