# Gets summary of images in a directory

In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
import pyproj
import pickle
from shapely.geometry import box
import matplotlib.pyplot as plt
from PIL import Image

%matplotlib inline

In [None]:
sys.path.append(r"../LUCinSA_helpers")
from file_checks import *

In [None]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# `%store -r` is IPython's storemagic: it reloads the `basic_config` variable that was saved
# with `%store` (per the note above, by the notebook_params notebook).
# The print below echoes the settings read from it so they can be checked before running the rest.
%store -r basic_config
print("Basic Parameters: \n brdf_dir = {} \n smooth_dir = {}, gridCell = {} \n local_dir = {} \n filter_yr = {} \n yr_range = {} \n image_type = {} \n sensor_code = {}"
      .format(basic_config['brdf_dir'],basic_config['smooth_dir'],basic_config['grid_cell'], basic_config['local_dir'], basic_config['filter_yr'], basic_config['yr_range'], basic_config['image_type'], basic_config['sensor_code']))

#### To get all images in download directories: 
note: these directories may have been cleaned out already; this is only useful for internal troubleshooting

In [None]:
# Internal troubleshooting only: list the raw .tif downloads for this grid cell,
# one listing per sensor sub-folder (landsat, sentinel2).
if basic_config['purpose'] == 'troubleshoot':
    raw_dir = basic_config['raw_dir']
    cell_id = int(basic_config['grid_cell'])  # formatted below as a zero-padded six-digit folder name

    all_images_landsat = print_files_in_directory(
        Path('{}/{:06d}/landsat'.format(raw_dir, cell_id)),
        '.tif',
        print_list=basic_config['print_list'],
        out_dir=basic_config['local_dir'],
        data_source='stac',
    )
    all_images_sentinel = print_files_in_directory(
        Path('{}/{:06d}/sentinel2'.format(raw_dir, cell_id)),
        '.tif',
        print_list=basic_config['print_list'],
        out_dir=basic_config['local_dir'],
        data_source='stac',
    )

## To get all processed images in brdf directory
This is primarily for troubleshooting, prior to finalization of time series data.
These are all of the files that have been downloaded and processed, but some will be excluded from the final time series 
(For example L5, L7 and Sentinel images that fail coreg are here but not processed further).
For final accounting of files used in time series (and reasons for skipping), use methods below that utilize processing.info

In [None]:
# Troubleshooting only: list every processed .nc file in the brdf directory and preview the result.
if basic_config['purpose'] == 'troubleshoot':
    all_images_brdf = print_files_in_directory(basic_config['brdf_dir'],'.nc',print_list=basic_config['print_list'],out_dir=basic_config['local_dir'],data_source='stac')

    if basic_config['print_list'] == True:
        print('full dataframe is printed as brdf_fileList.txt in {}'.format(basic_config['local_dir']))
    else:
        print('sample of dataframe: (Not printed to file. Can print by setting printList=True in notebook_params)')
    # A bare expression nested inside an `if` is never auto-rendered by the notebook (only the
    # last top-level expression of a cell is), so the preview has to be displayed explicitly.
    # `display` is injected into the namespace by IPython/Jupyter; no import is needed.
    display(all_images_brdf.head(n=5))

In [None]:
# Troubleshooting only: stacked bar chart of image counts per year, split by sensor.
if basic_config['purpose'] == 'troubleshoot':
    # rows = year, columns = sensor, values = number of images
    per_year_by_sensor = all_images_brdf.groupby(['yr','sensor']).size().unstack()
    chart_title = 'Number images processed per year for {} cell {}'.format(basic_config['country'],basic_config['grid_cell'])
    per_year_by_sensor.plot(kind='bar', stacked=True, figsize=(20, 5), title=chart_title)

In [None]:
# Troubleshooting only: stacked bar chart of image counts per year, split by the `quality` column
# (drawn in black / white with black outlines).
if basic_config['purpose'] == 'troubleshoot':
    # rows = year, columns = quality value, values = number of images
    per_year_by_quality = all_images_brdf.groupby(['yr','quality']).size().unstack()
    chart_title = 'Number images processed per year for {} cell {}'.format(basic_config['country'],basic_config['grid_cell'])
    per_year_by_quality.plot(
        color=['black','white'],
        kind='bar',
        stacked=True,
        edgecolor='black',
        figsize=(20, 5),
        title=chart_title,
    )

In [None]:
# Troubleshooting only: stacked bar chart of image counts per `yrmo` value, split by sensor.
if basic_config['purpose'] == 'troubleshoot':
    # rows = yrmo, columns = sensor, values = number of images
    per_month_by_sensor = all_images_brdf.groupby(['yrmo','sensor']).size().unstack()
    chart_title = 'Number images processed per month for {} cell {}'.format(basic_config['country'],basic_config['grid_cell'])
    per_month_by_sensor.plot(kind='bar', stacked=True, figsize=(20, 5), title=chart_title)

## Focus on smaller range of years to see month on axis:
eg. to look for months where data did not get downloaded. 
But NOTE: missing months are not shown as gaps. TODO: spread axis over all months in range to show gaps

In [None]:
# Troubleshooting only: same per-month chart, restricted to the years in basic_config['yr_range']
# (both end years included).
if basic_config['purpose'] == 'troubleshoot':
    yr_as_int = all_images_brdf["yr"].astype(int)
    first_yr = basic_config['yr_range'][0]
    last_yr = basic_config['yr_range'][1]

    in_range = (yr_as_int >= first_yr) & (yr_as_int <= last_yr)
    img_subset = all_images_brdf[in_range]

    per_month_by_sensor = img_subset.groupby(['yrmo','sensor']).size().unstack()
    chart_title = 'Number images processed per month for {} cell {} from {} to {}'.format(basic_config['country'],basic_config['grid_cell'], first_yr, last_yr)
    per_month_by_sensor.plot(kind='bar', stacked=True, figsize=(20, 5), title=chart_title)

## Read processing.info database

In [None]:
# Load the pickled processing.info database for this grid cell from
# <raw_dir>/<six-digit cell id>/processing.info, then preview the last five rows in index order.
processing_info_path = Path('{}/{:06d}/processing.info'.format(basic_config['raw_dir'],int(basic_config['grid_cell'])))
p_df = pd.read_pickle(processing_info_path)
# Kept as the final top-level expression so the notebook renders it.
p_df.sort_index().tail(5)

#### Look for records from selected year and sensor 
(for troubleshooting)

In [None]:
# Troubleshooting only: keep the processing.info records whose index label contains both the
# selected year and the selected sensor code, and preview the last five.
if basic_config['purpose'] == 'troubleshoot':
    # Missing index labels count as "no match" (na=False).
    has_year = p_df.index.str.contains(str(basic_config['filter_yr']), na = False)
    has_sensor = p_df.index.str.contains(basic_config['sensor_code'], na = False)
    match = p_df.loc[has_year & has_sensor]
    # A bare expression nested inside an `if` is never auto-rendered by the notebook (only the
    # last top-level expression of a cell is), so the preview has to be displayed explicitly.
    # `display` is injected into the namespace by IPython/Jupyter; no import is needed.
    display(match.tail(n=5))

## get list of images that were skipped (intentionally not processed)
reason for skipping can be found in column (skip_reason)

In [None]:
# Report the records flagged skip == True: how many, the distinct skip reasons, and
# (troubleshoot mode only) their index labels.
skipped_mask = p_df['skip'] == True  # element-wise comparison on a column, so `== True` is deliberate
p_df_skip = p_df.loc[skipped_mask]
print('{} files were intentionally skipped'.format(len(p_df_skip)))
skip_reasons = p_df_skip['skip_reason'].unique().tolist()
print('reasons for skipping files: {}'.format(skip_reasons))
if basic_config['purpose'] == 'troubleshoot':
    print(p_df_skip.index.values)

## get list of images that were not downloaded due to processing errors and marked for redownload 

In [None]:
# Report the records flagged redownload == True: how many, the distinct error values, and
# (troubleshoot mode only) their index labels.
redo_mask = p_df['redownload'] == True  # element-wise comparison on a column, so `== True` is deliberate
p_df_redo = p_df.loc[redo_mask]
print('{} files were not processed due to errors and are flagged for reprocessing'.format(len(p_df_redo)))
processing_errors = p_df_redo['error'].unique().tolist()
print('errors in processing files: {}'.format(processing_errors))
if basic_config['purpose'] == 'troubleshoot':
    print(p_df_redo.index.values)

# Get cell summary from processing database

In [None]:
# Sensor filter: 'All' when every image type is requested, otherwise the lower-cased sensor code.
# It is only consumed by the optional year-slice query that is commented out at the bottom.
if basic_config['image_type'] == 'All':
    sensor = 'All'
else:
    sensor = basic_config['sensor_code'].lower()

## for all years:
df_all = get_img_list_from_db(
    basic_config['raw_dir'],
    basic_config['grid_cell'],
    'All',
    yrs=None,
    data_source='stac',
)

##for selection of years:
#df_slice = get_img_list_from_db(basic_config['raw_dir'], basic_config['grid_cell'],sensor,yrs=basic_config['yr_range'],data_source='stac')


In [None]:
# Troubleshooting only: compare the files on disk / in the catalog against the local database,
# for landsat and/or sentinel depending on the configured image type and sensor code,
# and report how many catalog images are missing from the local database.
if basic_config['purpose'] == 'troubleshoot':
    # The sensor_code test stays behind `or`, so it is only read when image_type is not 'All'.
    wants_all = basic_config['image_type'] == 'All'

    if wants_all or basic_config['sensor_code'].lower().startswith('l'):
        #catList_landsat = get_img_list_from_cat('l',basic_config['grid_cell'], basic_config['grid_file'], yrs=basic_config['yr_range'])
        #print('{} images were found in the original landsat database for {} from {} to {}'.format(basic_config['grid_file'],basic_config['yr_range'][0],basic_config['yr_range'][1]))
        #print(catList_landsat)
        landsat_gaps = compare_files_to_db(
            'l', 'both',
            basic_config['raw_dir'], basic_config['grid_cell'], basic_config['grid_file'],
            yrs=basic_config['yr_range'], data_source=basic_config['data_source'],
        )
        missing_local_l, missing_remote_l, missing_from_localdb_l = landsat_gaps
        print('{} images from landsat catalog have not been processed'.format(len(missing_from_localdb_l)))
        #print(missing_from_localdb_l)

    if wants_all or basic_config['sensor_code'].lower().startswith('s'):
        #catList_sentinel = get_img_list_from_cat('s',basic_config['grid_cell'], basic_config['grid_file'], yrs=basic_config['yr_range'])
        #print('{} images were found in the original sentinel database for {} from {} to {}'.format(basic_config['grid_file'],basic_config['yr_range'][0],basic_config['yr_range'][1]))
        #print(catList_sentinel)
        sentinel_gaps = compare_files_to_db(
            's', 'both',
            basic_config['raw_dir'], basic_config['grid_cell'], basic_config['grid_file'],
            yrs=basic_config['yr_range'], data_source=basic_config['data_source'],
        )
        missing_local_s, missing_remote_s, missing_from_localdb_s = sentinel_gaps
        print('{} images from sentinel catalog have not been processed'.format(len(missing_from_localdb_s)))
        #print(missing_from_localdb_s)

## Check processing for grid cell

In [None]:
# Run the cell-status check from file_checks for the selected grid cell.
# It returns a status object plus two figure references that the following cells print and open.
status, fig1, fig2 = get_cell_status(
    basic_config['raw_dir'],
    basic_config['smooth_dir'],
    basic_config['grid_cell'],
    None,  # NOTE(review): meaning of the 4th/5th positional arguments is defined in file_checks - confirm there
    True,
    basic_config['local_dir'],
    'stac',
)

In [None]:
# Print the status value returned by get_cell_status for the selected grid cell.
print(status)

In [None]:
# show plot of images ingested per year by sensor for selected cell
# display() renders the figure inline as cell output, so it is kept in saved / exported copies of
# this notebook; Image.show() hands a temporary copy to an external viewer and leaves no cell output.
# `display` is injected into the namespace by IPython/Jupyter; no import is needed.
image = Image.open(fig1)
display(image)

In [None]:
# show plot of processing results (images used) by year for selected cell
# display() renders the figure inline as cell output, so it is kept in saved / exported copies of
# this notebook; Image.show() hands a temporary copy to an external viewer and leaves no cell output.
# `display` is injected into the namespace by IPython/Jupyter; no import is needed.
image = Image.open(fig2)
display(image)

# extra exploration of inputs

## Read scene.info file
 (this is for internal troubleshooting. Processed items and status are better checked with processing.info checks above)

In [None]:
# Internal troubleshooting only: load the pickled scene.info files for the landsat and sentinel2
# download folders of this grid cell and for the brdf directory, then report their sizes.
# The guard reads the flag from basic_config like every other cell (a bare `purpose` name is
# not defined in this notebook and raised NameError).
if basic_config['purpose'] == 'troubleshoot':
    raw_dir = basic_config['raw_dir']
    cell_id = int(basic_config['grid_cell'])  # formatted below as a zero-padded six-digit folder name

    prelim_landsat_df = pd.read_pickle(Path('{}/{:06d}/{}/scene.info'.format(raw_dir, cell_id, 'landsat')))
    # pd.DataFrame.to_csv(prelim_landsat_df, Path('{}/{:06d}_landsatList.csv'.format(basic_config['local_dir'],int(basic_config['grid_cell']))), sep=',', na_rep='.', index=False)
    prelim_sentinel_df = pd.read_pickle(Path('{}/{:06d}/{}/scene.info'.format(raw_dir, cell_id, 'sentinel2')))
    # The brdf scene.info sits directly in brdf_dir, so no cell id is part of this path.
    prelim_brdf_df = pd.read_pickle(Path('{}/scene.info'.format(basic_config['brdf_dir'])))
    print('landsat scene.info has {} items. Sentinel scene.info has {} items. brdf scene.info has {} items.'.format(len(prelim_landsat_df),len(prelim_sentinel_df),len(prelim_brdf_df)))
    print(prelim_brdf_df)

### Check original products

In [None]:
# Disabled snippet: the code is wrapped in a string literal so none of it executes.
# When enabled it fetches one catalog item and lists its asset keys and titles in a rich table.
# NOTE(review): `catlist` is not defined in the visible cells - confirm its source before enabling.
'''
l_orig = get_img_from_planetary_hub(catlist[0])
import rich.table

table = rich.table.Table("Asset Key", "Description")
for asset_key, asset in l_orig.assets.items():
    # print(f"{asset_key:<25} - {asset.title}")
    table.add_row(asset_key, asset.title)

table
'''

In [None]:
# Disabled snippet: the code is wrapped in a string literal so none of it executes.
# When enabled it prints the href of the "nir08" asset of an item.
# NOTE(review): `img` is not defined in the visible cells; the preceding disabled cell binds its
# item to `l_orig` - confirm which name is intended before enabling.
'''
#get url to download a band/asset:
url=img.assets["nir08"].href
print(url)
'''

### explore difference between STAC catalogs

In [None]:
# Disabled snippet: the code is wrapped in a string literal so none of it executes.
# When enabled it compares two STAC catalogs for the grid cell and prints, sorted by the date
# token in the image name, the images present in one catalog but not the other.
# The result is bound to `catdiff` (the name every later line reads) and the config is read via
# basic_config['grid_cell'] / ['grid_file'], matching the rest of this notebook.
# NOTE(review): the helper name and its `Yrs` keyword are unchanged - confirm they still exist in
# file_checks before enabling.
'''
catdiff = ComparePlanetaryHub_w_Element84 ('Sentinel', basic_config['grid_cell'],basic_config['grid_file'],Yrs = [2000,2022])
diff0 = pd.DataFrame(catdiff[0])
diff0['date'] = diff0.apply(lambda x: x[0].split("_")[2], axis=1)
diff0.sort_values(by=['date'], inplace=True)
diff1 = pd.DataFrame(catdiff[1])
diff1['date'] = diff1.apply(lambda x: x[0].split("_")[2], axis=1)
diff1.sort_values(by=['date'], inplace=True)
print('{} Images in Element84 but not Planetary: {}'.format(len(catdiff[0]),diff0))
print('{} Images in Planetary but not Element84: {}'.format(len(catdiff[1]),diff1))
'''

## To save an html copy of this notebook with all outputs:

In [None]:
# Disabled export cell: the code is wrapped in a string literal so none of it executes;
# remove the enclosing triple quotes to run it.
# It builds an output name from the country and grid cell, then shells out (IPython `!` escape,
# with `$out_name` expanded by IPython) to `jupyter nbconvert`, writing an HTML copy without
# input cells into ./Outputs.
'''
### uncomment and Run to print output as html
out_name = str(basic_config['country']+'1a_ImagesProcessed_in_Cell_'+str(basic_config['grid_cell']))
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --output=$out_name 1a_ExploreData_FileContent.ipynb
'''