# Summary of images processed

In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
sys.path.append(r"../LUCinSA_helpers")
from file_checks import *

In [None]:
'''
PARAMETERS: modify in Notebook_settings notebook, then run that notebook and this cell to update here
DO not modify this cell
'''

# Restore the `basic_config` dict that was persisted with IPython's %store
# (per the note above, it is written by the Notebook_settings notebook).
%store -r basic_config
# Echo the restored settings so the values in effect are recorded in the notebook output.
print("Basic Parameters: \n raw_dir = {} \n smooth_dir = {} \n local_dir = {} \n yr_range = {} \n today = {}"
      .format(basic_config['raw_dir'],basic_config['smooth_dir'],basic_config['local_dir'],basic_config['yr_range'],basic_config['today']))
print("spec_indices = {} \n image_summary_path = {} \n dl_db_path = {} \n status_db_path = {}"
      .format(basic_config['spec_indices'],basic_config['image_summary_path'],basic_config['dl_db_path'],basic_config['status_db_path']))

In [None]:
## Load the summary of all files processed in the brdf directory across all processed cells,
## then chart the number of images per year, stacked by sensor.
all_images = pd.read_csv(Path(basic_config['image_summary_path']), index_col=[0])
images_by_yr_sensor = all_images.groupby(['yr','sensor']).size().unstack()
images_by_yr_sensor.plot(
    kind='bar',
    stacked=True,
    figsize=(20, 5),
    title=('Images processed for {}'.format(basic_config['country'])),
);

In [None]:
# Split the image table on its 'quality' flag and report the counts.
used = all_images.loc[all_images['quality'].eq('image_used')]
not_used = all_images.loc[all_images['quality'].eq('low_quality')]
print('of the {} images ingested, {} were used in the final map product and {} were excluded due to quality issues'.format(
    len(used)+len(not_used), len(used), len(not_used)))

# Stacked bars per year: one segment per 'quality' value, drawn in black / white with black outlines.
quality_by_yr = all_images.groupby(['yr','quality']).size().unstack()
quality_by_yr.plot(
    kind='bar',
    stacked=True,
    color=['black','white'],
    edgecolor='black',
    figsize=(20, 5),
    title=('Image processing results for {}'.format(basic_config['country'])),
);

### To create / refresh list of processed images
Note: this can take ~15 min. It can also be run from the command line (bash) as 'summarize_images_multicell'.

In [None]:
## uncomment to print new `AllFileList.csv` to local directory:
#all_files = print_files_in_multiple_directories(basic_config['raw_dir'],"brdf",'.nc',print_list=True,out_dir=basic_config['local_dir'])
## or uncomment this to create all_files in memory only (to use for quick/partial checks):
#all_files = print_files_in_multiple_directories(basic_config['raw_dir'],"brdf",'.nc',print_list=False,out_dir=None)

## To check all processed cells for missing files at download:
Note cell_processing_dl.csv is updated whenever check_log_files_dl.job is run
If this gets corrupted (e.g. if the script is run when there is no memory to save), it can be recreated by deleting the corrupted file, moving all of the dl logs from the archive folder to the run directory, and running check_log_files_dl.job again. If multiple users have downloaded files, each must do this for their downloads to be accounted for.

In [None]:
# Load the download-status table and keep only rows whose 'dl_fix_now' entry
# is neither missing nor the literal string '[]'.
dl_db = pd.read_csv(Path(basic_config['dl_db_path']), index_col=[0])
fix_entry = dl_db['dl_fix_now']
dl_fix = dl_db[(fix_entry != '[]') & fix_entry.notnull()]
print(dl_fix)

## update full processing status db

In [None]:
#Note: this takes a long time from here, but runs fast on command line
#update_cell_status_db(basic_config['status_db_path'], 'All', basic_config['raw_dir'], basic_config['smooth_dir'])

## Get full processing status

In [None]:
# Read the processing-status table (first CSV column becomes the index) and preview the first five rows.
post_db = pd.read_csv(
    basic_config['status_db_path'],
    index_col=[0],
)
post_db.head(5)

## Get missing processing steps

In [None]:
# cells without brdf processing yet (but with downloads)
# A cell is missing brdf output when its 'num_brdf' entry holds the marker string
# 'brdf step not complete' (same convention as the 'coreg step not complete' marker),
# so select rows that EQUAL the marker. sort_index() returns a new frame, which avoids
# sorting a filtered slice in place.
no_brdf = post_db[post_db['num_brdf']=='brdf step not complete'].sort_index()
print('these cells are missing brdf files')
list(no_brdf.index)

In [None]:
# cells without coreg processing yet (but with brdfs)
# Cells that have brdfs are those whose 'num_brdf' entry is NOT the
# 'brdf step not complete' marker.
yes_brdf = post_db[post_db['num_brdf']!='brdf step not complete']
# Of those, keep the cells still flagged with the coreg marker. sort_index() returns
# a new frame, which avoids sorting a filtered slice in place.
no_coreg = yes_brdf[yes_brdf['num_coreged']=='coreg step not complete'].sort_index()
print('these cells have not completed coreg (but have brdfs)')
list(no_coreg.index)

##  Get cells with all 6 ts indices complete for (YYYY-YYYY) e.g. 2000-2022

In [None]:
status_db_path = basic_config['status_db_path']
post_db = pd.read_csv(Path(status_db_path),index_col=[0])
out_path = os.path.join(basic_config['local_dir'],'Cells_with_{}_indices.csv'.format(len(basic_config['spec_indices'])))


def _row_covers_yr_range(row, col):
    """Return 1 if row[col] is a 'start-end' string spanning basic_config['yr_range'], else 0."""
    span = row[col]
    if not isinstance(span, str):
        return 0
    if int(span.split('-')[0]) > basic_config['yr_range'][0]:
        return 0
    if int(span.split('-')[1]) < basic_config['yr_range'][1]:
        return 0
    return 1


# Every column whose name contains 'index' gets a 0/1 companion column 'check_<name>'.
ts = [name for name in post_db.columns if 'index' in name]
for i in ts:
    post_db[f'check_{i}'] = post_db.apply(_row_covers_yr_range, axis=1, args=(i,))

# Count the passed checks per cell and keep cells with at least len(spec_indices) of them.
ts_checked = [name for name in post_db.columns if 'check_index' in name]
ts_sum = post_db[ts_checked].sum(axis=1)
ts_good = ts_sum[ts_sum >=len(basic_config['spec_indices'])]
ts_good.to_csv(out_path)
print('{} cells have at least {} indices from {} to {}'.format(len(ts_good),len(basic_config['spec_indices']),basic_config['yr_range'][0],basic_config['yr_range'][1]))
print('list is printed to {}'.format(out_path))

## get cells with a specific index (e.g. evi2) run but incomplete

In [None]:
post_db = pd.read_csv(basic_config['status_db_path'],index_col=[0])
# NOTE(review): the parameter printout at the top of the notebook lists 'spec_indices';
# confirm that basic_config also carries a 'spec_index' entry.
i = basic_config['spec_index']
spec_col = 'index_{}'.format(i)
out_path = os.path.join(basic_config['local_dir'],'Cells_with_{}_index_started_but_incomplete.csv'.format(i))
yr_first, yr_last = basic_config['yr_range'][0], basic_config['yr_range'][1]


def _covers_yr_range(span):
    """Return 1 if a 'start-end' year string spans yr_first..yr_last, else 0."""
    parts = span.split('-')
    return 0 if (int(parts[0]) > yr_first or int(parts[1]) < yr_last) else 1


# Keep only cells where the index has been started (non-null entry). .copy() makes this an
# independent frame so adding the 'stat' column below does not write into a slice of post_db.
post_db_evi = post_db[post_db[spec_col].notnull()].copy()
# Column-wise map (rather than row-wise apply) also behaves sensibly when no rows remain.
post_db_evi['stat'] = post_db_evi[spec_col].map(_covers_yr_range)
post_db_incomplete = post_db_evi.loc[post_db_evi['stat']==0]
incomplete = post_db_incomplete[spec_col]
incomplete.to_csv(out_path)
print('list of incomplete {} indices is printed to {}'.format(i,out_path))

In [None]:
# Run the helper over raw_dir with 'comp' and 'base4Poly6_2021_stack.tif' as its selectors, keeping the
# result in memory only (print_list=False, out_dir=None); the returned value is shown as the cell output.
# NOTE(review): presumably this lists the cells that already have the 2021 base4Poly6 composite stack --
# confirm against print_files_in_multiple_directories in file_checks.
print_files_in_multiple_directories(basic_config['raw_dir'],'comp','base4Poly6_2021_stack.tif',print_list=False,out_dir=None)

### Total number of scenes ingested
Note that a single Landsat image is broken into ~80 grid cell images (a typical Landsat scene covers ~31,000 km2, i.e. 170 km x 185 km; our grid cells are 400 km2), so the number of images ingested needs to be divided by 80 to get an estimate of the number of actual Landsat / Sentinel scenes ingested. Method above takes a little longer (~15 min), but is much more precise.

In [None]:
# Per-year 'images_ingested' columns; the derived '..._All' total is skipped so that
# re-running this cell does not add the total into itself.
images_ingested = [name for name in post_db.columns if 'images_ingested' in name and 'All' not in name]
post_db['images_ingested_All'] = post_db[images_ingested].sum(axis=1)

num_images_ingested_2022 = post_db['images_ingested_2022'].sum()
num_images_ingested = post_db['images_ingested_All'].sum()

# Divide by 80 (grid-cell images per scene, see the note above) to approximate scene counts.
print('About {} images ingested in total for 2022 single-year product'.format(num_images_ingested_2022 // 80))
print('About {} images ingested in total 2000-2022 product'.format(num_images_ingested // 80))

### Total number of scenes used 
#### Excludes scenes not coregistered or not used for other data quality issues
Note comment about cells vs. images above. Method above takes a little longer (~15 min), but is much more precise.

In [None]:
# Per-year 'images_used' columns; the derived '..._All' total is skipped so that
# re-running this cell does not add the total into itself.
images_used = [name for name in post_db.columns if 'images_used' in name and 'All' not in name]
post_db['images_used_All'] = post_db[images_used].sum(axis=1)

num_images_used_2022 = post_db['images_used_2022'].sum()
num_images_used = post_db['images_used_All'].sum()

# Divide by 80 (grid-cell images per scene, see the note on cells vs. images) to approximate scene counts.
print('About {} images used in total for 2022 single-year product'.format(num_images_used_2022 // 80))
print('About {} images used in final map product for 2000-2022'.format(num_images_used // 80))

In [None]:

# `all_files` is only created by the optional print_files_in_multiple_directories cell
# (commented out by default), so guard against it being undefined on a fresh kernel.
if 'all_files' in globals():
    # Images per year, stacked by sensor, from the freshly built file list.
    all_files.groupby(['yr','sensor']).size().unstack().plot(kind='bar', stacked=True, figsize=(20, 5),
                title=('Images processed for {}'.format(basic_config['country'])));
else:
    print("all_files is not defined: uncomment and run the 'create / refresh list of processed images' cell first to plot from a fresh file list")

## To save an html copy of this notebook with all outputs:

In [None]:
### Run to print output as html

# Output file name: basic_config['country'] + '5a_ImagesProcessed_' + basic_config['today'].
out_name = str(basic_config['country']+'5a_ImagesProcessed_'+basic_config['today'])
# Shell out to nbconvert: export 5a_SummarizeData_ImagesProcessed.ipynb to HTML in ./Outputs with the
# code inputs hidden (--no-input). IPython substitutes $out_name with the Python variable set above.
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --output=$out_name 5a_SummarizeData_ImagesProcessed.ipynb