# Gets summary of images in a directory

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
import pyproj
import pickle
from shapely.geometry import box
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
sys.path.append(r"../LUCinSA_helpers")
from file_checks import *

In [3]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# Reload each parameter dict previously saved with %store and echo the values this notebook reads.
# basic_config: directories and grid cell used by the cells below.
%store -r basic_config
print("Basic Parameters: \n brdf_dir = {} \n gridCell = {} \n index_dir = {} \n out_dir = {}"
      .format(basic_config['brdf_dir'],basic_config['grid_cell'],basic_config['index_dir'],basic_config['out_dir']))
# single_output_params: 'map_years' is passed as yrs= to the year-filtered queries further down.
%store -r single_output_params
print("SingleOutputParams: \n MapYears = {}".format(single_output_params['map_years']))
# single_plot_params: 'image_type' selects the scene.info folder and is passed to the db/catalog queries.
%store -r single_plot_params
print("SinglePlotParams: \n iImageType = {}".format(single_plot_params['image_type']))

Basic Parameters: 
 brdf_dir = /home/sandbox-cel/paraguay_lc/stac/grid/003045/brdf 
 gridCell = 3045 
 index_dir = /home/downspout-cel/paraguay_lc/stac/grids/003045/brdf_ts/ms/evi2 
 out_dir = /home/klwalker/data/tmp
SingleOutputParams: 
 MapYears = [2020, 2021]
SinglePlotParams: 
 iImageType = AllRaw


In [4]:
### List every .nc image in the brdf directory as a dataframe.
# When basic_config['print_list'] is True the helper is also asked to write the list to out_dir
# (the message below names the file FileList.txt).
all_images = print_files_in_directory(
    basic_config['brdf_dir'],
    '.nc',
    print_list=basic_config['print_list'],
    out_dir=basic_config['out_dir'],
    data_source='stac',
)

if basic_config['print_list'] == True:
    # The placeholder is positional, so the value must be passed positionally;
    # passing it as out_dir=... raised IndexError.
    print('full dataframe is printed as FileList.txt in {}'.format(basic_config['out_dir']))
else:
    print('sample of dataframe: (Not printed to file. Can print by setting print_list=True in notebook_params)')
all_images.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: '/home/sandbox-cel/paraguay_lc/stac/grid/003045/brdf'

In [None]:
# Stacked bar chart: one bar per year, stacked by sensor.
per_year_counts = all_images.groupby(['yr', 'sensor']).size().unstack()
per_year_title = 'Number images processed per year for {} cell {}'.format(
    basic_config['country'], basic_config['grid_cell'])
per_year_counts.plot(kind='bar', stacked=True, figsize=(20, 5), title=per_year_title);

In [None]:
# Stacked bar chart: one bar per year-month ('yrmo'), stacked by sensor.
per_month_counts = all_images.groupby(['yrmo', 'sensor']).size().unstack()
per_month_title = 'Number images processed per month for {} cell {}'.format(
    basic_config['country'], basic_config['grid_cell'])
per_month_counts.plot(kind='bar', stacked=True, figsize=(20, 5), title=per_month_title);

## Focus on smaller range of years to see month on axis:

In [None]:
# Restrict the monthly plot to images from years before `subset_end_year` so the month labels
# on the x axis stay readable. The title is built from the same variable so it cannot drift
# away from the filter (it previously read "from 2019 on" while the filter kept years < 2011).
subset_end_year = 2011
img_subset = all_images[all_images["yr"].astype(int) < subset_end_year]
img_subset.groupby(['yrmo','sensor']).size().unstack().plot(
    kind='bar', stacked=True, figsize=(20, 5),
    title='Number images processed per month for {} cell {} before {}'.format(
        basic_config['country'], basic_config['grid_cell'], subset_end_year));

## Read full downloading info file

In [None]:
# Path to cell_processing_dl.csv (the heading above calls it the full downloading info file).
# NOTE(review): this cell only stores the path; nothing in the visible notebook reads the file.
all_processing = "/raid-cel/r/downspout-cel/paraguay_lc/cell_processing_dl.csv"

## Read scene.info file

In [None]:
# Load the pickled scene.info stored under raw_dir/<6-digit grid cell>/<lower-cased image type>.
cell_folder = '{:06d}'.format(int(basic_config['grid_cell']))
image_folder = single_plot_params['image_type'].lower()
scene_info_path = Path(basic_config['raw_dir'], cell_folder, image_folder, 'scene.info')
df = pd.read_pickle(scene_info_path)

# Row count first, then the last five rows.
print(len(df))
df.tail(n=5)

In [None]:
# Load the pickled scene.info that sits directly inside the brdf directory.
# (The original format() call also passed the grid cell, but the template had a single
# placeholder, so that argument was never used; it is dropped here.)
brdf_df = pd.read_pickle(Path(basic_config['brdf_dir']) / 'scene.info')

# Show the last 15 rows.
brdf_df.tail(n=15)

In [None]:
# Load the pickled processing.info stored under raw_dir/<6-digit grid cell>.
processing_info_path = Path(
    basic_config['raw_dir'],
    '{:06d}'.format(int(basic_config['grid_cell'])),
    'processing.info',
)
p_df = pd.read_pickle(processing_info_path)

# Show the last 50 rows.
p_df.tail(n=50)

In [None]:
# Query the image db for the configured image type, limited to the configured map years.
# Pass yrs=None instead to get all years.
# The first argument is raw_dir, matching the other get_img_list_from_db call in this
# notebook; this cell previously passed brdf_dir.
dfSlice = get_img_list_from_db(
    basic_config['raw_dir'],
    basic_config['grid_cell'],
    single_plot_params['image_type'],
    yrs=single_output_params['map_years'],
    data_source='stac',
)
dfSlice.head(5)

In [None]:
# Catalog listing for the configured image type and grid cell, limited to the map years.
CatList = get_img_list_from_cat(
    single_plot_params['image_type'],
    basic_config['grid_cell'],
    basic_config['grid_file'],
    yrs=single_output_params['map_years'],
)

In [None]:
# Compare files against the db for the configured image type, using mode 'Both',
# limited to the map years and the configured data source.
missing_files = compare_files_to_db(
    single_plot_params['image_type'],
    'Both',
    basic_config['raw_dir'],
    basic_config['grid_cell'],
    basic_config['grid_file'],
    yrs=single_output_params['map_years'],
    data_source=basic_config['data_source'],
)

## Check processing for cell

In [None]:
# Report the processing status of the configured grid cell with yrs=[2010, 2022].
# NOTE(review): whether yrs is treated as a start/end range or as a list of two years
# depends on get_cell_status, which is not shown here -- confirm.
get_cell_status(basic_config['raw_dir'],basic_config['grid_cell'],basic_config['grid_file'],yrs = [2010,2022])

## TODO -- checks still to add:
    ## Check whether brdfs have been generated
    ## if yes, check for... (left unspecified)
    ## Check coreg
      ## check non-coregistered images
    ## Check indices

In [None]:
# NOTE(review): `catdiff` is not assigned anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel. The only line that would produce it is the commented-out one
# below, which assigns to `catfiff` (typo) and uses the old camelCase config keys.
#catfiff = ComparePlanetaryHub_w_Element84 ('Sentinel', basicConfig['gridCell'],basicConfig['gridFile'],Yrs = [2010,2022])
# Per the print labels below: catdiff[0] holds images in Element84 but not Planetary,
# catdiff[1] holds images in Planetary but not Element84.
diff0 = pd.DataFrame(catdiff[0])
# Use the third "_"-separated field of column 0 as the sort key (presumably the acquisition date -- confirm).
diff0['date'] = diff0.apply(lambda x: x[0].split("_")[2], axis=1)
diff0.sort_values(by=['date'], inplace=True) 
# Same treatment for the second list.
diff1 = pd.DataFrame(catdiff[1])
diff1['date'] = diff1.apply(lambda x: x[0].split("_")[2], axis=1)
diff1.sort_values(by=['date'], inplace=True)    
# Print the count of each list followed by its date-sorted frame.
print('{} Images in Element84 but not Planetary: {}'.format(len(catdiff[0]),diff0))
print('{} Images in Planetary but not Element84: {}'.format(len(catdiff[1]),diff1))

## Check original products

In [None]:
import rich.table

# Fetch the original product for the first entry of the catalog list.
# The list is assigned as `CatList` earlier in this notebook; `catlist` was never defined.
l_orig = get_img_from_planetary_hub(CatList[0])

# One row per asset: its key and its title.
table = rich.table.Table("Asset Key", "Description")
for asset_key, asset in l_orig.assets.items():
    table.add_row(asset_key, asset.title)

table

In [None]:
# Get the url to download a single band/asset ("nir08") of the original product.
# `l_orig` is the item fetched in the "Check original products" cell; the name `img`
# used here before is not defined anywhere in the visible notebook.
url = l_orig.assets["nir08"].href
print(url)

# Get cell status from new db

In [5]:
# Query the image db under raw_dir for the configured image type, limited to the map years.
# Pass yrs=None instead to get all years.
dfSlice = get_img_list_from_db(
    basic_config['raw_dir'],
    basic_config['grid_cell'],
    single_plot_params['image_type'],
    yrs=single_output_params['map_years'],
    data_source='stac',
)

dfSlice.head(n=5)

filtering returned dataset to AllRaw...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lan['date'] = df_lan.index.map(lambda x: int(x.split('_')[3][:8]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sen['date'] = df_sen.index.map(lambda x: int(x.split('_')[2][:8]))


id,dl,beforeDB,dldate,numpix,skip,redownload,skip_reason,error,sensor,date
LE07_L2SP_229074_20200106_02_T2,,,,,True,False,L7 after l7_stop_year,,le07,20200106
LC08_L2SP_229074_20200114_02_T1,/home/sandbox-cel/paraguay_lc/stac/grid/003045...,False,2023-07-01,1890522.0,,,,,lc08,20200114
LE07_L2SP_229074_20200122_02_T1,,,,,True,False,L7 after l7_stop_year,,le07,20200122
LC08_L2SP_229074_20200130_02_T2,/home/sandbox-cel/paraguay_lc/stac/grid/003045...,False,2023-07-01,3220.0,,,,,lc08,20200130
LE07_L2SP_229074_20200207_02_T1,,,,,True,False,L7 after l7_stop_year,,le07,20200207


In [None]:
# Note: file_checks.py has some methods to get the status of a single cell (and of all cells, but that is done in notebook 5). Most were written before this db was created. Need to consolidate and finish the processing checks.

## To save an html copy of this notebook with all outputs:

In [None]:
### Run to print output as html
out_name = str(basic_config['country']+'1a_ImagesProcessed_in_Cell_'+str(basic_config['grid_cell']))
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --output=$out_name 1a_ExploreData_FileContent.ipynb