# Finding thumbnails

In [None]:
import pandas as pd
from shutil import copytree
from shutil import rmtree
import hubmapbags
import pathlib
import json
import yaml

from pandarallel import pandarallel
# Initialize pandarallel with a progress bar and 10 workers.
# NOTE(review): no pandarallel call (e.g. parallel_apply) is visible in the
# rest of this notebook — confirm whether this initialization is still needed.
pandarallel.initialize(progress_bar=True, nb_workers=10)

from pathlib import Path
import pandas as pd
from tqdm import tqdm
from warnings import warn as warning
from datetime import datetime
from time import sleep

# Name of the HuBMAP instance this notebook targets.
instance = 'prod'
# Token handed to the hubmapbags API calls as `token=`. Left empty here;
# presumably it has to be filled in before running — TODO confirm.
token = ''

In [None]:
# Timestamp taken once; it determines the name of today's report file.
now = datetime.now()

# Working directories, relative to the current working directory.
# mkdir(parents=True, exist_ok=True) is idempotent, so no separate exists()
# check is needed and there is no window between the check and the creation.
output_directory = 'data'
Path(output_directory).mkdir(parents=True, exist_ok=True)

report_output_directory = 'reports'
Path(report_output_directory).mkdir(parents=True, exist_ok=True)

# One report per calendar day, e.g. reports/20240131.tsv. Kept as a plain str.
report_output_filename = f"{report_output_directory}/{now.strftime('%Y%m%d')}.tsv"
print(report_output_filename)

In [None]:
def is_primary( hubmap_id, instance='prod', token=None ):
    """Tell whether the first ancestor record of a dataset is a Sample.

    Calls ``hubmapbags.apis.get_ancestors_info`` and inspects only the first
    element of what it returns.

    Returns True when that element has ``entity_type == 'Sample'``. Otherwise
    returns False, after emitting the element's ``'error'`` value as a
    warning when such a key is present.
    """
    ancestors = hubmapbags.apis.get_ancestors_info( hubmap_id, instance=instance, token=token )
    first_ancestor = ancestors[0]

    # A missing 'entity_type' yields None, which never equals 'Sample'.
    if first_ancestor.get('entity_type') == 'Sample':
        return True

    if 'error' in first_ancestor:
        warning(first_ancestor['error'])
    return False
    
def has_metadata( metadata ):
    """Return True when ``metadata['ingest_metadata']['metadata']`` exists.

    Only the presence of the two keys is checked, not the stored value.
    """
    if 'ingest_metadata' not in metadata.keys():
        return False

    ingest_record = metadata['ingest_metadata']
    return 'metadata' in ingest_record.keys()

In [None]:
if not Path(report_output_filename).exists():
    # No report file with today's name yet: rebuild it from the HuBMAP APIs.
    report_columns = ['group_name','uuid','hubmap_id','status','is_protected','data_type','assay_type','assay_category','directory']

    # get assay types
    assay_names = hubmapbags.apis.get_assay_types( token=token )

    # Collect one frame per assay and concatenate once at the end instead of
    # growing `report` with pd.concat on every iteration.
    per_assay_frames = []
    for assay_name in assay_names:
        print(assay_name)
        datasets = pd.DataFrame(hubmapbags.get_hubmap_ids( assay_name=assay_name, token=token ))

        if datasets.empty:
            continue

        # clean up: keep published datasets that are not image pyramids.
        # .copy() gives an independent frame, so the column assignments below
        # never write into a slice of the unfiltered data.
        keep = (datasets['data_type'] != 'image_pyramid') & (datasets['status'] == 'Published')
        datasets = datasets[keep].copy()

        directories = []
        assay_types = []
        assay_categories = []
        for hubmap_id in tqdm(datasets['hubmap_id']):
            directories.append(hubmapbags.apis.get_directory( hubmap_id, instance=instance, token=token ))
            metadata = hubmapbags.apis.get_dataset_info( hubmap_id, instance=instance, token=token )

            # Datasets without ingest metadata (or without one of the two
            # fields) get empty assay values instead of raising KeyError.
            if has_metadata( metadata ):
                fields = metadata['ingest_metadata']['metadata'] or {}
            else:
                fields = {}
            assay_types.append(fields.get('assay_type'))
            assay_categories.append(fields.get('assay_category'))

        datasets['directory'] = directories
        datasets['assay_type'] = assay_types
        datasets['assay_category'] = assay_categories

        per_assay_frames.append(datasets)

    # ignore_index=True: every per-assay frame carries its own 0..n labels,
    # and duplicated labels would make later label-based writes hit several rows.
    if per_assay_frames:
        report = pd.concat(per_assay_frames, ignore_index=True)
    else:
        report = pd.DataFrame()

    # reindex (rather than report[[...]]) so that a column that was never
    # produced shows up empty instead of raising KeyError.
    report = report.reindex(columns=report_columns)
    report.to_csv( report_output_filename, sep='\t', index=False )
    # Swap only the file suffix; str.replace('tsv','pkl') would also rewrite
    # any 'tsv' occurring elsewhere in the path.
    report.to_pickle( Path(report_output_filename).with_suffix('.pkl') )
else:
    print('File found on disk. Loading ' + report_output_filename + '.' )
    report = pd.read_csv( report_output_filename, sep='\t' )

In [None]:
def find_thumbnail( datum ):
    """Search a dataset's directory tree for thumbnail images.

    Parameters
    ----------
    datum : mapping (for example a report row)
        Must provide the keys ``'is_protected'`` and ``'directory'``.

    Returns
    -------
    str
        ``''`` when the dataset is protected, when ``'directory'`` is not a
        usable path (``None``, NaN, empty string), or when the directory does
        not exist. Otherwise ``str()`` of the sorted list of ``Path`` objects
        matching ``**/*thumbnail.jpg`` below the directory, which is ``'[]'``
        when the directory exists but contains no match.
    """
    if datum['is_protected']:
        return ''

    directory = datum['directory']
    # A missing directory arrives as None or as a float NaN (e.g. after the
    # report was reloaded from TSV); Path() would raise TypeError on those.
    # An empty string would resolve to the current directory and glob it.
    if not isinstance(directory, (str, Path)) or directory == '':
        return ''

    root = Path(directory)
    if not root.exists():
        return ''

    # Sorted so that the result does not depend on filesystem listing order.
    return str(sorted(root.glob('**/*thumbnail.jpg')))
    
def get_thumbnail( metadata ):
    """Return ``metadata['ingest_metadata']['thumbnail_file_abs_path']``.

    Returns None when either of the two keys is absent.
    """
    if 'ingest_metadata' not in metadata.keys():
        return None

    ingest_record = metadata['ingest_metadata']
    if 'thumbnail_file_abs_path' not in ingest_record.keys():
        return None

    return ingest_record['thumbnail_file_abs_path']

# For every dataset in the report, record the thumbnail path stored in its
# ingest metadata and the thumbnail files actually found in its directory.
files = []
thumbnail_paths = []
for _, datum in tqdm(report.iterrows(), total=len(report)):
    metadata = hubmapbags.apis.get_dataset_info( datum['hubmap_id'], instance=instance, token=token )
    thumbnail_paths.append(get_thumbnail( metadata ))
    files.append(find_thumbnail( datum ))

# Assigned positionally in one step: a label-based report.loc[index, ...] write
# inside the loop would touch every row sharing that label if the index holds
# duplicates, and it would add a column to the frame while iterating over it.
report['entity.thumbnail_file_abs_path'] = thumbnail_paths

In [None]:
# Attach the directory search results collected in `files` (one entry per
# report row, in row order) as a new column.
report['files'] = files

In [None]:
# Save the report, now including the thumbnail columns, as a tab-separated file.
# NOTE(review): this rebinds report_output_filename, which until here named the
# dated report file — keep that in mind when re-running earlier cells.
report_output_filename = f'{report_output_directory}/thumbnails.tsv'
report.to_csv( path_or_buf=report_output_filename, index=False, sep='\t' )
print(report_output_filename)

In [None]:
# Keep only the columns shown in the summary table.
summary_columns = [
    'hubmap_id',
    'status',
    'is_protected',
    'data_type',
    'assay_category',
    'entity.thumbnail_file_abs_path',
    'files',
]
report = report[summary_columns]

In [None]:
# Print, as a grid-formatted markdown table, the rows whose metadata carries a
# thumbnail path.
with_thumbnail = report['entity.thumbnail_file_abs_path'].notnull()
print(report.loc[with_thumbnail].to_markdown(tablefmt="grid"))