In [None]:
import pandas as pd
import numpy as np
import re
import json
from string import hexdigits
from pprint import pprint

## Inputs:
* big_sheet is the standard spreadsheet of unpublished datasets generated by new_dataset_survey.py.
* multi_file is a text file exported from the shared doc "Multi-dataset run JSON strings"
* published_dataset_uuid_list is a csv file containing the column 'uuid', with the uuids of all existing published datasets.

In [None]:
# Path of the unpublished-datasets sheet; it is read as tab-separated text.
# NOTE(review): all three paths are absolute and machine-specific — adjust before running elsewhere.
big_sheet = "/tmp/unpublished_datasets.tsv"

# Path of the text export holding the multi-dataset run JSON strings.
multi_file = '/home/welling/Downloads/Multi-dataset run JSON strings.txt'

# Path of the CSV whose 'uuid' column lists the already published datasets.
published_dataset_uuid_list = "/home/welling/Downloads/unique_published_dataset_uuids.csv"

## some utilities

In [None]:
def is_uuid(s):
    """Return True when s is a string of exactly 32 hexadecimal digits.

    Anything that is not a str (None, NaN, numbers, ...) yields False rather
    than raising TypeError from len(), so the check can be applied directly
    to values of unknown type.
    """
    return isinstance(s, str) and len(s) == 32 and all(c in hexdigits for c in s)
def is_hubmap_id(s):
    """Return True when s is a 15-character string starting with 'HBM'.

    Anything that is not a str (None, NaN, numbers, ...) yields False rather
    than raising TypeError from len(), so values taken straight out of
    parsed JSON can be tested safely.
    """
    return isinstance(s, str) and len(s) == 15 and s.startswith('HBM')

## Process the big table of unpublished datasets

The interesting products of this are:
* **qa_df_short** : all QA datasets
* **qa_primary** : all QA primary datasets
* **qa_derived** : all QA derived datasets
* **matched_groups_df** : each row contains a primary dataset and its single derived child
* **qa_datasets_set** : a set containing the uuids of all datasets in QA

In [None]:
# Load the whole sheet (tab-separated) and keep only the rows whose status is 'QA'.
big_df = pd.read_csv(big_sheet, sep='\t')
qa_df = big_df.loc[big_df['status'] == 'QA']

# Narrow the QA frame by dropping the listed columns.
qa_df_short = qa_df.drop(columns=[
    'donor_hubmap_id', 'donor_submission_id', 'donor_uuid', 'organ',
    'provider_experiment_id', 'sample_hubmap_id', 'sample_submission_id',
    'status', 'has_data', 'has_metadata', 'n_md_recs', 'validated',
    'last_touch',
])

# Split on the is_derived flag: False -> primary datasets, True -> derived datasets.
qa_primary = qa_df_short.loc[qa_df_short['is_derived'] == False]
qa_derived = qa_df_short.loc[qa_df_short['is_derived'] == True]

# uuids of every dataset in QA, primary and derived alike.
qa_datasets_set = set(qa_df_short.loc[:, 'uuid'])

In [None]:
# Pair every QA primary dataset with the QA derived datasets that name it as
# their parent. Columns present in both frames come back with the '_x' suffix
# for the primary side and '_y' for the derived side.
thing = pd.merge(left=qa_primary, right=qa_derived, how='left',
                 left_on='uuid', right_on='parent_dataset')

# The left merge keeps primaries that have no derived child; those rows carry
# NaN in uuid_y and are discarded here.
thing = thing.dropna(subset=['uuid_y'])

# Split the pairs on whether parent and child report the same group. A boolean
# mask is used instead of rebuilding frames row by row: it keeps the column
# layout even when no rows match, so the drop() below cannot fail with
# KeyError on an empty result.
group_mismatch = thing['group_name_x'] != thing['group_name_y']
mismatched_groups_df = thing[group_mismatch]
assert len(mismatched_groups_df) == 0, \
    f"{len(mismatched_groups_df)} primary/derived pairs disagree on group_name"
matched_groups_df = thing[~group_mismatch]

# Remove the merge columns that are redundant for matched pairs (each label
# listed once).
matched_groups_df = matched_groups_df.drop(columns=['is_derived_x', 'is_derived_y',
                                                    'parent_dataset_x', 'parent_dataset_y',
                                                    'group_name_y'])

At this point we need to manually check that each of the left-hand uuids (the primary datasets) has only 1 derived dataset.  If there are 2, it would imply that a run meant to be discarded was not properly cleaned up.

In [None]:
# Group the matched pairs by primary uuid and list every primary whose group
# does not hold exactly one row.
groups = matched_groups_df.groupby('uuid_x')

print("Primaries with more than one derived:")
for key, group in groups:
    n_rows = len(group)
    if n_rows == 1:
        continue
    print(key, n_rows)
print("(end of list)")

Manually inspect the notes for the matched pairs for signs that the derived dataset should not be published

In [None]:
# Show "uuid: note" for every matched pair whose primary dataset carries a note.
for idx, row in matched_groups_df.iterrows():
    primary_note = row['note_x']
    if pd.isna(primary_note):
        continue
    print(f"{row['uuid_x']}: {primary_note}")

## Parse the csv of Published dataset uuids
The product here is published_datasets_set, a set containing all Published uuids

In [None]:
# Read the CSV of published datasets and gather its 'uuid' column into a set
# for membership tests.
published_datasets_df = pd.read_csv(published_dataset_uuid_list)

published_datasets_set = set(published_datasets_df.loc[:, 'uuid'])

## The next block contains a parser for the Multiple Dataset Runs table

In [None]:
def create_rec_generator(fname):
    """Lazily yield the lines of the text file fname, one per iteration.

    Lines are passed through exactly as the file iterator produces them
    (trailing newline included). The file stays open while the generator is
    being consumed and is closed when the generator finishes or is closed.
    """
    with open(fname) as handle:
        yield from handle

class Tokenizer():
    """Whitespace tokenizer over the lines of a text file, with pushback.

    Pending tokens live on a stack (tok_l) whose last element is the next
    token to hand out. rec_num counts the lines read so far.
    """

    def __init__(self, fname):
        self.rec_gen = create_rec_generator(fname)
        self.rec_num = 0
        self.tok_l = []

    def next_line(self):
        """Read one more line and stack its tokens so they pop in reading order.

        Lets StopIteration propagate when the input is exhausted; rec_num is
        only advanced after a line was actually read.
        """
        line = next(self.rec_gen)
        self.tok_l.extend(line.split()[::-1])
        self.rec_num += 1

    def get(self):
        """Return the next token, or None once the input has run out."""
        while not self.tok_l:
            try:
                self.next_line()
            except StopIteration:
                return None
        return self.tok_l.pop()

    def pushback(self, tok):
        """Put tok back so the next get() returns it; empty strings are ignored."""
        if tok == '':
            return
        self.tok_l.append(tok)

# tokenizer test
#tzr = Tokenizer(multi_file)
#while True:
#    tok = tzr.get()
#    if tok is None:
#        break

# Scan the token stream for records of the form
#     { ...json... }  [ -> <output uuid> ]
# and collect them in `hits` as (json_string, output_uuid_or_None) tuples.
#
# States:
#   empty       - outside any record, waiting for a token containing '{'
#   in_json     - accumulating tokens until one containing '}' closes the record
#   after_json  - record closed; an arrow token may follow
#   after_arrow - arrow seen; a 32-hex-digit uuid is expected next
#
# Only flat (non-nested) JSON objects are recognised: the first '}' closes
# the record.
hits = []
state = 'empty'
current_json = ""
tzr = Tokenizer(multi_file)
while True:
    tok = tzr.get()
    if tok is None:
        break
    elif state == 'empty':
        if '{' in tok:
            # Text in front of the brace is not part of the record and is
            # discarded; everything behind it is re-queued as JSON content.
            _, remainder = tok.split('{', 1)
            current_json = "{"
            state = 'in_json'
            tzr.pushback(remainder)
    elif state == 'in_json':
        if '}' in tok:
            # Close the record at the first '}' and re-queue whatever follows
            # it (for example an arrow glued to the brace).
            body, remainder = tok.split('}', 1)
            current_json = current_json + ' ' + body + " }"
            state = 'after_json'
            tzr.pushback(remainder)
        else:
            current_json = f"{current_json} {tok}"
    elif state == 'after_json':
        if '-' in tok and '>' in tok:
            state = 'after_arrow'
            # Keep anything written directly behind the arrow ("->uuid").
            tzr.pushback(tok.split('>', 1)[1])
        else:
            # No arrow: the record has no output. Re-queue the token so that a
            # record starting right here is not skipped.
            hits.append((current_json, None))
            state = 'empty'
            tzr.pushback(tok)
    elif state == 'after_arrow':
        if is_uuid(tok):
            hits.append((current_json, tok))
            state = 'empty'
        else:
            # Dangling arrow: record the JSON without an output and let the
            # 'empty' state look at this token again (it may open a new record).
            hits.append((current_json, None))
            state = 'empty'
            tzr.pushback(tok)

# End of input: a record that was still pending must not be lost. A record cut
# off inside the JSON is kept too; it will not parse and is therefore reported
# wherever the JSON strings in `hits` are decoded.
if state in ('in_json', 'after_json', 'after_arrow'):
    hits.append((current_json, None))
    state = 'empty'
              
#    if tzr.rec_num > 100:
#        break

#pprint(hits)

## Sort the multi-dataset run records into categories
The uuids used in the multi-dataset runs are sorted into the following groups:
* input and output uuids that have already been published
* input uuids that are in QA and appear ready to publish
* output uuids that are in QA and appear ready to publish
* input uuids that are actually HuBMAP id strings and thus cannot be handled by this script
* anomalies, which contains everything else, including un-parsable JSON.

The list of anomalies must be examined manually. Each is a tuple containing the uuid (or the raw JSON string, when the record could not be parsed), "output" if the uuid was a run output, and the ordinal number of the multi-dataset run record from which it came.

In [None]:
# Sort every uuid mentioned by the multi-dataset run records into one of the
# lists below. elt_counter is the ordinal of the record within `hits`.
already_published_l = []
ready_l = []
ready_outputs_l = []
input_hubmap_ids = []
anomalies = []
for elt_counter, (json_s, output_uuid) in enumerate(hits):
    # Decode the record and pull out its input list. Unparsable JSON, a JSON
    # value that is not an object, or a missing / non-list 'uuid_list' all
    # make the whole record one anomaly instead of raising.
    try:
        dct = json.loads(json_s)
    except json.JSONDecodeError:
        dct = None
    uuid_list = dct.get('uuid_list') if isinstance(dct, dict) else None
    if not isinstance(uuid_list, list):
        anomalies.append((json_s, elt_counter))
        uuid_list = []

    for uuid in uuid_list:
        if not isinstance(uuid, str):
            # Numbers, nulls or nested values cannot be dataset ids.
            anomalies.append((uuid, elt_counter))
        elif uuid in published_datasets_set:
            already_published_l.append(uuid)
        elif uuid in qa_datasets_set:
            ready_l.append(uuid)
        elif is_hubmap_id(uuid):
            input_hubmap_ids.append((uuid, elt_counter))
        else:
            anomalies.append((uuid, elt_counter))

    # The run output, when the record names one.
    if output_uuid is None:
        pass
    elif output_uuid in published_datasets_set:
        already_published_l.append(output_uuid)
    elif output_uuid in qa_datasets_set:
        ready_outputs_l.append(output_uuid)
    else:
        anomalies.append((output_uuid, "output", elt_counter))

print(f"input_hubmap_ids: {input_hubmap_ids}")
print('ANOMALIES:')
for elt in anomalies:
    print(elt)
print('end anomalies')

            

## A utility function to assemble the record which will ultimately appear in the tsv file of publishable datasets

In [None]:
def get_uuid_record_dict(uuid, note=None):
    """Build the output record for the QA dataset identified by uuid.

    Looks the uuid up in qa_df_short, where exactly one row must match
    (otherwise the assertion fails), and returns a dict with that row's
    group_name, data_types, uuid and hubmap_id plus a 'note' entry holding
    note, or the empty string when note is None.
    """
    matches = qa_df_short.loc[qa_df_short['uuid'] == uuid]
    assert len(matches) == 1
    source = matches.iloc[0]
    record = {key: source[key]
              for key in ('group_name', 'data_types', 'uuid', 'hubmap_id')}
    record['note'] = note if note is not None else ''
    return record
#dct = get_uuid_record_dict(ready_l[0], note='foo')
#pprint(dct)

## Assemble the final table
This block goes through all the lists and tables of uuids accumulated so far and attempts to produce a DataFrame of publishable datasets.  The notes associated with each row tell which block of code produced each entry and can help unravel mysteries.  

**Straggler entries must be examined by hand.  Most represent some kind of record-keeping error in the input files.**

In [None]:
# Build the list of publishable records. handled_set remembers every uuid
# already emitted so that no dataset is listed twice; the note on each record
# names the stage that produced it.
handled_set = set()
all_recs = []

# Inputs to multi-dataset runs. Iterating in sorted order keeps the row order
# of the result identical from run to run (plain set order is not).
for uuid in sorted(set(ready_l)):
    assert uuid not in handled_set, f"{uuid} listed twice among multi-run inputs"
    all_recs.append(get_uuid_record_dict(uuid, note="no unique child"))
    handled_set.add(uuid)

# Outputs from multi-dataset runs.
for uuid in sorted(set(ready_outputs_l)):
    assert uuid not in handled_set, f"{uuid} is both a multi-run input and output"
    all_recs.append(get_uuid_record_dict(uuid, note="no unique parent"))
    handled_set.add(uuid)

# Matched primary/derived pairs: emit each side unless it is already
# published or already handled.
for idx, row in matched_groups_df.iterrows():
    uuid_primary = row['uuid_x']
    uuid_derived = row['uuid_y']
    if (uuid_primary not in published_datasets_set
            and uuid_primary not in handled_set):
        all_recs.append(get_uuid_record_dict(uuid_primary,
                                            note=f"derived dataset {uuid_derived}"))
        handled_set.add(uuid_primary)
    if (uuid_derived not in published_datasets_set
            and uuid_derived not in handled_set):
        all_recs.append(get_uuid_record_dict(uuid_derived,
                                            note=f"parent dataset {uuid_primary}"))
        handled_set.add(uuid_derived)

# Derived children of published datasets - that is, new versions.
for idx, row in qa_derived.iterrows():
    uuid = row['uuid']
    parent_uuid = row['parent_dataset']
    if uuid not in handled_set and parent_uuid in published_datasets_set:
        all_recs.append(get_uuid_record_dict(uuid,
                                            note=f"child of published parent {parent_uuid}"))
        handled_set.add(uuid)

# Stragglers: every QA dataset that none of the stages above picked up.
for idx, row in qa_df_short.iterrows():
    uuid = row['uuid']
    if uuid not in handled_set:
        # The sheet's note may be missing (NaN); concatenating NaN to a str
        # would raise TypeError, so fall back to an empty suffix.
        sheet_note = '' if pd.isna(row['note']) else str(row['note'])
        all_recs.append(get_uuid_record_dict(uuid,
                                            note="straggler; " + sheet_note))
        handled_set.add(uuid)

# Naming the columns explicitly keeps the frame well-formed (and the straggler
# filter below valid) even when no record was produced at all.
all_df = pd.DataFrame(all_recs,
                      columns=['group_name', 'data_types', 'uuid', 'hubmap_id', 'note'])
print(f"{len(all_df)} records total")

print('STRAGGLERS Follow:')
straggler_df = all_df[(all_df.note.str.startswith('straggler'))]
display(straggler_df)

## Write the results DataFrame to a file.

In [None]:
# Write the assembled table as tab-separated text.
# NOTE(review): index= is not passed, so with pandas' default the DataFrame
# index is written as a leading unnamed column — confirm that this is wanted.
all_df.to_csv('/tmp/all_publishable_datasets.tsv', sep='\t')