In [32]:
# Raw code strings that are passed to extract_explicit_hierarchy further down.
# Several entries are prefixes of others (e.g. 'HSC' -> 'HSC00180' -> 'HSC0018007');
# that prefix relationship is what the hierarchy extraction keys on.
# NOTE(review): the name `file` is misleading for a list of codes; renaming it
# would also require updating the cell that calls extract_explicit_hierarchy(file).
file = [
'HC0782',
'HC0784',
'HC0785',
'HC018008',
'HC018017',
'HC018107',
'HC018108',
'HC018217',
'HC018218',
'HC018227',
'HC018228',
'HC018258',
'HC018267',
'HC018307',
'HC018308',
'HC018407',
'HC018408',
'HC018417',
'HC018807',
'HC018808',
'HC038008',
'HC038107',
'HC038108',
'HC048008',
'HC048208',
'HC048307',
'HC048308',
'HC128507',
'HC128508',
'HS998508',
'HSC',
'HSC00180',
'HSC00181',
'HSC00182',
'HSC00183',
'HSC00184',
'HSC00188',
'HSC00380',
'HSC00381',
'HSC00417',
'HSC00480',
'HSC00482',
'HSC00780',
'HSC00781',
'HSC00782',
'HSC00784',
'HSC00785',
'HSC00799',
'HSC00881',
'HSC999',
'HSC01285',
'HSC9998B',
'HSC9998C',
'HSC9998D',
'HSC9998E',
'HSC9998F',
'HSC9998H',
'HSC9998I',
'HSC9998K',
'HSC9998L',
'HSC9998M',
'HSC9998N',
'HSC9998O',
'HSC9998P',
'HSC9999',
'HSC0018007',
'HSC0018008',
'HSC0018017',
'HSC0018018',
'HSC0018107',
'HSC0018108',
'HSC0018217',
'HSC0018218',
'HSC0018227',
'HSC0018228',
'HSC0018258',
'HSC0018267',
'HSC0018307',
'HSC0018308',
'HSC0018407',
'HSC0018408',
'HSC0018417',
'HSC0018507',
'HSC0018508',
'HSC0018527',
'HSC0018528',
'HSC0018707',
'HSC0018708',
'HSC0018807',
'HSC0018808',
'HSC0019907',
'HSC0019908',
'HSC0038007',
'HSC0038008',
'HSC0038017',
'HSC0038107',
'HSC0038108',
'HSC0038407',
'HSC0038408',
'HSC0039907',
'HSC0039908',
'HSC0048007',
'HSC0048008',
'HSC0048017',
'HSC0048107',
'HSC0048108',
'HSC0048207',
'HSC0048208',
'HSC0048307',
'HSC0048308',
'HSC0049907',
'HSC0049908',
'HSC0088107',
'HSC0088108',
'HSC0098107',
'HSC99983',
'HSC99985',
'HSC99986',
'HSC99987',
'HSC99989',
'HSC99990',
'HSC99999',
'HSC0108107',
'HSC0118107',
'HSC0128007',
'HSC0128008',
'HSC0128017',
'HSC0128107',
'HSC0128108',
'HSC0128207',
'HSC0128307',
'HSC0128408',
'HSC0128507',
'HSC0128508',
'HSC9998407',
'HSC9998408',
'HSC9998507',
'HSC9998508',
'HSCGOW',
'HSCINV',
'PEN001',
'TRE03999',
'YHSC0018207',
'YHSC0018208',
'ZERO-MRS'
]

In [33]:
def extract_explicit_hierarchy(raw_strings):
    """
    Extract explicit child-parent pairs from a list of raw strings.

    A string ``p`` is an ancestor of a string ``c`` when ``c`` starts with ``p``
    and ``p != c``. The parent of ``c`` is its longest ancestor that is present
    in ``raw_strings``; shorter ancestors sit higher up the hierarchy.

    Parameters:
    - raw_strings (list): Strings whose prefix relationships define the
      hierarchy. Any iterable of strings is accepted.

    Returns:
    list: One tuple per input string, in input order, where each tuple contains:
          1. Child string.
          2. Parent string.

    If no ancestor is found for a string, it is reported as its own parent
    (i.e. it is a root).

    Example:
    >>> raw_strings = ['AB123', 'AB12345', 'CD987', 'CD9876']
    >>> extract_explicit_hierarchy(raw_strings)
    [('AB123', 'AB123'), ('AB12345', 'AB123'), ('CD987', 'CD987'), ('CD9876', 'CD987')]
    """
    # Materialise once: the input is traversed twice (set build + main loop)
    raw_strings = list(raw_strings)
    # Set gives O(1) membership tests, replacing the all-pairs startswith scan
    known_strings = set(raw_strings)
    # Results list
    node_parent_list = []
    for c_string in raw_strings:
        # Default: without any ancestor the string is its own parent
        c_parent = c_string
        # Walk the proper prefixes from longest to shortest; the first one that
        # is a known string is by construction the longest ancestor
        for prefix_len in range(len(c_string) - 1, -1, -1):
            candidate_parent = c_string[:prefix_len]
            if candidate_parent in known_strings:
                c_parent = candidate_parent
                break
        # Store child-parent tuple
        node_parent_list.append((c_string, c_parent))
    # Return the result
    return node_parent_list

In [34]:
import pandas as pd

# Build the (child, parent) pairs for every raw code string
p_file = extract_explicit_hierarchy(file)
# Name the columns at construction time instead of renaming positional labels,
# so the frame keeps its Node/Parent schema even when there are no pairs
df = pd.DataFrame(p_file, columns=['Node', 'Parent'])

In [35]:
# Spot-check: show the row for node 'HSC00180' to see which parent it was given
df.loc[df['Node'] == 'HSC00180']

Unnamed: 0,Node,Parent
31,HSC00180,HSC
