In [1]:
import pandas as pd
from etl.transform.standard_model.concept_schema import (
    CONCEPT,
    DELIMITER,
    UNIQUE_ID_ATTR,
    concept_from,
    concept_attr_from,
    unique_key_composition
)
from common.constants import *

In [5]:

def _unique_key_cols(concept_name, df, ukey_cols):
    """
    Collect the names of the columns needed to build a concept's unique key.

    The required columns for a concept's unique key are defined in
    etl.transform.standard_model.concept_schema.unique_key_composition.

    A concept's unique key can be composed of other concepts' unique keys.
    When a required column is itself a unique key, this function recurses
    into the concept that column belongs to and collects the columns that
    make up that nested key instead. If the unique key column for
    `concept_name` is already present in `df`, that single column is used
    and the composition is not consulted.

    Results are appended to `ukey_cols` in place; nothing is returned. When
    an error is raised, `ukey_cols` may already contain the columns that
    were collected before the failure.

    Failures are raised explicitly instead of via `assert` statements so
    that they are still reported when Python runs with optimizations
    enabled (-O), which removes `assert` statements.

    :param concept_name: a string and the name of the concept for which a
    unique key will be made
    :param df: a Pandas DataFrame
    :param ukey_cols: the output list of columns needed to build the unique
    key column for a concept (mutated in place)
    :raises AssertionError: if the concept has no (or an empty) unique key
    composition in the concept schema, or if a required non-key column is
    not present in `df`
    """
    # If unique key col for this concept already exists return that
    ukey_col_name = f'{concept_name}{DELIMITER}{UNIQUE_ID_ATTR}'
    if ukey_col_name in df.columns:
        ukey_cols.append(ukey_col_name)
        return

    # Get the required cols needed to make a unique key for this concept
    required_cols = unique_key_composition.get(concept_name)
    # If required cols don't exist for a concept, then we have made a dev
    # error in concept_schema.py
    if not required_cols:
        raise AssertionError(f'Concept {concept_name} does not have '
                             'a unique key composition defined '
                             'in the concept schema!')
    cols = set(df.columns)

    # Add required cols to cols needed for the unique key
    for req_col in required_cols:
        if concept_attr_from(req_col) == UNIQUE_ID_ATTR:
            # The required col is a unique key itself, so recurse
            _unique_key_cols(concept_from(req_col), df, ukey_cols)
        else:
            # If all of the required cols are not present then we cannot
            # make the unique key
            if req_col not in cols:
                raise AssertionError(
                    f'Failed to create unique key for {concept_name}.'
                    ' Missing 1 or more required columns')
            ukey_cols.append(req_col)


def insert_keys(df):
    """
    Add a unique key column to `df` for every concept whose key can be built.

    For each concept in unique_key_composition, the columns that make up the
    concept's unique key are resolved with _unique_key_cols. The values of
    those columns are converted to strings and joined with DELIMITER, and
    the result is stored in the column
    f'{concept_name}{DELIMITER}{UNIQUE_ID_ATTR}'.

    Concepts are processed in the iteration order of unique_key_composition.
    Concepts whose key columns cannot be resolved are skipped.

    :param df: a Pandas DataFrame; it is modified in place
    :returns: the same DataFrame object, with the unique key columns added
    """
    for concept_name in unique_key_composition.keys():
        # Determine the cols needed to compose a unique key for the concept
        ukey_cols = []
        try:
            _unique_key_cols(concept_name, df, ukey_cols)
        except AssertionError:
            # Best effort by design: a concept whose required columns are
            # not available in this DataFrame simply gets no key column.
            continue

        unique_key_col = f'{concept_name}{DELIMITER}{UNIQUE_ID_ATTR}'
        # Build the keys column-wise rather than with a row-wise
        # DataFrame.apply: a row Series coerces every column to one common
        # dtype (an int next to a float column would stringify as '1.0'),
        # and a plain list is also well-defined for a frame with no rows.
        df[unique_key_col] = [
            DELIMITER.join(map(str, values))
            for values in zip(*(df[col] for col in ukey_cols))
        ]
    return df

In [6]:
# Sample rows, one tuple per row: (participant, biospecimen, genomic file)
id_rows = [
    ('P1', 'B1', 'G1'),
    ('P1', 'B2', 'G1'),
    ('P2', 'B3', 'G1'),
]
df = pd.DataFrame([
    {CONCEPT.PARTICIPANT.ID: participant_id,
     CONCEPT.BIOSPECIMEN.ID: biospecimen_id,
     CONCEPT.GENOMIC_FILE.ID: genomic_file_id}
    for participant_id, biospecimen_id, genomic_file_id in id_rows
])
# Show the frame before and after the unique key columns are inserted
display(df)
df = insert_keys(df)
display(df)

Unnamed: 0,CONCEPT|BIOSPECIMEN|ID,CONCEPT|GENOMIC_FILE|ID,CONCEPT|PARTICIPANT|ID
0,B1,G1,P1
1,B2,G1,P1
2,B3,G1,P2


Unnamed: 0,CONCEPT|BIOSPECIMEN|ID,CONCEPT|GENOMIC_FILE|ID,CONCEPT|PARTICIPANT|ID,CONCEPT|GENOMIC_FILE|UNIQUE_KEY,CONCEPT|BIOSPECIMEN|UNIQUE_KEY,CONCEPT|PARTICIPANT|UNIQUE_KEY,CONCEPT|BIOSPECIMEN_GENOMIC_FILE|UNIQUE_KEY
0,B1,G1,P1,G1,B1,P1,B1|G1
1,B2,G1,P1,G1,B2,P1,B2|G1
2,B3,G1,P2,G1,B3,P2,B3|G1


In [7]:
# Check the composite unique key of the biospecimen / genomic file link:
# its first two DELIMITER-separated parts must equal the biospecimen and
# genomic file unique keys found on the same row.
concept_name = 'CONCEPT|BIOSPECIMEN_GENOMIC_FILE'
ukey_col = f'{concept_name}{DELIMITER}{UNIQUE_ID_ATTR}'
id_col = f'{concept_name}{DELIMITER}ID'
assert ukey_col in df.columns

if concept_name == CONCEPT.BIOSPECIMEN_GENOMIC_FILE._CONCEPT_NAME:
    col1 = CONCEPT.BIOSPECIMEN.UNIQUE_KEY
    col2 = CONCEPT.GENOMIC_FILE.UNIQUE_KEY

    def _components_match(row):
        """Return True when the row's composite key starts with col1, col2."""
        parts = row[ukey_col].split(DELIMITER)
        # The second part is only inspected when the first one matches
        return parts[0] == row[col1] and parts[1] == row[col2]

    s = df.apply(_components_match, axis=1)

In [8]:
# Display the per-row result of the composite key check from the previous cell
s

0    True
1    True
2    True
dtype: bool