In [1]:
# Source code 
import os
from pprint import pprint

import pandas as pd
# Location of the CBTTC proteomics workbook. Set the CBTTC_PROTEOMICS_XLSX
# environment variable to point at a local copy; when it is unset the original
# author's path is used, so existing behavior is unchanged.
fp = os.environ.get(
    'CBTTC_PROTEOMICS_XLSX',
    '/Users/singhn4/Projects/kids_first/data/CBTTC/proteomics/cbttc-proteomics.xlsx'
)
cbttc_df = pd.read_excel(fp, sheet_name='All Fields - Included - 11_05_2')

from kf_model_omop.factory import scoped_session
from kf_model_omop.model import models
from common import constants as omop_constants
from common.target_api_config import schema

# Table names that the coverage reports in this cell skip: a model whose
# __tablename__ appears here is never counted.
standard_vocab_tables = {
    'concept', 'concept_ancestor', 'concept_class',
    'concept_relationship', 'concept_synonym',
    'domain', 'drug_strength',
    'location',
    'relationship',
    'vocabulary',
}

def table_coverage(session):
    """
    Count the rows stored for each OMOP model that received source data.

    :param session: database session; only ``session.query(model).count()``
        is used
    :returns: pandas DataFrame with one row per reported model and the
        columns 'OMOP Domains', '# of instances' and 'Source Data Domains'.
        The columns are present even when no model is reported.
    """
    # OMOP model class name -> description of the source data loaded into it
    source_domains = {
        'ConditionOccurrence': 'Diagnoses events',
        'Observation': 'Outcome events',
        'Person': 'Participants',
        'ProcedureOccurrence': 'Radiation, Chemotherapy events',
        'Speciman': 'Tumor/normal DNA and RNA samples',
    }
    columns = ['OMOP Domains', '# of instances', 'Source Data Domains']

    data = []
    for model_cls_name, entity_type in source_domains.items():
        # A default is required here: without it a missing class raises
        # AttributeError and the skip below can never trigger.
        model = getattr(models, model_cls_name, None)
        if model is None or model.__tablename__ in standard_vocab_tables:
            continue

        data.append(
            {
                'OMOP Domains': model_cls_name,
                '# of instances': session.query(model).count(),
                'Source Data Domains': entity_type,
            }
        )

    return pd.DataFrame(data, columns=columns)

def concept_match_coverage(session):
    """
    Compute, per OMOP model, the percentage of rows whose ``*_concept_id``
    columns hold a value other than 0.

    Models are taken from ``schema``; a model is skipped when it is missing
    from ``models``, listed in ``standard_vocab_tables``, has no '_links'
    entry in its schema, or has no rows.

    :param session: database session; only ``session.query(model)`` with
        ``filter(...)`` and ``count()`` is used
    :returns: list of DataFrames, one per reported model. Each has a single
        column named '<ModelClass> (%)' and one row per concept column,
        labelled with the column name minus its '_concept_id' suffix and with
        underscores replaced by spaces.
    """
    output_dfs = []
    for model_cls_name, column_dict in schema.items():
        # A default is required here: without it a missing class raises
        # AttributeError and the skip below can never trigger.
        model = getattr(models, model_cls_name, None)
        if model is None or model.__tablename__ in standard_vocab_tables:
            continue

        # Cheap dictionary check first so no query is issued for models
        # that would be skipped anyway.
        if '_links' not in column_dict:
            continue

        total_records = session.query(model).count()
        if total_records <= 0:
            continue

        percent_matched = {}
        for column in column_dict['_links']:
            if not column.endswith('_concept_id'):
                continue

            matching_concepts = (
                session.query(model)
                .filter(getattr(model, column) != 0)
                .count()
            )
            # Same precision as before (ratio rounded to 3 places); the outer
            # round removes float noise such as 7.000000000000001 that the
            # multiplication by 100 would otherwise reintroduce.
            percent_matched[column] = round(
                round(matching_concepts / total_records, 3) * 100, 1
            )

        # Use a list (ordered) rather than a set for the single index label.
        df = pd.DataFrame([percent_matched], index=[model_cls_name + ' (%)'])
        df = df.rename(
            columns={col: col.split('_concept_id')[0].replace('_', " ")
                     for col in df.columns}
        )
        output_dfs.append(df.T)

    return output_dfs

# Build both coverage reports from one scoped session. The resulting
# `domain_df` and `concept_match_dfs` are displayed by the later cells.
with scoped_session() as session:
    # Table coverage
    domain_df = table_coverage(session)
    # Concept match coverage: a list of DataFrames
    concept_match_dfs = concept_match_coverage(session)

  """)


# OMOP Data Model Exploration

## Goals
The initial goal was to research other popular industry data models and determine whether we should adopt one of these as the Kids First data model. After working with OMOP and reading the FHIR spec, it is clear that we cannot just use another data model as is, but rather we should borrow the good concepts, learn from the bad concepts, and discard/modify the bad parts of the Kids First model.

We explored the OMOP Common Data Model through the process of mapping and ingesting a CBTTC (Children's Brain Tumor Tissue Consortium) proteomics dataset. We tried to get a feel for the following:

- How well does the model capture the "things" we care about in Kids First (i.e. participants, pedigree, specimens, diagnoses, etc.) and how those "things" relate to each other
- How well does the model accommodate longitudinal data
- How easy is it to answer questions we care about

## Dataset Description

- 1 spreadsheet of clinical data elements
- Participants with a variety of pediatric brain tumors
- At least 1 sample per participant (variety of normal DNA, tumor DNA, and tumor RNA)
- Includes longitudinal data for diagnoses, radiation, and chemotherapy events

## Data Ingest
-----------------------------------------------------------------------------------------------

### 1. Build OMOP Postgres database, populate with standard vocabularies

- Source code: https://github.com/kids-first/kf-model-omop
   

### 2. Ingest Data

**Extract stage**

- Extract the relevant data from source data tables
- Map the columns to OMOP data table + columns
- Harmonize column values by mapping to standard concept instances

\* Uses the Kids First Data Ingest Library extract stage.

Source code: https://github.com/kids-first/kf-lib-data-ingest

**Transform stage**
- Join the individual extracted tables to produce one table per "thing" - 1 table for Person, 1 table for Speciman, 1 table for ProcedureOccurrences, etc.

**Load stage**
- Connect directly to the DB
- Load each "thing" table individually and in the proper order (Speciman depends on Person, so load Person first, then Speciman)
- Rather than writing raw SQL, use the object oriented data access layer we auto-generated to load the data into the OMOP database

Source code: https://github.com/kids-first/kf-omop-imports


**Mapping Method**

1. Try to automatically map values to OMOP standard concept IDs:
    - Call Athena query API with search term
    - Apply fuzzy string match to Athena results to narrow result list
    - Filter by desired domain, concept valid status, and standard/non-standard status

2. If automated mapping failed or chose wrong result:
    - Manually search and select concept ID via Athena web app
    - Store mapping in cache


## Source Data Coverage

## How easy was it to map and fit source data into the OMOP data model?
---------------------------------------------------------------------------------------------------

### 100% of source domains mapped to OMOP domains

- It was fairly easy to figure out how to map entities in the source data to tables/domains in OMOP
- The hardest part was understanding when to use the various event types, and how to best store continuous time events (radiation therapy) vs point in time events (started radiation therapy)

In [2]:
# Display stats df
display(domain_df)

Unnamed: 0,# of instances,OMOP Domains,Source Data Domains
0,230,ConditionOccurrence,Diagnoses events
1,204,Observation,Outcome events
2,204,Person,Participants
3,112,ProcedureOccurrence,"Radiation, Chemotherapy events"
4,651,Speciman,Tumor/normal DNA and RNA samples


### Roughly 90% of source data columns map to OMOP concepts

Radiation Events 
- Represented as procedure occurrence events (but maybe should have been drug exposure events?)
- `radiation_site` column did not map cleanly into procedure occurrence. Radiation site represents the anatomical location where radiation was applied. This had to be stuffed into the `procedure_occurrence_source_value` column to capture it.

Specimens
- `tumor_resection_extent` did not map to any columns on Speciman table

See https://github.com/kids-first/kf-omop-imports/tree/master/cbttc_proteomics/extract_configs for exact mappings of columns to standard concepts


### What % of values in source data map to OMOP standard concept instances

    Number of values in a column that matched a concept_id != 0 (concept_id 0 means "No Match") / total number of values in that column

In [3]:
# Display stats dfs
for df in concept_match_dfs:
    display(df)

Unnamed: 0,ConditionOccurrence (%)
condition,99.6
condition source,100.0
condition status,98.7
condition type,100.0


Unnamed: 0,Observation (%)
obs event field,100.0
observation,100.0
observation source,100.0
observation type,100.0
qualifier,0.0
unit,0.0
value as,0.0


Unnamed: 0,Person (%)
ethnicity,99.0
ethnicity source,100.0
gender,100.0
gender source,100.0
race,87.3
race source,100.0


Unnamed: 0,ProcedureOccurrence (%)
modifier,100.0
procedure,95.5
procedure source,100.0
procedure type,100.0


Unnamed: 0,Speciman (%)
anatomic site,99.5
disease status,100.0
specimen,100.0
specimen type,100.0
unit,0.0
