In [11]:
import os
import pandas as pd

project_path = os.getcwd()
ingest_path = os.path.join(project_path,'ingested')
ingest_path

'/Users/khanwn/Repos/c2m2-submission-process/data_ingestion/ingested'

# Missing Experiment Strategy Characteristics
1. Only visible rows for both tables
2. Left join between tables

# Synopsis of Result
Genomic file related to sequencing experiments that are both simulataneously visible and not visible caused the disparity in experiment strategies for the genomic file using left join. Inner join eliminates the additional sequencing experiment record with experiment strategy omitted.


In [25]:
from table_ops import TableJoiner

seq_exp_gf_df = pd.read_csv(os.path.join(ingest_path,'sequencing_experiment_genomic_file.csv')).query('visible == True') 
seq_exp_df = pd.read_csv(os.path.join(ingest_path,'sequencing_experiment.csv')).query('visible == True')

joined_df = TableJoiner(seq_exp_gf_df) \
            .left_join(seq_exp_df,
                           left_key='SG_sequencing_experiment_id',
                           right_key='SE_kf_id') \
            .get_result()

joined_df.query('SG_genomic_file_id == "GF_HA17AHM3"') \
[['SG_genomic_file_id','SG_kf_id','SE_kf_id','SE_experiment_strategy','SE_visible']]

Unnamed: 0,SG_genomic_file_id,SG_kf_id,SE_kf_id,SE_experiment_strategy,SE_visible
2679,GF_HA17AHM3,SG_FYECN825,SE_2VGRF0JK,WGS,True
2763,GF_HA17AHM3,SG_01KS30TK,,,


The code snippet below shows the elimination of the additional row for the sequencing genomic file containing the hidden sequencing experiment by using an inner join.

Also, it is worth nothing that a left join was done between the genomic_file and sequencing_genomic_file tables to not lose genomic files. However, a left join between the sequencing_genomic_file and sequencing experiment tables is not necessary because the only consequence would be the loss of an experiment strategy.

In [49]:
from table_ops import TableJoiner

seq_exp_gf_df = pd.read_csv(os.path.join(ingest_path,'sequencing_experiment_genomic_file.csv')).query('visible == True') 
seq_exp_df = pd.read_csv(os.path.join(ingest_path,'sequencing_experiment.csv')).query('visible == True')

joined_df = TableJoiner(seq_exp_gf_df) \
            .join_kf_table(seq_exp_df,
                           left_key='SG_sequencing_experiment_id',
                           right_key='SE_kf_id') \
            .get_result()

joined_df.query('SG_genomic_file_id == "GF_HA17AHM3"') \
[['SG_genomic_file_id','SG_kf_id','SE_kf_id','SE_experiment_strategy']]

Unnamed: 0,SG_genomic_file_id,SG_kf_id,SE_kf_id,SE_experiment_strategy
22135,GF_HA17AHM3,SG_FYECN825,SE_2VGRF0JK,WGS


# Missing Biosample Disease Mappings
DOID not present for certain biosamples.
# Synopsis of Result
Some study id's do not have an associated disease mapping.

In [2]:
import os
import pandas as pd
from table_ops import TableJoiner

project_path = os.getcwd()
ingest_path = os.path.join(project_path,'ingested')

In [3]:
kf_participant_df = pd.read_csv(os.path.join(ingest_path,'participant.csv')).query('visible == True')
biospec_df = pd.read_csv(os.path.join(ingest_path,'biospecimen.csv')).query('visible == True')
disease_mapping_df = pd.read_csv(os.path.join(project_path,'conversion_tables','project_disease_matrix_only.csv'))
studies_df = pd.read_table(os.path.join(project_path,'studies_on_portal.txt'))

kf_biospecs = TableJoiner(kf_participant_df) \
                .join_kf_table(disease_mapping_df,
                               left_key='PT_study_id',
                               right_key='study_id') \
                .join_kf_table(studies_df,
                               left_key='PT_study_id',
                               right_key='studies_on_portal') \
                .join_kf_table(biospec_df,
                               left_key='PT_kf_id',
                               right_key='BS_participant_id') \
                .get_result()

kf_biospecs.query('BS_kf_id == "BS_00Z52JT7"')[['PT_study_id','BS_kf_id','DOID']]

  biospec_df = pd.read_csv(os.path.join(ingest_path,'biospecimen.csv')).query('visible == True')


Unnamed: 0,PT_study_id,BS_kf_id,DOID
106325,SD_DZ4GPQX6,BS_00Z52JT7,


# Missing Subject Disease mapping

Same issue with missing biosample disease mapping

In [4]:
subject_disease_df = TableJoiner(kf_participant_df) \
                        .join_kf_table(disease_mapping_df,
                                       left_key='PT_study_id',
                                       right_key='study_id') \
                        .get_result()

kf_biospecs.query('PT_kf_id == "PT_05Y7NVK2"')[['PT_study_id','BS_kf_id','DOID']]

Unnamed: 0,PT_study_id,BS_kf_id,DOID
106248,SD_DZ4GPQX6,BS_V63P5DMB,


# Missing Genomic Files
No idea at this point

In [5]:
kf_participant_df = pd.read_csv(os.path.join(ingest_path,'participant.csv')).query('visible == True')
studies_df = pd.read_table(os.path.join(project_path,'studies_on_portal.txt'))

kf_parts = TableJoiner(kf_participant_df) \
        .join_kf_table(studies_df,
                       left_key='PT_study_id',
                       right_key='studies_on_portal') \
        .get_result()


The cell above is gathering participants from studies on portal

The cell below is acuiring genomic file data.

In [47]:
biospec_df = pd.read_csv(os.path.join(ingest_path,'biospecimen.csv'),low_memory=False).query('visible == True')
biospec_genomic_df = pd.read_csv(os.path.join(ingest_path,'biospecimen_genomic_file.csv'),low_memory=False).query('visible == True')
genomic_file_df = pd.read_csv(os.path.join(ingest_path,'genomic_file.csv'),low_memory=False).query('visible == True')

genomic_file_df = TableJoiner(kf_parts) \
                .join_kf_table(biospec_df,
                               left_key='PT_kf_id',
                               right_key='BS_participant_id') \
                .join_kf_table(biospec_genomic_df,
                               left_key='BS_kf_id',
                               right_key='BG_biospecimen_id') \
                .join_kf_table(genomic_file_df,
                               left_key='BG_genomic_file_id',
                               right_key='GF_kf_id') \
                                .get_result()

genomic_file_df = genomic_file_df.query('GF_kf_id == "GF_02R9K485"')
#genomic_file_df[[col for col in genomic_file_df.columns if isinstance(col,str) and col.startswith('BS')]]
genomic_file_df[['PT_study_id','PT_kf_id','BS_kf_id','GF_kf_id','GF_data_type']]

Unnamed: 0,PT_study_id,PT_kf_id,BS_kf_id,GF_kf_id,GF_data_type
171418,SD_DYPMEHHF,PT_M8RHAK5K,BS_Z862V8M7,GF_02R9K485,Raw Somatic Structural Variation Index
171419,SD_DYPMEHHF,PT_M8RHAK5K,BS_WGYG91X8,GF_02R9K485,Raw Somatic Structural Variation Index


The cell below is joining the genomic file to the sequencing experiment files.

In [41]:
seq_exp_gf_df = pd.read_csv(os.path.join(ingest_path,'sequencing_experiment_genomic_file.csv')).query('visible == True') 
seq_exp_df = pd.read_csv(os.path.join(ingest_path,'sequencing_experiment.csv')).query('visible == True')

with_seq_df = TableJoiner(genomic_file_df) \
            .left_join(seq_exp_gf_df,
                           left_key='GF_kf_id',
                           right_key='SG_genomic_file_id') \
            .join_kf_table(seq_exp_df,
                           left_key='SG_sequencing_experiment_id',
                           right_key='SE_kf_id') \
            .get_result()

with_seq_df[['GF_kf_id','SG_kf_id','SE_kf_id','GF_latest_did','BS_source_text_tissue_type','SE_experiment_strategy']].drop_duplicates()

Unnamed: 0,GF_kf_id,SG_kf_id,SE_kf_id,GF_latest_did,BS_source_text_tissue_type,SE_experiment_strategy
0,GF_02R9K485,SG_09WY9HHM,SE_AWYNZQN7,2f88b198-10b8-40e0-8edc-f60329999695,Tumor,WXS
1,GF_02R9K485,SG_09WY9HHM,SE_AWYNZQN7,2f88b198-10b8-40e0-8edc-f60329999695,Normal,WXS
2,GF_02R9K485,SG_A6HEMSPD,SE_E91RDF3H,2f88b198-10b8-40e0-8edc-f60329999695,Tumor,WGS
3,GF_02R9K485,SG_A6HEMSPD,SE_E91RDF3H,2f88b198-10b8-40e0-8edc-f60329999695,Normal,WGS
4,GF_02R9K485,SG_NJZHTJ3G,SE_57SP41D5,2f88b198-10b8-40e0-8edc-f60329999695,Tumor,WGS
5,GF_02R9K485,SG_NJZHTJ3G,SE_57SP41D5,2f88b198-10b8-40e0-8edc-f60329999695,Normal,WGS


In [13]:
indexd_df = pd.read_csv(os.path.join(ingest_path,'indexd_scrape.csv'),low_memory=False)
hashes_df = pd.read_csv(os.path.join(ingest_path,'hashes_old.csv'),low_memory=False)
aws_scrape_df = pd.read_csv(os.path.join(ingest_path,'aws_scrape.csv'),low_memory=False)

In [14]:
metadata_df = TableJoiner(indexd_df) \
            .join_kf_table(hashes_df,
                           left_key='url',
                           right_key='s3path') \
            .join_kf_table(aws_scrape_df,
                           left_key='url',
                           right_key='s3path') \
            .get_result()

In [46]:
from cfde_convert import kf_to_cfde_subject_value_converter

with_metadata_df = TableJoiner(with_seq_df) \
                .left_join(metadata_df,
                           left_key='GF_latest_did',
                           right_key='did') \
                .get_result()

with_metadata_df = kf_to_cfde_subject_value_converter(with_metadata_df,'SE_experiment_strategy')
#with_metadata_df[['PT_kf_id','BS_kf_id','GF_kf_id','SE_experiment_strategy','filename']].drop_duplicates()
with_metadata_df[['GF_kf_id','file_name']]

Unnamed: 0,GF_kf_id,file_name
0,GF_02R9K485,
1,GF_02R9K485,
2,GF_02R9K485,
3,GF_02R9K485,
4,GF_02R9K485,
5,GF_02R9K485,


# Additional Issues
## file.tsv
1. Checksum not acquired for some files for which drs uri's were computed
2. Multiple experiment strategies per genomic file
## file_describes_biosample.tsv
1. biosamples with missing genomic files
## disease tables
1. Inquire into ontology for study missing disease identifier