# Preparing metadata for submission to the DMRR

In [1]:
# Common prefix for the generated donor (-DO) and biosample (-BS) IDs, the
# experiment reference, and the metadata/manifest file names built in later cells.
study_name = 'EXR-MTEWA1HealthyControls'

## Load the samples spreadsheet

In [2]:
import pandas as pd
import numpy as np
# Load the sample sheet and normalize the free-text columns so that the same
# value is always spelled the same way.
samples_df = pd.read_csv('sample_sheet.csv')
for column in ['Gender', 'Race', 'Source']:
    # Strip BEFORE capitalizing: capitalize() upper-cases only the very first
    # character and lower-cases the rest, so a value with leading whitespace
    # (' male') would otherwise end up as 'male' instead of 'Male'.
    samples_df[column] = samples_df[column].str.strip().str.capitalize()
# only use healthy control study and samples that passed quality control
is_healthy_control = samples_df['Study'] == 'Healthy Controls'
passed_qc = samples_df['MISEQ.QC.PASS'] == 'PASS'
samples_df = samples_df.loc[is_healthy_control & passed_qc]
# keep only the necessary columns
samples_df = samples_df[['Participant.ID', 'MT.Unique.ID', 'Age', 'Gender', 'Race', 'Source']]
# index by sample ID; order rows by participant so samples of one participant are adjacent
samples_df = samples_df.set_index(['MT.Unique.ID']).sort_values(by='Participant.ID')
samples_df[:5]

Unnamed: 0_level_0,Participant.ID,Age,Gender,Race,Source
MT.Unique.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,70014,23.0,Male,Asian,Plasma
107,70014,23.0,Male,Asian,Serum
2,70016,39.0,Male,White,Plasma
108,70016,39.0,Male,White,Serum
3,70028,33.0,Female,White,Plasma


## Extract the participant info from the samples

In [3]:
# One row per participant: all participants have a plasma sample (some also have
# serum), so keeping only the plasma rows drops the repeated participants.
is_plasma = samples_df['Source'] == 'Plasma'
participants = samples_df.loc[is_plasma, ['Participant.ID', 'Age', 'Race', 'Gender']]
participants = participants.set_index('Participant.ID')
participants.head()

Unnamed: 0_level_0,Age,Race,Gender
Participant.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
70014,23.0,Asian,Male
70016,39.0,White,Male
70028,33.0,White,Female
70029,27.0,Black or african american,Female
70038,22.0,White,Female


## Use correct ontology terms

In [4]:
# Sample-sheet race label -> ontology term used in the donor metadata.
race_ontology = {
    'Asian': 'Asian',
    'Black or african american': 'African American',
    'Mixed/asian & white': 'Multiracial',
    'Mixed/asian &black': 'Multiracial',
    'Mixed/black, white, asian': 'Multiracial',
    'Native hawiian or other pacific islander': 'Native Hawaiian or Other Pacific Islander',
    'Pacific islander': 'Native Hawaiian or Other Pacific Islander',
    'White': 'White',
}
# Any label that is not a key above is recorded as 'Multiracial'.
# NOTE(review): that fallback also catches missing values and misspelled labels --
# confirm this is intended.
participants['Race'] = [race_ontology.get(label, 'Multiracial') for label in participants['Race']]
set(participants['Race'])

{'African American',
 'Asian',
 'Multiracial',
 'Native Hawaiian or Other Pacific Islander',
 'White'}

## Load the donors template

In [5]:
import pandas as pd
# Template rows (properties) that this notebook does not fill in.
unused_donor_properties = [
    '- Ethnic Group', '-- Current Health Status', '-- Medical History', '-- Smoking History',
    '-- Medications', '-- Treatment History', '-- Family History', '-- Developmental Stage',
    '- Has Expired?', '-- Estimated Date', '-- Post-mortem Interval', '- Notes',
    '* Family Members', '*- Family Member', '*-- Relationship', '*-- DocURL',
    '* Aliases', '*-  Accession', '*-- dbName', '*-- URL', '- Health Status', '*-- Notes',
]
# Read the template with one row per property, then remove the unused rows.
donors = pd.read_csv('templates/Donors.template.tsv', sep='\t').set_index('#property')
donors = donors.drop(unused_donor_properties)
donors.head()

Unnamed: 0_level_0,value,domain,default,required,description
#property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Donor,,"autoID(EXR, uniqAlphaNum, DO)",,,Document Describing Information About the Dono...
- Status,,"enum(Add, Modify, Hold, Cancel, Suppress, Rele...",Add,True,Status of the document
- Sex,,"bioportalTerms((SNOMEDCT,http://purl.bioontolo...",,True,"Gender of sample donor (Example: Male, Female,..."
- Racial Category,,bioportalTerm(http://data.bioontology.org/sear...,,,The racial category of the donor
- Donor Type,,"enum(Experimental, Control, Healthy Subject)",,True,Sample type (experimental sample or control sa...


## Fill in the donors dataframe

In [6]:
# Add one column per participant to the donors template and record the generated
# donor ID of each participant on the `participants` dataframe.
for i, part_id in enumerate(participants.index, start=1):
    # str() keeps this working when Participant.ID was parsed as an integer,
    # and matches how the biosample columns are named.
    participant_column = 'value' + str(part_id)
    donor_id = study_name + str(i) + '-DO'
    participants.loc[part_id, 'donor.id'] = donor_id  # for matching biosamples to donor ids
    # Start from a copy of the template's 'value' column, inserted at position i
    # so the participant columns keep the iteration order.
    donors.insert(i, participant_column, donors['value'])
    donors.loc['Donor', participant_column] = donor_id
    donors.loc['- Status', participant_column] = 'Add'
    donors.loc['- Sex', participant_column] = participants.at[part_id, 'Gender']
    donors.loc['- Racial Category', participant_column] = participants.at[part_id, 'Race']
    donors.loc['- Donor Type', participant_column] = 'Healthy Subject'
    age = participants.at[part_id, 'Age']
    if pd.notna(age):
        # A missing age is left as the template's empty value instead of 'nan years'.
        donors.loc['- Age', participant_column] = str(age) + ' years'
    # One custom metadata entry: the original Participant.ID
    donors.loc['* Custom Metadata', participant_column] = 1
    donors.loc['*- Property Name', participant_column] = 'Participant.ID'
    donors.loc['*-- Value', participant_column] = part_id
# The template's own 'value' column was only needed as the source of the copies.
donors = donors.drop('value', axis=1)
donors.iloc[:5,:3]

Unnamed: 0_level_0,value70014,value70016,value70028
#property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Donor,EXR-MTEWA1HealthyControls1-DO,EXR-MTEWA1HealthyControls2-DO,EXR-MTEWA1HealthyControls3-DO
- Status,Add,Add,Add
- Sex,Male,Male,Female
- Racial Category,Asian,White,White
- Donor Type,Healthy Subject,Healthy Subject,Healthy Subject


In [7]:
# Preview the first rows: `participants` now includes the 'donor.id' column that
# the biosamples are matched against later on.
participants.iloc[:5, :5]  # at most the first 5 rows and 5 columns

Unnamed: 0_level_0,Age,Race,Gender,donor.id
Participant.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
70014,23.0,Asian,Male,EXR-MTEWA1HealthyControls1-DO
70016,39.0,White,Male,EXR-MTEWA1HealthyControls2-DO
70028,33.0,White,Female,EXR-MTEWA1HealthyControls3-DO
70029,27.0,African American,Female,EXR-MTEWA1HealthyControls4-DO
70038,22.0,White,Female,EXR-MTEWA1HealthyControls5-DO


## Load the biosamples template

In [8]:
# Template rows (properties) that this notebook does not fill in.
unused_biosample_properties = [
    '-- Age at Sampling', '-- Notes', '- Description',
    '--- Symptoms', '--- Pathology', '--- Disease Duration',
    '--- Collection Details',
    '---- Sample Collection Method', '---- Geographic Location',
    '---- Collection Date', '---- Time of Collection',
    '---- Collection Tube Type', '----- Other Collection Tube Type',
    '---- Holding Time', '---- Holding Temperature',
    '---- Preservatives Used', '---- Freezing Method',
    '---- Number of Times Freeze Thawed',
    '---- Contamination Removal Method', '--- Notes',
    '-- Cell Culture Supernatant', '--- Source', '---- Type',
    '---- Cell Line', '---- Start Date', '---- Harvest Date',
    '--- Tissue', '---- Date Obtained', '---- Tissue Type',
    '-- Starting Amount', '-- Replicate Information',
    '--- Biological Replicate Number', '--- Technical Replicate Number',
    '-- Provider', '--- Company Name', '--- Lab Name', '--- Person Name',
    '* Pooled Biosamples', '*- Pooled Biosample', '*-- DocURL',
    '* Aliases', '*-  Accession', '*-- dbName', '*-- URL',
    '*-- Date Submitted to External Database', '*-- Notes',
]
# Read the template with one row per property, then remove the unused rows.
biosamples = (
    pd.read_csv('templates/Biosamples.template.tsv', sep='\t')
    .set_index('#property')
    .drop(unused_biosample_properties)
)

# Dropping by label removed every '*-- DocURL' row, so one is added back at row
# position 18 (presumably directly below '*- Related Experiment' -- TODO confirm).
# DataFrame.insert works on columns, hence the transpose / insert / transpose.
# The five values line up with the value, domain, default, required and
# description columns of the template.
doc_url_row = [np.nan, 'URL', np.nan, np.nan, 'Relative ID (accession) of doc, provide Document URL']
properties_as_columns = biosamples.T
properties_as_columns.insert(18, '*-- DocURL', doc_url_row)
biosamples = properties_as_columns.T
biosamples.iloc[:5, :5]

Unnamed: 0_level_0,value,domain,default,required,description
#property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Biosample,,"autoID(EXR, uniqAlphaNum, BS)",,,Document Describing Information About the Bios...
- Status,,"enum(Add, Modify, Hold, Cancel, Suppress, Rele...",Add,True,Status of the document
- Name,,string,,True,Name of the sample
- Donor ID,,"regexp(EXR-[a-zA-Z0-9]{6,}-DO)",,True,ID of related donor document
-- DocURL,,url,,,"Relative ID (accession) of Donor ID doc, provi..."


## Fill in the biosamples dataframe

In [9]:
# Add one column per sample to the biosamples template, linking each sample to
# the donor ID generated for its participant.
for i, mt_unique_id in enumerate(samples_df.index, start=1):
    participant_id = samples_df.loc[mt_unique_id, 'Participant.ID']
    donor_id = participants.loc[participant_id, 'donor.id']
    sample_column = 'value' + str(mt_unique_id)
    # Start from a copy of the template's 'value' column, inserted at position i
    # so the sample columns keep the iteration order.
    biosamples.insert(i, sample_column, biosamples['value'])
    biosamples.loc['Biosample', sample_column] = study_name + str(i) + '-BS'
    biosamples.loc['- Status', sample_column] = 'Add'
    biosamples.loc['- Name', sample_column] = 'MT.Unique.ID_' + str(mt_unique_id)
    biosamples.loc['- Donor ID', sample_column] = donor_id
    biosamples.loc['-- DocURL', sample_column] = 'coll/Donors/doc/' + donor_id
    biosamples.loc['--- Scientific Name', sample_column] = 'Homo sapiens'
    biosamples.loc['--- Common Name', sample_column] = 'Human'
    biosamples.loc['--- Taxon ID', sample_column] = 9606
    biosamples.loc['-- Disease Type', sample_column] = 'Healthy Subject'
    # NOTE(review): the same anatomical location is written for every sample,
    # serum ones included -- confirm 'Plasma cell' is the intended term.
    biosamples.loc['-- Anatomical Location', sample_column] = 'Plasma cell'
    biosamples.loc['--- Biofluid Name', sample_column] = samples_df.loc[mt_unique_id, 'Source']
    # The value must not carry surrounding whitespace.
    biosamples.loc['-- exRNA Source', sample_column] = 'total cell-free biofluid RNA'
    biosamples.loc['-- Fractionation', sample_column] = 'Yes'
    # Every sample points at the single experiment document '<study_name>1-EX'.
    biosamples.loc['* Related Experiments', sample_column] = 1
    biosamples.loc['*- Related Experiment', sample_column] = study_name + '1-EX'
    biosamples.loc['*-- DocURL', sample_column] = 'coll/Experiments/doc/' + study_name + '1-EX'
    # One custom metadata entry: the original Participant.ID
    biosamples.loc['* Custom Metadata', sample_column] = 1
    biosamples.loc['*- Property Name', sample_column] = 'Participant.ID'
    biosamples.loc['*-- Value', sample_column] = participant_id

# The template's own 'value' column was only needed as the source of the copies.
biosamples = biosamples.drop('value', axis=1)
biosamples.iloc[:5, :5]

Unnamed: 0_level_0,value1,value107,value2,value108,value3
#property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Biosample,EXR-MTEWA1HealthyControls1-BS,EXR-MTEWA1HealthyControls2-BS,EXR-MTEWA1HealthyControls3-BS,EXR-MTEWA1HealthyControls4-BS,EXR-MTEWA1HealthyControls5-BS
- Status,Add,Add,Add,Add,Add
- Name,MT.Unique.ID_1,MT.Unique.ID_107,MT.Unique.ID_2,MT.Unique.ID_108,MT.Unique.ID_3
- Donor ID,EXR-MTEWA1HealthyControls1-DO,EXR-MTEWA1HealthyControls1-DO,EXR-MTEWA1HealthyControls2-DO,EXR-MTEWA1HealthyControls2-DO,EXR-MTEWA1HealthyControls3-DO
-- DocURL,coll/Donors/doc/EXR-MTEWA1HealthyControls1-DO,coll/Donors/doc/EXR-MTEWA1HealthyControls1-DO,coll/Donors/doc/EXR-MTEWA1HealthyControls2-DO,coll/Donors/doc/EXR-MTEWA1HealthyControls2-DO,coll/Donors/doc/EXR-MTEWA1HealthyControls3-DO


## Strip numbers from the "value" columns

In [10]:
# Turn the per-participant / per-sample column names (e.g. 'value70014') back into
# plain 'value' by removing every digit and any '*' from each column name.
for frame in (donors, biosamples):
    frame.columns = [
        ''.join(filter(lambda char: not (char.isdigit() or char == '*'), name))
        for name in frame.columns
    ]

## Load the manifest template file

In [11]:
import json
# Parse the manifest template into a dict and display it.
with open('templates/manifest_template.manifest.json', 'r') as template_file:
    manifest = json.load(template_file)
manifest

{'studyName': '',
 'userLogin': '',
 'md5CheckSum': '',
 'runMetadataFileName': '',
 'submissionMetadataFileName': '',
 'studyMetadataFileName': '',
 'experimentMetadataFileName': '',
 'biosampleMetadataFileName': '',
 'donorMetadataFileName': '',
 'manifest': [{'sampleName': '', 'dataFileName': ''}],
 'settings': {'analysisName': ''}}

## Load the list of fastq filenames

In [12]:
# Build { MT.Unique.ID: fastq_filename }; the ID is the integer in front of the
# first '_' of each file name.
with open('HealthyControl_and_FeedingStudy_fastq_file.names.txt', 'r') as file:
    sample_filenames = {}
    for line in file:
        fastq_filename = line.strip()
        if not fastq_filename:
            # Skip blank lines (e.g. a trailing empty line) instead of failing on int('').
            continue
        sample_filenames[int(fastq_filename.split('_')[0])] = fastq_filename

## Fill in the manifest

In [13]:
import datetime
import time

# The analysis name carries today's date (YYYY-MM-DD).
analysis_date = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')
manifest['settings']['analysisName'] = 'MTEWA1_Healthy_Controls_' + analysis_date

# Submission-level fields
manifest.update({
    'studyName': "U01 Healthy Controls July 2018",
    'userLogin': 'sovacool',
    'group': 'exrna-mtewa1',
    'db': 'hg19_exRNA',
})

# Every metadata file is named '<study_name>-<document code>.metadata.tsv'.
document_codes = [('run', 'RU'), ('submission', 'SU'), ('study', 'ST'),
                  ('experiment', 'EX'), ('biosample', 'BS'), ('donor', 'DO')]
for document, code in document_codes:
    manifest[document + 'MetadataFileName'] = study_name + '-' + code + '.metadata.tsv'

# One manifest entry per sample, ordered by MT.Unique.ID, pairing the sample
# name with its fastq file.
manifest['manifest'] = [
    {'sampleName': 'MT.Unique.ID_' + str(mt_unique_id),
     'dataFileName': sample_filenames[mt_unique_id]}
    for mt_unique_id in sorted(samples_df.index)
]
len(manifest['manifest'])

130

## Save all the files

In [14]:
import json
import os

# Create the output folder first so that writing does not fail when it is missing.
os.makedirs('healthyCtrl', exist_ok=True)

# Donor and biosample metadata are written as tab-separated files under the
# names recorded in the manifest.
donors.to_csv('healthyCtrl/' + manifest['donorMetadataFileName'],  sep='\t')
biosamples.to_csv('healthyCtrl/' + manifest['biosampleMetadataFileName'],  sep='\t')

# NOTE(review): the manifest itself is written to the working directory, not to
# healthyCtrl/ -- confirm this is intended.
with open(study_name + '.manifest.json', 'w') as file:
    json.dump(manifest, file, indent=4)

## Validate that the files listed in the manifest exist

In [15]:
import os

# Check every top-level '...FileName' entry of the manifest under healthyCtrl/
# and report all missing files at once, naming each of them.
missing_files = [
    'healthyCtrl/' + manifest[key]
    for key in manifest
    if 'FileName' in key and not os.path.isfile('healthyCtrl/' + manifest[key])
]
assert not missing_files, 'Missing metadata files: ' + ', '.join(missing_files)

## Don't forget to manually fill in experiment, run, study, and submission metadata files!