In [53]:
import pandas as pd
import boto3
import os

# Load both spreadsheets from the working directory:
#   meta     <- exome samples ("Metadata.xlsx")
#   metapdxs <- PDX samples   ('Metadata_pdx.xlsx')
meta, metapdxs = (
    pd.read_excel(workbook)
    for workbook in ("Metadata.xlsx", "Metadata_pdx.xlsx")
)

In [54]:
# Convert gender to XX or XY; any other value in the column is left untouched
for gender_label, karyotype in (("female", "XX"), ("male", "XY")):
    meta.loc[meta["Gender"] == gender_label, "Gender"] = karyotype

# Convert sample id to string so the .str accessor below works on every row
meta['Sample_ID'] = meta['Sample_ID'].astype(str)

# Remove dots from sample names.
# regex=False makes the '.' a literal dot regardless of the pandas version's
# default for `regex` (as a regular expression '.' would match any character).
meta['Sample_ID'] = meta['Sample_ID'].str.replace('.', '', regex=False)

# Make sample id unique by appending the row position and the sample type.
# NOTE: this is not idempotent - re-running the cell appends the suffix again.
meta['Sample_ID'] = [
    '{}_{}_{}'.format(sample_id, position, sample_type)
    for position, (sample_id, sample_type)
    in enumerate(zip(meta['Sample_ID'], meta['sampletype']))
]

# Create lane column; every sample starts on lane 1
meta['lane'] = 1

# Check for re-sequenced samples.
# Every row with a value in the re-sequencing column is duplicated: the copy
# takes the re-sequenced id as its 'FASTA ID' and is assigned lane 2.
# The copies are added with a single pd.concat because DataFrame.append was
# removed in pandas 2.0 (and appending row by row is quadratic).
reseq_column = 'FASTA ID (re-secuenciado) '  # trailing space is part of the header
reseq_rows = meta.loc[meta[reseq_column].notnull()].copy()
if not reseq_rows.empty:
    reseq_rows['FASTA ID'] = reseq_rows[reseq_column]
    reseq_rows['lane'] = 2
    meta = pd.concat([meta, reseq_rows], ignore_index=True)


# Check the files are in S3 and adjust incorrect names if they are not found.
#
# Collect the base name of every object under s3://scitron/exomes/.
# A paginator is used because one list_objects_v2 call returns at most 1000
# keys; page.get('Contents', []) also copes with an empty prefix.
s3 = boto3.client('s3')
filenames = set()
for page in s3.get_paginator('list_objects_v2').paginate(Bucket='scitron', Prefix='exomes/'):
    for obj in page.get('Contents', []):
        object_name = os.path.basename(obj['Key'])
        # Keys ending in '/' (e.g. the 'exomes/' prefix itself) have an empty
        # base name and are not files, so they are skipped.
        if object_name:
            filenames.add(object_name)

# Turn each bare FASTA id into the S3 URLs of its paired-end files.
# The '<id>_1.fastq.gz' / '<id>_2.fastq.gz' naming is tried first; if either
# file is absent from the bucket listing, the '<id>_R1_001.fastq.gz' /
# '<id>_R2_001.fastq.gz' naming is used instead (without further checking).
# NOTE: not idempotent - re-running the cell would prefix the URLs again.
for index, row in meta.iterrows():
    if pd.isnull(row['FASTA ID']) or row['FASTA ID'] == '':
        continue
    file_id = str(row['FASTA ID']).strip()
    file1 = '{}_1.fastq.gz'.format(file_id)
    file2 = '{}_2.fastq.gz'.format(file_id)
    if file1 not in filenames or file2 not in filenames:
        file1 = '{}_R1_001.fastq.gz'.format(file_id)
        file2 = '{}_R2_001.fastq.gz'.format(file_id)
    meta.loc[index, 'FASTA ID'] = 's3://scitron/exomes/{}'.format(file1)
    meta.loc[index, 'FASTA2'] = 's3://scitron/exomes/{}'.format(file2)

# Add PDXs information.
# For every PDX entry, each bucket file whose name contains the PDX file id
# (and does not contain 'R2') produces one new row. The row is cloned from the
# first existing row of the same patient, then given the PDX sample type,
# sample id, an incrementing lane number and the S3 URLs of both reads.
#
# The file names are sorted so lane numbers are assigned in the same order on
# every run (iterating the set directly has no guaranteed order).
# New rows are collected and concatenated once because DataFrame.append was
# removed in pandas 2.0.
candidate_files = sorted(filenames)
pdx_rows = []
for index, row in metapdxs.iterrows():
    file_id = row['FastaID'].replace('/', '-')
    patient_id = row['Pt VHIO ID']
    sample_type = row['Type of sample']
    sample_id = "{}_{}".format(row['PDX_UBOP_ID'], sample_type)
    lane = 1
    for file in candidate_files:
        if 'R2' not in file and file_id in file:
            # Raises IndexError if the patient has no row in `meta`.
            new_row = meta.loc[meta["Patient ID"] == patient_id].iloc[0].copy()
            new_row['lane'] = lane
            lane += 1
            new_row['sampletype'] = sample_type
            new_row['Sample_ID'] = sample_id
            new_row['FASTA ID'] = 's3://scitron/exomes/{}'.format(file)
            # NOTE(review): replaces every 'R1' in the name, not only the read
            # tag - confirm no file id itself contains 'R1'.
            new_row['FASTA2'] = 's3://scitron/exomes/{}'.format(file.replace('R1', 'R2'))
            pdx_rows.append(new_row)

if pdx_rows:
    # infer_objects restores proper column dtypes for the object-typed rows.
    meta = pd.concat([meta, pd.DataFrame(pdx_rows).infer_objects()], ignore_index=True)

# Keep only the rows that ended up with a 'FASTA ID' value (NaN/None are dropped)
meta = meta.dropna(subset=['FASTA ID'])

In [55]:
# Final check: compare the file names referenced by `meta` with the bucket.
#
# Re-list s3://scitron/exomes/. A paginator is used because one
# list_objects_v2 call returns at most 1000 keys; page.get('Contents', [])
# also copes with an empty prefix.
s3 = boto3.client('s3')
filenames = set()
for page in s3.get_paginator('list_objects_v2').paginate(Bucket='scitron', Prefix='exomes/'):
    for obj in page.get('Contents', []):
        object_name = os.path.basename(obj['Key'])
        # Keys ending in '/' have an empty base name and are not files.
        if object_name:
            filenames.add(object_name)

# 1) Report every row that points at a file missing from the bucket.
files1 = set()
files2 = set()
for index, row in meta.iterrows():
    file1 = os.path.basename(str(row['FASTA ID']))
    file2 = os.path.basename(str(row['FASTA2']))
    files1.add(file1)
    files2.add(file2)
    if file1 not in filenames or file2 not in filenames:
        print("Row {} does not have the correct file names {} {}".format(index, file1, file2))

# 2) Report every bucket file that no row references.
for diff in sorted(filenames - (files1 | files2)):
    print(diff)

md5sum.txt


In [57]:
# Sarek format: select the columns, in this order, that are written to the TSV
sarek_columns = [
    "Patient ID",
    "Gender",
    "sampletype",
    "Sample_ID",
    "lane",
    "FASTA ID",
    "FASTA2",
]
sub_meta = meta.loc[:, sarek_columns]

# Convert sample type to 0 (normal) and 1 (tumor):
# "gDNA" becomes 0 first, then everything that is not 0 becomes 1
is_gdna = sub_meta["sampletype"] == "gDNA"
sub_meta.loc[is_gdna, "sampletype"] = 0
is_not_zero = sub_meta["sampletype"] != 0
sub_meta.loc[is_not_zero, "sampletype"] = 1

# Both files are tab-separated, without header row and without index column
tsv_options = dict(sep="\t", header=None, index=None)

# Test dataset: only the rows of patient VHIO_13
test_subset = sub_meta.loc[sub_meta["Patient ID"] == "VHIO_13"]
test_subset.to_csv("testsamples.tsv", **tsv_options)

# All samples
sub_meta.to_csv("samples.tsv", **tsv_options)