In [9]:
import sys
from itertools import chain
from pathlib import Path

import pandas as pd

# Root directory that is searched for variant-calling results.
PATH = '../Results/VariantCalling/'

# Sample sheet: headerless, tab-separated. Column names are assigned here,
# the first row per SAMPLE_ID is kept, and rows are keyed by
# "<SAMPLE_ID>-<PATIENT_ID>".
metasamples = pd.read_csv('../sarek/samples_all.tsv', sep="\t", header=None)
metasamples.columns = [
    'PATIENT_ID', 'GENDER', "TUMOR", "SAMPLE_ID", "LANE", "FASTQ1", "FASTQ2",
]
metasamples = metasamples.drop_duplicates(subset=['SAMPLE_ID'])
sheet_keys = metasamples['SAMPLE_ID'] + '-' + metasamples['PATIENT_ID']
metasamples.index = [key.strip() for key in sheet_keys]

metadata = pd.read_excel('../Metadata.xlsx')
# Select the rows flagged as included, not tumor-only, and not of type gDNA;
# only the two identifier columns are needed to build the sample keys.
is_included = metadata['INCLUDED'] == "Y"
is_paired = metadata['TUMOR_ONLY'] == "No"
not_gdna = metadata['SAMPLE_TYPE'] != "gDNA"
df = metadata.loc[is_included & is_paired & not_gdna,
                  ['PATIENT_ID', 'SAMPLE_ID']]
metadata.index = metadata['SAMPLE_ID']
meta_filtered = metadata.reindex(df['SAMPLE_ID'].tolist())
selected_keys = df['SAMPLE_ID'] + '-' + df['PATIENT_ID']
samples = [key.strip() for key in selected_keys]
meta_filtered.index = samples

# Restrict (and reorder) the sample sheet to the selected keys. Keys that are
# absent from the sheet come back as all-NaN rows.
metasamples = metasamples.reindex(samples)

print('Loaded {} samples'.format(len(metasamples)))
print('Searching for variants in {}'.format(PATH))

# Print one shell `cp` command per filtered Mutect2 VCF found under PATH,
# renaming each copy to "<SAMPLE_ID>-<PATIENT_ID>.vcf.gz" inside
# ../Results/Mutect2VCF/. Nothing is copied here; the commands are only
# printed to stdout. Diagnostics go to stderr so stdout stays a clean list.
#
# NOTE(review): metasamples is reindexed upstream to the filtered sample list,
# which excludes SAMPLE_TYPE == "gDNA". If those are the TUMOR == 0 rows, every
# patient looks tumor-only here and the paired branch below never runs --
# confirm that this is intended.
vcf_root = Path(PATH)

for index, row in metasamples.iterrows():
    sample_id = row['SAMPLE_ID']
    sample_patient = row['PATIENT_ID']

    # Keys missing from the sample sheet become all-NaN rows after reindex;
    # without this guard they would be globbed as "*nan.vcf.gz".
    if pd.isna(sample_id) or pd.isna(sample_patient):
        print('Skipping {}: not present in the sample sheet'.format(index),
              file=sys.stderr)
        continue

    patient_rows = metasamples[metasamples['PATIENT_ID'] == sample_patient]
    tumor_only = all(patient_rows['TUMOR'] == 1)
    germline = row['TUMOR'] == 0

    # Pick the file-name pattern for this sample's VCF.
    if tumor_only:
        pattern = '*{}.vcf.gz'.format(sample_id)
    elif germline:
        # Germline samples only serve as the normal of a pair.
        continue
    else:
        normals = patient_rows.loc[patient_rows['TUMOR'] == 0,
                                   'SAMPLE_ID'].to_numpy()
        if len(normals) == 0:
            # Reachable only when TUMOR holds values other than 0 and 1.
            print('Skipping {}: no germline sample for patient {}'.format(
                sample_id, sample_patient), file=sys.stderr)
            continue
        gdna = normals[0]
        pattern = '*{}_vs_{}.vcf.gz'.format(sample_id, gdna)

    out_name = sample_id + '-' + sample_patient + '.vcf.gz'
    for elem in vcf_root.rglob(pattern):
        # The glob also matches unfiltered calls; keep the filtered ones only.
        if 'Mutect2_filtered' in str(elem):
            print('cp {} ../Results/Mutect2VCF/{}'.format(str(elem), out_name))

Loaded 28 samples
Searching for variants in ../Results/VariantCalling/
cp ../Results/VariantCalling/1123_3_cfDNA/Mutect2/Mutect2/Mutect2_filtered_1123_3_cfDNA.vcf.gz ../Results/Mutect2VCF/1123_3_cfDNA-VHIO_01.vcf.gz
cp ../Results/VariantCalling/1261_2_cfDNA/Mutect2/Mutect2/Mutect2_filtered_1261_2_cfDNA.vcf.gz ../Results/Mutect2VCF/1261_2_cfDNA-VHIO_01.vcf.gz
cp ../Results/VariantCalling/B19_0970__VH_17_B_022863_A5__1_tumor/Mutect2/Mutect2/Mutect2_filtered_B19_0970__VH_17_B_022863_A5__1_tumor.vcf.gz ../Results/Mutect2VCF/B19_0970__VH_17_B_022863_A5__1_tumor-VHIO_01.vcf.gz
cp ../Results/VariantCalling/1514_5_cfDNA/Mutect2/Mutect2/Mutect2_filtered_1514_5_cfDNA.vcf.gz ../Results/Mutect2VCF/1514_5_cfDNA-VHIO_02.vcf.gz
cp ../Results/VariantCalling/916_6_cfDNA/Mutect2/Mutect2/Mutect2_filtered_916_6_cfDNA.vcf.gz ../Results/Mutect2VCF/916_6_cfDNA-VHIO_02.vcf.gz
cp ../Results/VariantCalling/B19_0972__16_39495_A1__4_tumor/Mutect2/Mutect2/Mutect2_filtered_B19_0972__16_39495_A1__4_tumor.vcf.gz ../R