In [8]:
from pathlib import Path
import pandas as pd
from itertools import chain

# Root directory that is searched recursively for variant-calling output.
PATH = '../Results/VariantCalling/'

# The sample sheet is tab-separated and has no header row, so the column
# names are attached by hand right after loading.
metasamples = pd.read_csv('../sarek/samples_all.tsv', sep="\t", header=None)
metasamples.columns = ['PATIENT_ID', 'GENDER', "TUMOR", "SAMPLE_ID", "LANE", "FASTQ1", "FASTQ2"]

# Force SAMPLE_ID to text, then keep the first row seen for each SAMPLE_ID.
metasamples = (
    metasamples
    .assign(SAMPLE_ID=metasamples['SAMPLE_ID'].astype(str))
    .drop_duplicates(subset=['SAMPLE_ID'])
)

# Row labels are '<SAMPLE_ID>-<PATIENT_ID>' with surrounding whitespace removed.
sample_patient_keys = metasamples['SAMPLE_ID'] + '-' + metasamples['PATIENT_ID']
metasamples.index = [key.strip() for key in sample_patient_keys]

# Metadata workbook, labelled by '<SAMPLE_ID>-<PATIENT_ID>' (whitespace-trimmed).
metadata = pd.read_excel('../Metadata.xlsx')
metadata['SAMPLE_ID'] = metadata['SAMPLE_ID'].astype(str)
metadata_keys = metadata['SAMPLE_ID'] + '-' + metadata['PATIENT_ID']
metadata.index = [key.strip() for key in metadata_keys]

# Keep only samples that are tumor-normal pair and labeled as included.
# Each criterion is named separately so the selection reads as a sentence.
is_included = metadata['INCLUDED'] == "Y"
is_paired = metadata['TUMOR_ONLY'] == "No"
has_var2 = metadata['VAR2'].notnull()  # NOTE(review): meaning of VAR2 is not visible here
is_not_gdna = metadata['SAMPLE_TYPE'] != "gDNA"
meta_filtered = metadata.loc[is_included & is_paired & has_var2 & is_not_gdna, :]

# Restrict the sample sheet to the rows selected in meta_filtered, keeping the
# metadata order.
#
# Reindexing directly on meta_filtered.index would insert an all-NaN row for
# every selected key that is absent from the sample sheet. Those placeholder
# rows inflate the "Loaded N samples" count and carry NaN instead of string
# IDs, so absent keys are reported and left out instead.
selected_keys = meta_filtered.index
in_sample_sheet = selected_keys.isin(metasamples.index)
if not in_sample_sheet.all():
    print('WARNING: {} selected sample(s) not found in the sample sheet: {}'.format(
        (~in_sample_sheet).sum(),
        ', '.join(map(str, selected_keys[~in_sample_sheet]))))
metasamples = metasamples.reindex(selected_keys[in_sample_sheet])

print('Loaded {} samples'.format(metasamples.shape[0]))
print('Searching for variants in {}'.format(PATH))

# For each selected sample, find the VCF files named
# '*<sample>_vs_<gdna>.vcf.gz' anywhere below PATH, keep those whose path
# contains 'Mutect2_filtered', and print a `cp` command that would copy each
# one to ../Mutect2VCF/<SAMPLE_ID>-<PATIENT_ID>.vcf.gz.
# Nothing is copied here; the commands are only printed.
# Warning lines start with '#' so they are inert if the printed output is
# pasted into a shell together with the cp commands.
results_root = Path(PATH)  # loop-invariant, so built once
for index, row in metasamples.iterrows():
    sample_id = row['SAMPLE_ID']
    sample_patient = row['PATIENT_ID']

    # A row without IDs (e.g. a placeholder row produced by reindexing on a
    # key that is not in the sample sheet) cannot be matched to any file.
    if pd.isna(sample_id) or pd.isna(sample_patient):
        print('# WARNING: no sample sheet entry for {}; skipped'.format(index))
        continue

    # SAMPLE_ID of the patient's gDNA row; it forms the right-hand side of the
    # '<sample>_vs_<gdna>' file name. The first match is used, as before.
    gdna_ids = metadata.loc[(metadata['PATIENT_ID'] == sample_patient)
                            & (metadata['SAMPLE_TYPE'] == 'gDNA'), 'SAMPLE_ID']
    if gdna_ids.empty:
        # Previously this case died with a bare IndexError that named neither
        # the sample nor the patient.
        print('# WARNING: no gDNA sample for patient {}; skipped {}'.format(
            sample_patient, sample_id))
        continue
    gdna = gdna_ids.iloc[0]

    out_name = sample_id + '-' + sample_patient + '.vcf.gz'
    for elem in results_root.rglob('*{}_vs_{}.vcf.gz'.format(sample_id, gdna)):
        if 'Mutect2_filtered' in str(elem):
            print('cp {} ../Mutect2VCF/{}'.format(str(elem), out_name))

Loaded 40 samples
Searching for variants in ../Results/VariantCalling/
cp ../Results/VariantCalling/1123_3_cfDNA_vs_B19_0971__17_22863_A1__0_gDNA/Mutect2/Mutect2_filtered_1123_3_cfDNA_vs_B19_0971__17_22863_A1__0_gDNA.vcf.gz ../Mutect2VCF/1123_3_cfDNA-VHIO_01.vcf.gz
cp ../Results/VariantCalling/1261_2_cfDNA_vs_B19_0971__17_22863_A1__0_gDNA/Mutect2/Mutect2_filtered_1261_2_cfDNA_vs_B19_0971__17_22863_A1__0_gDNA.vcf.gz ../Mutect2VCF/1261_2_cfDNA-VHIO_01.vcf.gz
cp ../Results/VariantCalling/B19_0970__VH_17_B_022863_A5__1_tumor_vs_B19_0971__17_22863_A1__0_gDNA/Mutect2/Mutect2_filtered_B19_0970__VH_17_B_022863_A5__1_tumor_vs_B19_0971__17_22863_A1__0_gDNA.vcf.gz ../Mutect2VCF/B19_0970__VH_17_B_022863_A5__1_tumor-VHIO_01.vcf.gz
cp ../Results/VariantCalling/3031_11_cfDNA_vs_L19_5550_gDNA/Mutect2/Mutect2_filtered_3031_11_cfDNA_vs_L19_5550_gDNA.vcf.gz ../Mutect2VCF/3031_11_cfDNA-VHIO_04.vcf.gz
cp ../Results/VariantCalling/C20-0924_vs_L19_5550_gDNA/Mutect2/Mutect2_filtered_C20-0924_vs_L19_5550_gDNA.