In [1]:
import os 
import pandas as pd
import subprocess

# Load the headerless, tab-separated sample sheet and give its seven columns names.
samples = pd.read_csv('../sarek/samples_all.tsv', sep="\t", header=None, index_col=None)
samples.columns = ['PATIENT_ID', 'GENDER', 'TUMOR', 'SAMPLE_ID', 'LANE', 'FASTA1', 'FASTA2']
# Keep only rows with LANE == 1 and TUMOR == 1.
# Both comparisons must be parenthesised: `&` binds tighter than `==`, so
# without the second pair of parentheses the mask would be evaluated as
# ((LANE == 1) & TUMOR) == 1 rather than as two independent conditions.
samples = samples[(samples['LANE'] == 1) & (samples['TUMOR'] == 1)]

In [2]:
def exec_command(cmd, detach=False):
    """Run ``cmd`` through the shell and report whether it succeeded.

    The command's stderr is merged into its stdout. Nothing is printed when
    the command succeeds; on a non-zero exit status the captured output is
    printed line by line with trailing whitespace stripped.

    Args:
        cmd: Command line handed to ``subprocess.Popen`` with ``shell=True``,
            so shell syntax (e.g. ``>`` redirection) is honoured and the
            caller is responsible for quoting any interpolated values.
        detach: Currently unused; kept so existing call signatures keep working.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # stderr is redirected into stdout above, so the second item returned by
    # communicate() is always None and there is nothing separate to print.
    output, _ = p.communicate()
    if p.returncode == 0:
        return True
    if output:
        # errors="replace" keeps failure reporting from raising on tool output
        # that is not valid UTF-8.
        for line in output.decode("utf-8", errors="replace").split("\n"):
            print(line.rstrip())
    return False

In [5]:
REF = '../Homo_sapiens_assembly38.fasta'
# For every sample, run GATK FilterMutectCalls on its unfiltered Mutect2 VCF,
# provided the VCF, its .tbi index, the stats file and the contamination table
# all exist. Samples with any of those files missing are reported and skipped;
# the loop stops at the first sample for which the command fails.
for index, row in samples.iterrows():
    sample_id = row['SAMPLE_ID']
    mutect_vcf = 'VariantCalling/{0}/Mutect2/Mutect2_unfiltered_{0}.vcf.gz'.format(sample_id)
    mutect_tbi = 'VariantCalling/{0}/Mutect2/Mutect2_unfiltered_{0}.vcf.gz.tbi'.format(sample_id)
    mutect_stats = 'VariantCalling/{0}/Mutect2/{0}.vcf.gz.stats'.format(sample_id)
    mutect_cont = 'VariantCalling/{0}/Mutect2/{0}_contamination.table'.format(sample_id)

    required_files = [mutect_vcf, mutect_tbi, mutect_stats, mutect_cont]
    if not all(os.path.isfile(path) for path in required_files):
        print('Missing Mutect2 files for {}'.format(sample_id))
        continue

    output = 'VariantCalling/{0}/Mutect2/Mutect2_filtered2_{0}.vcf.gz'.format(sample_id)
    cmd = (
        'gatk --java-options -Xmx4g FilterMutectCalls -V {} --max-events-in-region 10 '
        '--contamination-table {} --stats {} -R {} -O {}'
    ).format(mutect_vcf, mutect_cont, mutect_stats, REF, output)
    success = exec_command(cmd)
    if not success:
        # The message names the tool actually invoked in `cmd` above.
        print('Error running FilterMutectCalls')
        break

Missing Mutect2 files for C20-0924
Missing Mutect2 files for B20-1091
Missing Mutect2 files for B20-1086
Missing Mutect2 files for B20-1084
Missing Mutect2 files for C20-0927
Missing Mutect2 files for C20-0923
Missing Mutect2 files for 4078
Missing Mutect2 files for B20-1088
Missing Mutect2 files for C20-1461
Missing Mutect2 files for 1197
Missing Mutect2 files for B20-1090
Missing Mutect2 files for B20-1097
Missing Mutect2 files for C20-0881
Missing Mutect2 files for B20-0499
Missing Mutect2 files for C20-930
Missing Mutect2 files for B20-1098


In [6]:
# REF is re-declared here as in the original cell; the snpEff command below
# does not reference it.
REF = '../Homo_sapiens_assembly38.fasta'
# Annotate each sample's filtered Mutect2 VCF with snpEff, writing the
# annotated VCF under Annotation/<sample>/snpEff/. Samples without a filtered
# VCF are reported and skipped; the loop stops at the first snpEff failure.
for index, row in samples.iterrows():
    sample_id = row['SAMPLE_ID']
    mutect_filtered_vcf = 'VariantCalling/{0}/Mutect2/Mutect2_filtered2_{0}.vcf.gz'.format(sample_id)

    if not os.path.isfile(mutect_filtered_vcf):
        print('Missing Mutect2 filtered file for {}'.format(sample_id))
        continue

    output = 'Annotation/{0}/snpEff/Mutect2_filtered2_{0}_snpEff.ann.vcf'.format(sample_id)
    # The shell redirection in the command needs the target directory to exist.
    os.makedirs('Annotation/{0}/snpEff'.format(sample_id), exist_ok=True)
    cmd = 'snpEff -Xmx4g GRCh38.86 -nodownload -canon -v {} > {}'.format(mutect_filtered_vcf, output)
    success = exec_command(cmd)
    if success:
        continue
    print('Error running snpEff')
    break

Missing Mutect2 filtered file for C20-0924
Missing Mutect2 filtered file for B20-1091
Missing Mutect2 filtered file for B20-1086
Missing Mutect2 filtered file for B20-1084
Missing Mutect2 filtered file for C20-0927
Missing Mutect2 filtered file for C20-0923
Missing Mutect2 filtered file for 4078
Missing Mutect2 filtered file for B20-1088
Missing Mutect2 filtered file for C20-1461
Missing Mutect2 filtered file for 1197
Missing Mutect2 filtered file for B20-1090
Missing Mutect2 filtered file for B20-1097
Missing Mutect2 filtered file for C20-0881
Missing Mutect2 filtered file for B20-0499
Missing Mutect2 filtered file for C20-930
Missing Mutect2 filtered file for B20-1098
