In [1]:
import pandas as pd
from cyvcf2 import VCF

In [2]:
# Input locations used by the rest of the notebook.

# CSV with one row of metadata per sample (includes the `reliable` flag and `tissue_type`).
fp_metadata = '/scratch/ucgd/lustre-work/quinlan/data-shared/datasets/spermseq/final_seq/19610R_batch1and2_combined_wf/quinlan_analysis/custom/metadata/reliable_samples_MRs.csv'

# Directory of processed per-sample VCFs, addressed as <dir>/<sample_ID>/<sample_ID>.variants_filtered.vcf.gz
dir_processed_sample_vcfs = '/scratch/ucgd/lustre-work/quinlan/data-shared/datasets/spermseq/final_seq/19610R_batch1and2_combined_wf/quinlan_analysis/custom/processed_vcfs/sample_vcfs'

In [3]:
# Read the full sample metadata table from disk.
samps_df = pd.read_csv(fp_metadata)

# Keep only the rows whose boolean `reliable` column is True.
reliable_mask = samps_df['reliable']
reliable_samps_df = samps_df.loc[reliable_mask]

# Preview the first five retained rows.
reliable_samps_df.head()

Unnamed: 0,donor_ID,sample_ID,input_DNA,reliable,collection_date,analysis,type,age,tissue_type,mean_no_call_rate,...,genomic_mutation_rate_SNV_intronic,genomic_mutation_rate_SNV_exonic,genomic_mutation_rate_INDEL_intronic,genomic_mutation_rate_INDEL_exonic,num_mutations,num_unique_inherited_mutations,num_clonal_mutations,num_non_clonal_mutations,num_pathogenic_mutations,duplex_bases
0,1SCS014,19610X1.1,200 ng,True,4/23/10,cross-sectional,paired samples,35.9,Sperm DNA,0.118964,...,2.153297e-08,3.866146e-08,9.787712e-10,9.787712e-10,127,13,2,125,0,2043378475
1,1SCS014,19610X2.1,200 ng,True,4/23/10,cross-sectional,paired samples,35.9,Blood DNA,0.100579,...,3.828919e-08,7.864807e-08,3.104529e-09,2.069686e-09,236,13,3,233,7,1932660251
10,D091_IX8Y012,19610X13.1,200 ng,True,3/24/95,longitudinal,1st collection,52.0,Sperm DNA,0.09229,...,2.013937e-08,3.812095e-08,7.192633e-10,4.31558e-09,88,2,0,88,3,1390311543
11,D091_IX8Y012,19610X14.1,200 ng,True,9/6/11,longitudinal,last collection,68.0,Blood DNA,0.065349,...,1.390663e-07,2.074406e-07,2.317772e-09,2.317772e-09,303,2,12,291,1,862897462
12,D091_IX8Y012,19610X15.1,200 ng,True,9/20/11,longitudinal,last collection,68.0,Sperm DNA,0.086492,...,2.149925e-08,5.486015e-08,2.965414e-09,4.44812e-09,113,2,0,113,5,1348884351


In [4]:
### split the reliable-sample metadata by the tissue each sample came from
tissue_of_sample = reliable_samps_df['tissue_type']

### sperm samples only
reliable_sperm_samps_df = reliable_samps_df.loc[tissue_of_sample.eq('Sperm DNA')]

### blood samples only
reliable_blood_samps_df = reliable_samps_df.loc[tissue_of_sample.eq('Blood DNA')]

In [9]:
# Load one sample's filtered VCF and walk through the cyvcf2 accessors used in this notebook.
# The path is derived from `dir_processed_sample_vcfs` (same pattern as the per-sample loop below)
# so the data location is defined in exactly one place.
example_sample_ID = '19610X1.1'
fp_example_vcf = '{}/{}/{}.variants_filtered.vcf.gz'.format(dir_processed_sample_vcfs, example_sample_ID, example_sample_ID)
vcf = list(VCF(fp_example_vcf)) # cast to a list so records can be indexed and iterated more than once
variant = vcf[123] # a single record picked as a worked example

# *** variant.POS gives you the VCF coordinate; variant.start and variant.end give you the BED coordinates
#     (in the output below, start == POS - 1)
print(variant.CHROM, variant.POS, variant.start, variant.end, variant.REF, variant.ALT)
print(variant.INFO.get('variation_type'))
print(variant.format('filter')[0]) # this will either be PASS or a semi-colon separated list of filters this variant failed
print('alt_depth: ', variant.format('alt_depth')[0][0])
print('VAF: ', variant.format('AF')[0][0])
print('context: ', variant.format('context')[0])

# *** variant.format('some_field') returns a list of values, one for each sample represented in the VCF.
#     Since the sample VCFs only have one sample column, always use index [0] for those.
#     If using the donor VCFs or any VCF with multiple sample columns, index accordingly.


chr4 54732043 54732042 54732044 TG ['T']
indel
shared_blood_sperm;num_donors_with_mut_gt_1;
alt_depth:  1
VAF:  0.0007824726
context:  TTG


In [6]:
def is_CpG_to_TpG(v):
    """Return True if `v` is a C>T transition at a CpG dinucleotide, on either strand.

    Parameters
    ----------
    v : variant record (e.g. a cyvcf2 Variant)
        Must expose `v.INFO.get('subtype')` (such as 'C>T' or 'G>A') and
        `v.format('context')`, whose first element is the 3-base reference
        context. This assumes the mutated base is the middle character of that
        context, as in the examples printed by this notebook (C>T with 'ACG',
        G>A with 'CGC').

    Returns
    -------
    bool
        True for C>T where the C is followed by G (context 'NCG'), or for the
        reverse-strand equivalent G>A where the G is preceded by C
        (context 'CGN'). False for every other variant.
    """
    subtype = v.INFO.get('subtype')
    # Only these two substitutions can be CpG>TpG; bail out before touching the context.
    if subtype not in ('C>T', 'G>A'):
        return False

    context = v.format('context')[0]
    if subtype == 'C>T':
        # Forward strand: mutated C (middle base) must be followed by G.
        return bool(context[1:] == 'CG')
    # Reverse strand: the mutated G (middle base) is the G of the CpG, so it must be
    # *preceded* by C. Testing for a following C ('GC') would select GpC sites instead.
    return bool(context[:2] == 'CG')

# Keep every record of the example VCF that is_CpG_to_TpG accepts.
CpGs = list(filter(is_CpG_to_TpG, vcf))

# Show chromosome, position, substitution subtype and sequence context for the first five hits.
for variant in CpGs[:5]:
    fields = (variant.CHROM, variant.POS, variant.INFO.get('subtype'), variant.format('context')[0])
    print(*fields)

chr1 243572952 C>T CCG
chr2 47798625 C>T ACG
chr3 9756552 C>T ACG
chr3 41233763 C>T CCG
chr5 80873118 G>A CGC


In [7]:
### iterate through reliable samples, get processed VCF, and subset VCF

# Filter strings accepted when recurrence across donors is tolerated:
# either the variant passed everything, or recurrence was the only filter it failed.
PASS_OR_RECURRENT_ONLY = ('PASS', 'num_donors_with_mut_gt_1;')

for idx,row in reliable_samps_df.iterrows():
    # Each pass rebinds the lists below for the current sample; nothing is accumulated across samples here.

    sample_ID = row['sample_ID']
    tissue_type = row['tissue_type']

    # the '.variants_filtered.vcf.gz' files are easier to look at, but if you want VEP and ClinVar annotations choose 'variants_filtered_annotated.vcf.gz'
    fp_vcf = '{}/{}/{}.variants_filtered.vcf.gz'.format(dir_processed_sample_vcfs, sample_ID, sample_ID)
    sample_vcf = list(VCF(fp_vcf)) # ***if you want to iterate over the VCF object more than once, cast it as a list***

    ### get SNVs only
    SNVs = [v for v in sample_vcf if v.INFO.get('variation_type') == 'snv']

    ### get clonal variants only
    # format() returns one row of values per sample column; [0][0] takes this sample's scalar alt_depth
    # so the comparison yields a plain boolean rather than an array.
    clonal = [v for v in sample_vcf if v.format('alt_depth')[0][0] > 1]

    ### get nonclonal variants only
    nonclonal = [v for v in sample_vcf if v.format('alt_depth')[0][0] == 1]

    ### get PASS mutations (SNVs and INDELs that pass quality filters and do not recur across samples)
    pass_mutations = [v for v in sample_vcf if v.format('filter')[0] == 'PASS']

    ### get PASS mutations AND mutations that recur across samples (can be recurrent in any tissue)
    allow_shared_donor = [v for v in sample_vcf if v.format('filter')[0] in PASS_OR_RECURRENT_ONLY]

    ### get PASS mutations AND mutations that recur only in samples of the same tissue type (e.g. if sample_ID is a sperm sample, only allow variants that recur in sperm)
    other_tissue = 'blood' if tissue_type == 'Sperm DNA' else 'sperm'
    other_tissue_count_key = 'num_donors_with_variant_at_this_pos_in_' + other_tissue
    # Same filter condition as allow_shared_donor, further restricted to variants never seen in the other tissue.
    allow_shared_tissue = [v for v in allow_shared_donor if v.INFO.get(other_tissue_count_key) == 0]