In [4]:
import pandas as pd
from cyvcf2 import VCF

In [33]:
vcf = list(VCF('../data/output/CellCut/vcf/GB115_Laurel-6_unique_snps.vcf'))
sample_names = [
    "GB115_Laurel-16", "GB115_Laurel-14", "GB115_Laurel-13", "GB115_Laurel-12",
    "GB115_Laurel-11", "GB115_Laurel-10", "GB115_Laurel-8", "GB115_Laurel-17",
    "GB115_Laurel-2", "GB115_Laurel-7", "GB115_Laurel-1", "GB115_Laurel-3",
    "GB115_Laurel-5", "GB115_Laurel-15", "GB115_Laurel-6"
]


In [7]:
variant = vcf[0]
# *** variant.POS gives you vcf coordinate; variant.start and variant.end give you bed coordinates)
print(variant.CHROM, variant.POS, variant.start, variant.end, variant.REF, variant.ALT)
# "Number of samples with data"
print(variant.INFO.get('NS'))
# Total read depth at the locus"
print(variant.INFO.get('DP'))
# depth by base
print(variant.INFO.get('DPB'))

print(variant.format("GT"))

# "Total number of alleles in called genotypes"
print(variant.INFO.get('AN'))



chr1 10138 10137 10138 T ['C']
10
140
140.0
['' '' '' '\x02\x02' '\x02\x02' '\x02\x02' '\x02\x04' '' '\x02\x02'
 '\x02\x02' '\x02\x02' '\x02\x02' '\x02\x02' '' '\x02\x02']
20


In [36]:
# Initialize accumulators for averages
total_dp = 0
total_ad = 0
total_ro = 0
total_qual = 0
count_variants = 0

# Initialize accumulators for Laurel-6 averages
laurel_dp = 0
laurel_ad = 0
laurel_ro = 0
laurel_qual = 0
laurel_count = 0

# Initialize accumulators for other samples averages
other_dp = 0
other_ad = 0
other_ro = 0
other_qual = 0
other_count = 0

# Loop through all variants in the VCF
for v in vcf:
    if v.QUAL > 20 and v.INFO.get("NS") == 10:  # Check if QUAL > 20 and NS = 10
        dp_values = []
        ad_values = []
        ro_values = []

        # Get the depth (DP) for the sample
        genotypes = v.gt_types  # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3)

        # Get the depth values for the variant
        alt_depths = v.gt_alt_depths  # Alternative allele depths
        ref_depths = v.gt_ref_depths  # Reference allele depths
        bases = v.gt_bases  # Base calls for the variant

        # Iterate through each sample and gather the required information
        for i in range(len(genotypes)):
            sample_name = sample_names[i]
            # Get the total depth (sum of reference and alternate depths)
            dp = ref_depths[i] + alt_depths[i]
            dp_values.append(dp)

            # Calculate the allele depth (sum of ref and alt depths)
            ad = ref_depths[i] + alt_depths[i]
            ad_values.append(ad)

            # Reference allele depth
            ro = ref_depths[i]
            ro_values.append(ro)

            # Separate out averages based on the sample name
            if sample_name == "GB115_Laurel-6":
                laurel_dp += dp
                laurel_ad += ad
                laurel_ro += ro
                laurel_qual += v.QUAL
                laurel_count += 1
            else:
                other_dp += dp
                other_ad += ad
                other_ro += ro
                other_qual += v.QUAL
                other_count += 1

        # Calculate the average values for this variant
        avg_dp = sum(dp_values) / len(dp_values) if dp_values else 0
        avg_ad = sum(ad_values) / len(ad_values) if ad_values else 0
        avg_ro = sum(ro_values) / len(ro_values) if ro_values else 0

        # Add these averages to the accumulators
        total_dp += avg_dp
        total_ad += avg_ad
        total_ro += avg_ro
        total_qual += v.QUAL  # Accumulate quality value

        # Increment the variant count
        count_variants += 1

# Calculate and print the average for all variants with NS=10
if count_variants > 0:
    avg_dp_all = total_dp / count_variants
    avg_ad_all = total_ad / count_variants
    avg_ro_all = total_ro / count_variants
    avg_qual_all = total_qual / count_variants

    print(f"\nAverage DP for variants with NS=10: {avg_dp_all:.2f}")
    print(f"Average AD for variants with NS=10: {avg_ad_all:.2f}")
    print(f"Average RO for variants with NS=10: {avg_ro_all:.2f}")
    print(f"Average Quality for variants with NS=10: {avg_qual_all:.2f}")
else:
    print("No variants with NS=10 found.")

# Calculate and print the averages for Laurel-6
if laurel_count > 0:
    avg_dp_laurel = laurel_dp / laurel_count
    avg_ad_laurel = laurel_ad / laurel_count
    avg_ro_laurel = laurel_ro / laurel_count
    avg_qual_laurel = laurel_qual / laurel_count

    print(f"\nAverage DP for Laurel-6: {avg_dp_laurel:.2f}")
    print(f"Average AD for Laurel-6: {avg_ad_laurel:.2f}")
    print(f"Average RO for Laurel-6: {avg_ro_laurel:.2f}")
    print(f"Average Quality for Laurel-6: {avg_qual_laurel:.2f}")
else:
    print("No variants found for Laurel-6.")

# Calculate and print the averages for other samples
if other_count > 0:
    avg_dp_other = other_dp / other_count
    avg_ad_other = other_ad / other_count
    avg_ro_other = other_ro / other_count
    avg_qual_other = other_qual / other_count

    print(f"\nAverage DP for other samples: {avg_dp_other:.2f}")
    print(f"Average AD for other samples: {avg_ad_other:.2f}")
    print(f"Average RO for other samples: {avg_ro_other:.2f}")
    print(f"Average Quality for other samples: {avg_qual_other:.2f}")
else:
    print("No variants found for other samples.")



Average DP for variants with NS=10: 5.70
Average AD for variants with NS=10: 5.70
Average RO for variants with NS=10: 1.88
Average Quality for variants with NS=10: 1966.91

Average DP for Laurel-6: 21.97
Average AD for Laurel-6: 21.97
Average RO for Laurel-6: 7.66
Average Quality for Laurel-6: 1966.91

Average DP for other samples: 4.54
Average AD for other samples: 4.54
Average RO for other samples: 1.47
Average Quality for other samples: 1966.91


In [8]:
phases = variant.gt_phases
quals = variant.gt_quals,  # numpy array

In [9]:


# Print the Genotype Quality (GQ)
print(f"Genotype Quality (GQ), Phred-scaled marginal probability of the called genotype: {variant.format('GQ')}")

# Print the Genotype Likelihood (GL)
print(f"Genotype Likelihood (GL), log10-scaled likelihoods of the data given the called genotype: {variant.format('GL')}")

# Print the Read Depth (DP)
print(f"Read Depth (DP), the total number of reads covering this site: {variant.format('DP')}")

# Print the Number of Observations for each Allele (AD)
print(f"Allele Depth (AD), number of observations for each allele (reference and alternate): {variant.format('AD')}")

# Print the Reference Allele Observation Count (RO)
print(f"Reference Allele Observation Count (RO), the number of reads supporting the reference allele: {variant.format('RO')}")

# Print the Sum of Quality of Reference Observations (QR)
print(f"Sum of Quality of Reference Observations (QR), sum of quality scores for reference allele reads: {variant.format('QR')}")

# Print the Alternate Allele Observation Count (AO)
print(f"Alternate Allele Observation Count (AO), the number of reads supporting the alternate allele: {variant.format('AO')}")

# Print the Sum of Quality of Alternate Observations (QA)
print(f"Sum of Quality of Alternate Observations (QA), sum of quality scores for alternate allele reads: {variant.format('QA')}")



Genotype Quality (GQ), Phred-scaled marginal probability of the called genotype: None
Genotype Likelihood (GL), log10-scaled likelihoods of the data given the called genotype: [[      nan       nan       nan]
 [      nan       nan       nan]
 [      nan       nan       nan]
 [  0.       -2.60316 -18.3039 ]
 [  0.       -2.70927  -7.45676]
 [  0.       -1.61475 -10.3175 ]
 [ -1.14554   0.      -10.621  ]
 [      nan       nan       nan]
 [  0.       -5.71957 -32.5009 ]
 [  0.       -3.61236 -18.882  ]
 [  0.       -4.81648 -30.2115 ]
 [  0.       -3.31133 -15.0847 ]
 [  0.       -3.0103  -16.9262 ]
 [      nan       nan       nan]
 [  0.       -4.87905 -30.2097 ]]
Read Depth (DP), the total number of reads covering this site: [[-2147483648]
 [-2147483648]
 [-2147483648]
 [         14]
 [          9]
 [         13]
 [         11]
 [-2147483648]
 [         19]
 [         12]
 [         16]
 [         11]
 [         10]
 [-2147483648]
 [         25]]
Allele Depth (AD), number of observatio

In [13]:
# gt_types is array of 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT
genotypes = variant.gt_types
alt_depths = variant.gt_alt_depths
ref_depths = variant.gt_ref_depths
bases = variant.gt_bases

In [10]:
# "Number of samples with data"
print("Number of samples with data:", variant.INFO.get('NS'))

# "Total read depth at the locus"
print("Total read depth at the locus:", variant.INFO.get('DP'))

# "Total read depth per base"
print("Total read depth per base:", variant.INFO.get('DPB'))

# "Total number of alternate alleles in called genotypes"
print("Total number of alternate alleles:", variant.INFO.get('AC'))

# "Total number of alleles in called genotypes"
print("Total number of alleles:", variant.INFO.get('AN'))

# "Estimated allele frequency"
print("Estimated allele frequency:", variant.INFO.get('AF'))

# "Reference allele observation count (full)"
print("Reference allele observation count (full):", variant.INFO.get('RO'))

# "Alternate allele observation count (full)"
print("Alternate allele observation count (full):", variant.INFO.get('AO'))

# "Reference allele observation count (partial)"
print("Reference allele observation count (partial):", variant.INFO.get('PRO'))

# "Alternate allele observation count (partial)"
print("Alternate allele observation count (partial):", variant.INFO.get('PAO'))

# "Reference allele quality sum"
print("Reference allele quality sum (phred):", variant.INFO.get('QR'))

# "Alternate allele quality sum"
print("Alternate allele quality sum (phred):", variant.INFO.get('QA'))

# "Reference allele quality sum (partial)"
print("Reference allele quality sum (partial, phred):", variant.INFO.get('PQR'))

# "Alternate allele quality sum (partial)"
print("Alternate allele quality sum (partial, phred):", variant.INFO.get('PQA'))

# "Number of reference observations on the forward strand"
print("Reference observations (forward strand):", variant.INFO.get('SRF'))

# "Number of reference observations on the reverse strand"
print("Reference observations (reverse strand):", variant.INFO.get('SRR'))

# "Number of alternate observations on the forward strand"
print("Alternate observations (forward strand):", variant.INFO.get('SAF'))

# "Number of alternate observations on the reverse strand"
print("Alternate observations (reverse strand):", variant.INFO.get('SAR'))


Number of samples with data: 10
Total read depth at the locus: 140
Total read depth per base: 140.0
Total number of alternate alleles: 1
Total number of alleles: 20
Estimated allele frequency: 0.05000000074505806
Reference allele observation count (full): 130
Alternate allele observation count (full): 10
Reference allele observation count (partial): 0.0
Alternate allele observation count (partial): 0.0
Reference allele quality sum (phred): 4392
Alternate allele quality sum (phred): 160
Reference allele quality sum (partial, phred): 0.0
Alternate allele quality sum (partial, phred): 0.0
Reference observations (forward strand): 44
Reference observations (reverse strand): 86
Alternate observations (forward strand): 9
Alternate observations (reverse strand): 1


In [11]:
# "Strand balance probability for the reference allele"
print("Strand balance probability (reference allele):", variant.INFO.get('SRP'))

# "Strand balance probability for the alternate allele"
print("Strand balance probability (alternate allele):", variant.INFO.get('SAP'))

# "Allele balance at heterozygous sites"
print("Allele balance (heterozygous sites):", variant.INFO.get('AB'))

# "Allele balance probability at heterozygous sites"
print("Allele balance probability (heterozygous sites):", variant.INFO.get('ABP'))

# "Run length: number of consecutive repeats of the alternate allele in the reference genome"
print("Run length (alternate allele):", variant.INFO.get('RUN'))

# "Read Placement Probability for alternate alleles"
print("Read Placement Probability (alternate allele):", variant.INFO.get('RPP'))

# "Read Placement Probability for reference observations"
print("Read Placement Probability (reference observations):", variant.INFO.get('RPPR'))

# "Reads Placed Left: supporting alternate allele"
print("Reads Placed Left (supporting alternate):", variant.INFO.get('RPL'))

# "Reads Placed Right: supporting alternate allele"
print("Reads Placed Right (supporting alternate):", variant.INFO.get('RPR'))

# "End Placement Probability for alternate alleles"
print("End Placement Probability (alternate allele):", variant.INFO.get('EPP'))

# "End Placement Probability for reference observations"
print("End Placement Probability (reference observations):", variant.INFO.get('EPPR'))

# "Alternate allele depth ratio"
print("Alternate allele depth ratio:", variant.INFO.get('DPRA'))

# "Log odds ratio of best genotype combination"
print("Log odds ratio of best genotype combination:", variant.INFO.get('ODDS'))

# "Number of genotyping iterations to convergence or bailout"
print("Genotyping iterations to convergence or bailout:", variant.INFO.get('GTI'))

# "Type of allele (snp, mnp, ins, del, complex)"
print("Type of allele:", variant.INFO.get('TYPE'))

# "Extended CIGAR representation of alternate allele"
print("CIGAR representation of alternate allele:", variant.INFO.get('CIGAR'))

# "Number of unique non-reference alleles"
print("Number of unique non-reference alleles:", variant.INFO.get('NUMALT'))

# "Mean number of non-reference allele observations per sample"
print("Mean number of non-reference allele observations per sample:", variant.INFO.get('MEANALT'))

# "Allele length"
print("Allele length:", variant.INFO.get('LEN'))

# "Mean mapping quality of observed alternate alleles"
print("Mean mapping quality (alternate alleles):", variant.INFO.get('MQM'))

# "Mean mapping quality of observed reference alleles"
print("Mean mapping quality (reference alleles):", variant.INFO.get('MQMR'))

# "Proportion of observed alternate alleles supported by properly paired reads"
print("Proportion of alternate alleles supported by paired reads:", variant.INFO.get('PAIRED'))

# "Proportion of observed reference alleles supported by properly paired reads"
print("Proportion of reference alleles supported by paired reads:", variant.INFO.get('PAIREDR'))

# "Minimum depth in gVCF output block"
print("Minimum depth in gVCF output block:", variant.INFO.get('MIN_DP'))

# "Last position in gVCF output record"
print("Last position in gVCF output record:", variant.INFO.get('END'))

# "Fraction of alternate allele observations in reads from Illumina"
print("Fraction of alternate observations from Illumina reads:", variant.INFO.get('technology.Illumina'))


Strand balance probability (reference allele): 32.47549819946289
Strand balance probability (alternate allele): 16.907699584960938
Allele balance (heterozygous sites): 0.27272701263427734
Allele balance probability (heterozygous sites): 7.945459842681885
Run length (alternate allele): 1
Read Placement Probability (alternate allele): 24.725000381469727
Read Placement Probability (reference observations): 35.3484992980957
Reads Placed Left (supporting alternate): 10.0
Reads Placed Right (supporting alternate): 0.0
End Placement Probability (alternate allele): 16.907699584960938
End Placement Probability (reference observations): 3.277559995651245
Alternate allele depth ratio: 1.2272700071334839
Log odds ratio of best genotype combination: 13.949000358581543
Genotyping iterations to convergence or bailout: 0
Type of allele: snp
CIGAR representation of alternate allele: 1X
Number of unique non-reference alleles: 1
Mean number of non-reference allele observations per sample: 1.0
Allele leng

In [None]:
# gt_types is array of 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT
HOM_REF=0
HET=1
UNKNOWN=2
HOM_ALT=3

blood_col = 1
sperm1_col = 0
sperm2_col = 2

# variant.gt_types, variant.gt_ref_depths, variant.gt_alt_depths # numpy arrays
# variant.gt_phases, variant.gt_quals, variant.gt_bases # numpy array

for v in vcf:
    youngsperm, blood, oldsperm, ys_altdepth, os_altdepth, blood_altdepth, ys_refdepth, os_refdepth, blood_refdepth, ys_bases, os_bases, blood_bases = get_info_v(v,
                                                                                                                        blood_col, sperm1_col, sperm2_col)
    if v.QUAL>20 and v.INFO.get("NS") > 1 and v.INFO.get("DP") > 10 and get_interesting(v, youngsperm, blood, oldsperm):
        print("blood", blood_bases, "alt depth", blood_altdepth, "ref depth", blood_refdepth, "young sperm", ys_bases, "alt depth", ys_altdepth,
                  "ref depth", ys_refdepth, "old sperm",os_bases, "alt depth", os_altdepth, "ref depth", os_refdepth)
        #print("blood", "sperm1", "sperm2")
        #print(v.INFO.get("AC"))
            #print(v.gt_types, ys_altdepth, os_altdepth, v.QUAL)
            #print(os_refdepth, ys_refdepth, blood_refdepth)

In [None]:
for v in vcf:
    # Get the necessary info
    genotypes = v.gt_types

    # Assign sample context: GT
    youngsperm = genotypes[0]
    blood = genotypes[1]
    oldsperm = genotypes[2]

    # Access FORMAT fields
    ro_blood = v.format('RO')[1]
    ao_blood = v.format('AO')[1]

    ro_youngsperm = v.format('RO')[0]
    ao_youngsperm = v.format('AO')[0]

    ro_oldsperm = v.format('RO')[2]
    ao_oldsperm = v.format('AO')[2]

    if v.QUAL>20 and v.INFO.get("NS") > 1 and v.INFO.get("DP") > 10 and get_interesting(v, youngsperm, blood, oldsperm):

    # Print the FORMAT fields
        print(f"Blood - RO: {ro_blood}, AO: {ao_blood}")
        print(f"Young Sperm - RO: {ro_youngsperm}, AO: {ao_youngsperm}")
        print(f"Old Sperm - RO: {ro_oldsperm}, AO: {ao_oldsperm}")

