In [1]:
import pandas as pd
from cyvcf2 import VCF

In [2]:
# Read every record of the Laurel-6 unique-SNP VCF into an in-memory list so
# that later cells can loop over the variants more than once.
vcf = [record for record in VCF('../data/output/CellCut/vcf/GB115_Laurel-6_unique_snps.vcf')]



In [3]:
# Sample IDs, listed in the order the rest of the notebook uses to index the
# per-sample arrays of each variant (position i <-> sample_names[i]).
# NOTE(review): the order is hard-coded here — confirm it matches the VCF header.
sample_names = [
    f"GB115_Laurel-{number}"
    for number in (16, 14, 13, 12, 11, 10, 8, 17, 2, 7, 1, 3, 5, 15, 6)
]

In [8]:
# Collect the variants where Laurel-6 carries a HET or HOM_ALT genotype that no
# other sample shares. Results go into `filtered_vcf`; `vcf` is left untouched.
filtered_vcf = []

for v in vcf:
    # Site-level filter: keep QUAL > 30 and NS == 10.
    # QUAL is None when the record's quality is missing ('.'); such records are
    # skipped instead of crashing on the comparison.
    if v.QUAL is None or not (v.QUAL > 30 and v.INFO.get("NS") == 10):
        continue

    # cyvcf2 genotype codes: HOM_REF=0, HET=1, UNKNOWN=2, HOM_ALT=3
    genotypes = v.gt_types

    # Split the per-sample genotypes into Laurel-6's and everyone else's.
    # Samples are identified by position: genotype i belongs to sample_names[i].
    laurel_genotype = None
    other_genotypes = []
    for i, genotype in enumerate(genotypes):
        if sample_names[i] == "GB115_Laurel-6":
            laurel_genotype = genotype
        else:
            other_genotypes.append(genotype)

    # Only HET (1) or HOM_ALT (3) calls in Laurel-6 are of interest, and there
    # must be at least one other sample to compare against.
    if laurel_genotype not in (1, 3) or not other_genotypes:
        continue

    # Keep the variant only if no other sample has the same genotype code.
    if all(laurel_genotype != other_genotype for other_genotype in other_genotypes):
        filtered_vcf.append(v)

# `filtered_vcf` now holds only the variants where Laurel-6's HET/HOM_ALT
# genotype is not shared with any other sample.


In [None]:
# Summarise read depth over the variants in `filtered_vcf`.
#   DP - per-sample total depth (reference reads + alternate reads)
#   AD - "allele depth"; computed here exactly like DP (ref + alt)
#   RO - per-sample reference-allele depth
# Three summaries are printed: the mean of the per-variant sample means, the
# mean over Laurel-6 only, and the mean over all other samples.

# Accumulators for the per-variant averages
total_dp = 0
total_ad = 0
total_ro = 0
count_variants = 0

# Accumulators for Laurel-6
laurel_dp = 0
laurel_ad = 0
laurel_ro = 0
laurel_count = 0

# Accumulators for every other sample
other_dp = 0
other_ad = 0
other_ro = 0
other_count = 0

for v in filtered_vcf:
    # QUAL is None when the record's quality is missing ('.'); skip those
    # records rather than failing on the comparison.
    if v.QUAL is None or not (v.QUAL > 20 and v.INFO.get("NS") == 10):
        continue

    dp_values = []
    ad_values = []
    ro_values = []

    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample

    for i, (ref_depth, alt_depth) in enumerate(zip(ref_depths, alt_depths)):
        # Convert numpy scalars to Python ints so the running sums cannot
        # overflow a fixed-width integer type.
        ref_depth = int(ref_depth)
        alt_depth = int(alt_depth)

        # A read count cannot be negative; cyvcf2 uses negative values (-1) to
        # mark samples without depth information. Leave those samples out so
        # the sentinel does not drag the averages down.
        if ref_depth < 0 or alt_depth < 0:
            continue

        sample_name = sample_names[i]

        # Total depth: reference reads plus alternate reads
        dp = ref_depth + alt_depth
        dp_values.append(dp)

        # NOTE(review): AD is computed identically to DP, so both always report
        # the same number — confirm whether alternate depth alone was intended.
        ad = ref_depth + alt_depth
        ad_values.append(ad)

        # Reference allele depth
        ro = ref_depth
        ro_values.append(ro)

        # Route the sample into the Laurel-6 or the "other samples" totals
        if sample_name == "GB115_Laurel-6":
            laurel_dp += dp
            laurel_ad += ad
            laurel_ro += ro
            laurel_count += 1
        else:
            other_dp += dp
            other_ad += ad
            other_ro += ro
            other_count += 1

    # A variant with no usable depth in any sample contributes nothing
    if not dp_values:
        continue

    # Mean across samples for this variant, added to the running totals
    total_dp += sum(dp_values) / len(dp_values)
    total_ad += sum(ad_values) / len(ad_values)
    total_ro += sum(ro_values) / len(ro_values)
    count_variants += 1

# Average of the per-variant means
if count_variants > 0:
    avg_dp_all = total_dp / count_variants
    avg_ad_all = total_ad / count_variants
    avg_ro_all = total_ro / count_variants

    print(f"\nAverage DP for variants with NS=10: {avg_dp_all:.2f}")
    print(f"Average AD for variants with NS=10: {avg_ad_all:.2f}")
    print(f"Average RO for variants with NS=10: {avg_ro_all:.2f}")
else:
    print("No variants with NS=10 found.")

# Averages for Laurel-6 (one observation per variant with usable depth)
if laurel_count > 0:
    avg_dp_laurel = laurel_dp / laurel_count
    avg_ad_laurel = laurel_ad / laurel_count
    avg_ro_laurel = laurel_ro / laurel_count

    print(f"\nAverage DP for Laurel-6: {avg_dp_laurel:.2f}")
    print(f"Average AD for Laurel-6: {avg_ad_laurel:.2f}")
    print(f"Average RO for Laurel-6: {avg_ro_laurel:.2f}")
else:
    print("No variants found for Laurel-6.")

# Averages over all other samples (one observation per sample per variant)
if other_count > 0:
    avg_dp_other = other_dp / other_count
    avg_ad_other = other_ad / other_count
    avg_ro_other = other_ro / other_count

    print(f"\nAverage DP for other samples: {avg_dp_other:.2f}")
    print(f"Average AD for other samples: {avg_ad_other:.2f}")
    print(f"Average RO for other samples: {avg_ro_other:.2f}")
else:
    print("No variants found for other samples.")


In [37]:
# Print one report per variant in `filtered_vcf` describing Laurel-6's call:
# its genotype code and depth, the variant position, the genotype codes of all
# samples, and the site depth left after subtracting Laurel-6's reads.
for v in filtered_vcf:
    genotypes = v.gt_types  # cyvcf2 codes: HOM_REF=0, HET=1, UNKNOWN=2, HOM_ALT=3

    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample
    # Depth taken from the INFO column; None when the record has no DP entry
    depths = v.INFO.get('DP')

    # Samples are identified by position: genotype i belongs to sample_names[i]
    for i, genotype in enumerate(genotypes):
        if sample_names[i] != "GB115_Laurel-6":
            continue

        laurel_genotype = genotype

        # Laurel-6 depth is the sum of its reference and alternate read counts
        laurel_ref_depth = ref_depths[i]
        laurel_alt_depth = alt_depths[i]
        laurel_depth = laurel_ref_depth + laurel_alt_depth

        # INFO DP minus Laurel-6's reads; left as None when INFO DP is absent
        # so the report is still printed instead of raising a TypeError.
        other_depth = depths - laurel_depth if depths is not None else None

        print(
            f"Laurel-6 Genotype: {laurel_genotype}, {laurel_depth}\n"
            f"Position: {v.CHROM}, {v.start}, {v.end}\n"
            f"Genotypes (all samples): {genotypes}, {other_depth}\n"
            f"Quality: {v.QUAL:.2f}\n"
            f"Reference: {v.REF}\n"
            f"Alternate: {v.ALT}\n"
            f"Laurel-6 Ref Depth: {laurel_ref_depth}\n"
            f"Laurel-6 Alt Depth: {laurel_alt_depth}\n"
        )


Laurel-6 Genotype: 3, 12
Position: chr1, 3104543, 3104544
Genotypes (all samples): [2 2 2 0 1 1 1 2 1 1 1 1 1 2 3], 74
Quality: 1363.99
Reference: G
Alternate: ['A']
Laurel-6 Ref Depth: 2
Laurel-6 Alt Depth: 10

Laurel-6 Genotype: 3, 1
Position: chr1, 4618768, 4618769
Genotypes (all samples): [2 2 2 1 1 1 1 2 1 1 0 1 1 2 3], 35
Quality: 457.67
Reference: C
Alternate: ['G']
Laurel-6 Ref Depth: 0
Laurel-6 Alt Depth: 1

Laurel-6 Genotype: 1, 17
Position: chr1, 5202170, 5202171
Genotypes (all samples): [2 2 2 3 3 3 3 2 3 3 3 3 3 2 1], 49
Quality: 2189.89
Reference: G
Alternate: ['A']
Laurel-6 Ref Depth: 2
Laurel-6 Alt Depth: 15

Laurel-6 Genotype: 3, 3
Position: chr1, 8007443, 8007444
Genotypes (all samples): [2 2 2 0 0 0 1 2 1 0 0 1 0 2 3], 23
Quality: 157.29
Reference: T
Alternate: ['A']
Laurel-6 Ref Depth: 0
Laurel-6 Alt Depth: 3

Laurel-6 Genotype: 3, 9
Position: chr1, 8133371, 8133372
Genotypes (all samples): [2 2 2 0 1 1 1 2 1 1 0 1 1 2 3], 49
Quality: 783.90
Reference: G
Alternate: 