In [1]:
import pandas as pd
from cyvcf2 import VCF

In [2]:
# Read every record of the Laurel-6 unique-SNP VCF into a list up front so the
# cells below can loop over the variants more than once.
vcf = [record for record in VCF('../data/output/CellCut/vcf/GB115_Laurel-6_unique_snps.vcf')]



In [3]:
# Sample identifiers, in the order the cells below use to index the per-sample
# genotype and depth arrays.
# NOTE(review): this order is assumed to match the VCF's sample columns — confirm.
sample_names = [
    f"GB115_Laurel-{number}"
    for number in (16, 14, 13, 12, 11, 10, 8, 17, 2, 7, 1, 3, 5, 15, 6)
]

In [5]:
# Collect the variants at which Laurel-6 is HET or HOM_ALT and no other sample
# has the same genotype code.
filtered_vcf = []

for v in vcf:
    # Site-level filters: QUAL above 30 and INFO/NS equal to 10
    if not (v.QUAL > 30 and v.INFO.get("NS") == 10):
        continue

    genotypes = v.gt_types  # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3)

    # Split the per-sample calls into Laurel-6's genotype and everyone else's
    laurel_genotype = None
    other_genotypes = []
    for i in range(len(genotypes)):
        sample_name = sample_names[i]  # name taken from the column order
        if sample_name == "GB115_Laurel-6":
            laurel_genotype = genotypes[i]
        else:
            other_genotypes.append((sample_name, genotypes[i]))

    # Laurel-6 must be HET (1) or HOM_ALT (3), and there must be other samples
    if laurel_genotype not in (1, 3) or not other_genotypes:
        continue

    # Keep the variant only when every other sample's genotype differs
    if all(laurel_genotype != other_genotype for _, other_genotype in other_genotypes):
        filtered_vcf.append(v)

# filtered_vcf now holds only the variants where Laurel-6 shares its genotype with no other sample.


In [None]:
# Depth summary over the variants in `filtered_vcf`.
#
# Three sets of averages are printed: the mean of the per-variant means across
# samples, the mean over Laurel-6 observations, and the mean over observations
# from all other samples pooled together.
#
# cyvcf2 marks a sample that has no depth data at a site with -1 in
# gt_ref_depths / gt_alt_depths. Only sites with NS == 10 are used here, so a
# VCF with more than 10 samples has such entries at every site; they are
# skipped so the sentinel is not averaged in as if it were a read count.

# Sums of the per-variant mean depths
total_dp = 0
total_ad = 0
total_ro = 0
count_variants = 0

# Per-observation sums for Laurel-6
laurel_dp = 0
laurel_ad = 0
laurel_ro = 0
laurel_count = 0

# Per-observation sums for all other samples
other_dp = 0
other_ad = 0
other_ro = 0
other_count = 0

for v in filtered_vcf:
    # Site-level filters: QUAL above 20 and INFO/NS equal to 10
    if not (v.QUAL > 20 and v.INFO.get("NS") == 10):
        continue

    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample

    dp_values = []
    ad_values = []
    ro_values = []

    for i in range(len(ref_depths)):
        sample_name = sample_names[i]

        # Plain ints keep the running sums out of fixed-width numpy arithmetic
        ref_depth = int(ref_depths[i])
        alt_depth = int(alt_depths[i])

        # Negative depth means "no data for this sample": leave it out entirely
        if ref_depth < 0 or alt_depth < 0:
            continue

        # Total depth = reference reads + alternate reads
        dp = ref_depth + alt_depth
        # NOTE(review): AD is computed exactly like DP (ref + alt), so the two
        # reported averages are always equal — confirm whether AD was meant to
        # be the alternate depth only.
        ad = ref_depth + alt_depth
        # Reference allele depth
        ro = ref_depth

        dp_values.append(dp)
        ad_values.append(ad)
        ro_values.append(ro)

        if sample_name == "GB115_Laurel-6":
            laurel_dp += dp
            laurel_ad += ad
            laurel_ro += ro
            laurel_count += 1
        else:
            other_dp += dp
            other_ad += ad
            other_ro += ro
            other_count += 1

    # Per-variant means over the samples that had data (0 when none did)
    avg_dp = sum(dp_values) / len(dp_values) if dp_values else 0
    avg_ad = sum(ad_values) / len(ad_values) if ad_values else 0
    avg_ro = sum(ro_values) / len(ro_values) if ro_values else 0

    total_dp += avg_dp
    total_ad += avg_ad
    total_ro += avg_ro
    count_variants += 1

# Calculate and print the average for all variants with NS=10
if count_variants > 0:
    avg_dp_all = total_dp / count_variants
    avg_ad_all = total_ad / count_variants
    avg_ro_all = total_ro / count_variants

    print(f"\nAverage DP for variants with NS=10: {avg_dp_all:.2f}")
    print(f"Average AD for variants with NS=10: {avg_ad_all:.2f}")
    print(f"Average RO for variants with NS=10: {avg_ro_all:.2f}")
else:
    print("No variants with NS=10 found.")

# Calculate and print the averages for Laurel-6
if laurel_count > 0:
    avg_dp_laurel = laurel_dp / laurel_count
    avg_ad_laurel = laurel_ad / laurel_count
    avg_ro_laurel = laurel_ro / laurel_count

    print(f"\nAverage DP for Laurel-6: {avg_dp_laurel:.2f}")
    print(f"Average AD for Laurel-6: {avg_ad_laurel:.2f}")
    print(f"Average RO for Laurel-6: {avg_ro_laurel:.2f}")
else:
    print("No variants found for Laurel-6.")

# Calculate and print the averages for other samples
if other_count > 0:
    avg_dp_other = other_dp / other_count
    avg_ad_other = other_ad / other_count
    avg_ro_other = other_ro / other_count

    print(f"\nAverage DP for other samples: {avg_dp_other:.2f}")
    print(f"Average AD for other samples: {avg_ad_other:.2f}")
    print(f"Average RO for other samples: {avg_ro_other:.2f}")
else:
    print("No variants found for other samples.")


In [None]:
# Print a short report for Laurel-6 at every variant in `filtered_vcf`:
# its genotype, its ref/alt read depths, and the site depth left over once
# Laurel-6's reads are subtracted from INFO/DP.
for v in filtered_vcf:
    genotypes = v.gt_types        # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3)
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample
    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    depths = v.INFO.get('DP')     # Site-level depth from the INFO column

    for i in range(len(genotypes)):
        # Only the Laurel-6 column is reported
        if sample_names[i] != "GB115_Laurel-6":
            continue

        laurel_genotype = genotypes[i]
        laurel_ref_depth = ref_depths[i]
        laurel_alt_depth = alt_depths[i]
        laurel_depth = laurel_ref_depth + laurel_alt_depth
        # Site depth that is not accounted for by Laurel-6's own reads
        other_depth = depths - laurel_depth

        report = (
            f"Laurel-6 Genotype: {laurel_genotype}, {laurel_depth}\n"
            f"Position: {v.CHROM}, {v.start}, {v.end}\n"
            f"Genotypes (all samples): {genotypes}, {other_depth}\n"
            f"Quality: {v.QUAL:.2f}\n"
            f"Reference: {v.REF}\n"
            f"Alternate: {v.ALT}\n"
            f"Laurel-6 Ref Depth: {laurel_ref_depth}\n"
            f"Laurel-6 Alt Depth: {laurel_alt_depth}\n"
        )
        print(report)


In [25]:
# Parameters for filtering BASED ON EACH GENOTYPE
x = 2  # Laurel-6 reference depth must be greater than this
y = 2  # Laurel-6 alternate depth must be greater than this
z = 1  # Minimum reference depth for every other sample that has data

# Print the variants in `filtered_vcf` where Laurel-6 has enough ref and alt
# reads and every other sample with data has at least `z` reference reads.
#
# cyvcf2 marks a sample without depth data with -1. The variants here were
# selected with NS == 10, so a VCF with more than 10 samples has such entries
# at every site. Testing them against `z` would reject every variant, so
# samples without data are exempt from the minimum.
for v in filtered_vcf:
    genotypes = v.gt_types        # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3)
    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample

    laurel_genotype = None
    laurel_ref_depth = None
    laurel_alt_depth = None
    other_ref_depths = []      # reported as-is; -1 marks a sample without data
    other_samples_pass = True  # stays True while every sample with data meets `z`

    for i in range(len(genotypes)):
        sample_name = sample_names[i]  # name taken from the column order

        if sample_name == "GB115_Laurel-6":
            laurel_genotype = genotypes[i]
            laurel_ref_depth = ref_depths[i]
            laurel_alt_depth = alt_depths[i]
            continue

        other_ref_depths.append(ref_depths[i])

        # No data for this sample at this site: nothing to test
        if ref_depths[i] < 0:
            continue
        if ref_depths[i] < z:
            other_samples_pass = False

    # Apply filtering conditions
    if (
        laurel_ref_depth is not None
        and laurel_alt_depth is not None
        and laurel_ref_depth > x
        and laurel_alt_depth > y
        and other_samples_pass
    ):
        # Print details if the variant passes all conditions
        print(
            f"Variant Passed Filtering\n"
            f"Position: {v.CHROM}, {v.start}, {v.end}\n"
            f"Laurel-6 Genotype: {laurel_genotype}\n"
            f"Genotypes (all samples): {genotypes}\n"
            f"Laurel-6 Ref Depth: {laurel_ref_depth}, Laurel-6 Alt Depth: {laurel_alt_depth}\n"
            f"Other Samples Ref Depths: {other_ref_depths}\n"
            f"Quality: {v.QUAL:.2f}\n"
            f"Reference: {v.REF}, Alternate: {v.ALT}\n"
        )


In [None]:
# Parameters for filtering BASED ON SUMMING OTHER GENOTYPES
x = 0  # Laurel-6 reference depth must be greater than this
y = 3  # Laurel-6 alternate depth must be greater than this
z = 3  # Combined reference depth of all other samples must be greater than this
d = 5  # Combined alternate depth of all other samples must be less than this

# Print the variants in `filtered_vcf` where Laurel-6 has enough ref and alt
# reads while the other samples, pooled, have enough reference reads and few
# alternate reads.
#
# cyvcf2 marks a sample without depth data with -1. Adding those sentinels
# into the pooled sums would lower both totals by one per missing sample and
# so loosen the `d` cap; they are counted as zero reads instead.
for v in filtered_vcf:
    genotypes = v.gt_types        # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3)
    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample

    laurel_genotype = None
    laurel_ref_depth = None
    laurel_alt_depth = None
    other_ref_depth_sum = 0  # pooled reference reads of the other samples
    other_alt_depth_sum = 0  # pooled alternate reads of the other samples

    for i in range(len(genotypes)):
        sample_name = sample_names[i]  # name taken from the column order

        if sample_name == "GB115_Laurel-6":
            laurel_genotype = genotypes[i]
            laurel_ref_depth = ref_depths[i]
            laurel_alt_depth = alt_depths[i]
        else:
            # max(..., 0) turns the "no data" sentinel into zero reads
            other_ref_depth_sum += max(int(ref_depths[i]), 0)
            other_alt_depth_sum += max(int(alt_depths[i]), 0)

    # Apply filtering conditions
    if (
        laurel_ref_depth is not None
        and laurel_alt_depth is not None
        and laurel_ref_depth > x
        and laurel_alt_depth > y
        and other_ref_depth_sum > z
        and other_alt_depth_sum < d
    ):
        # Print details if the variant passes all conditions
        print(
            f"Variant Passed Filtering\n"
            f"Position: {v.CHROM}, {v.start}, {v.end}\n"
            f"Laurel-6 Genotype: {laurel_genotype}\n"
            f"Genotypes (all samples): {genotypes}\n"
            f"Laurel-6 Ref Depth: {laurel_ref_depth}, Laurel-6 Alt Depth: {laurel_alt_depth}\n"
            f"Total Ref Depth of Other Genotypes: {other_ref_depth_sum}\n"
            f"Total Alt Depth of Other Genotypes: {other_alt_depth_sum}\n"
            f"Quality: {v.QUAL:.2f}\n"
            f"Reference: {v.REF}, Alternate: {v.ALT}\n"
        )


In [7]:
import pandas as pd

# Load the tab-separated quality statistics table. The file's own first line is
# skipped and the five columns are named explicitly.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# '../data/...' path used for the VCF — consider making it relative.
input_file = "/uufs/chpc.utah.edu/common/HIPAA/u1264408/u1264408/Git/SEMIColon/data/output/CellCut/vcf/qual_stats.txt"
qual_stats_columns = [
    "Quality",
    "number_of_SNPs",
    "number_of_transitions",
    "number_of_transversions",
    "number_of_indels",
]
data = pd.read_csv(
    input_file,
    sep="\t",
    header=None,
    skiprows=1,
    names=qual_stats_columns,
)

# Keep only the rows that report more than five SNPs
has_enough_snps = data["number_of_SNPs"] > 5
filtered_data = data.loc[has_enough_snps]

In [None]:
# Parameters for filtering
m = 3    # Laurel-6's genotype must be seen in more than `m` samples ...
n = 6    # ... and in fewer than `n` samples (the count includes Laurel-6)
x = 4    # Laurel-6 reference depth must be greater than this
y = 10   # Laurel-6 alternate depth must be greater than this
z = 100  # Combined reference depth of all other samples must be greater than this

# Print variants where Laurel-6 is HET or HOM_ALT, shares that genotype with a
# small group of samples, is well covered itself, and the remaining samples
# have enough pooled reference reads.
#
# The group size is checked for Laurel-6's own genotype. Picking the first
# genotype whose count fits (HET before HOM_ALT) would drop a variant where
# Laurel-6 is HOM_ALT in a qualifying group whenever HET also qualifies.

# Column of Laurel-6 in the per-sample arrays
laurel_index = sample_names.index("GB115_Laurel-6")

for v in vcf:
    # Only process variants meeting basic filters on quality and sample count
    if not (v.QUAL > 100 and v.INFO.get("NS") == 10):
        continue

    genotypes = v.gt_types  # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3, UNKNOWN=2)

    # Count how many samples carry each genotype code
    genotype_counts = {0: 0, 1: 0, 3: 0, 2: 0}  # HOM_REF, HET, HOM_ALT, UNKNOWN
    for g in genotypes:
        if int(g) in genotype_counts:
            genotype_counts[int(g)] += 1

    # Laurel-6 must carry a non-reference, known genotype ...
    laurel_genotype = int(genotypes[laurel_index])
    if laurel_genotype not in (1, 3):
        continue
    # ... that appears in more than `m` but fewer than `n` samples
    if not (m < genotype_counts[laurel_genotype] < n):
        continue
    passing_genotype = laurel_genotype

    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample

    # Laurel-6 depth filters
    laurel_ref_depth = ref_depths[laurel_index]
    laurel_alt_depth = alt_depths[laurel_index]
    if laurel_ref_depth <= x or laurel_alt_depth <= y:
        continue

    # Pool the depths of every other sample. cyvcf2 marks a sample without
    # depth data with -1; max(..., 0) counts it as zero reads instead of
    # subtracting from the totals.
    other_ref_depth_sum = 0
    other_alt_depth_sum = 0
    for i in range(len(genotypes)):
        if i == laurel_index:
            continue
        other_ref_depth_sum += max(int(ref_depths[i]), 0)
        other_alt_depth_sum += max(int(alt_depths[i]), 0)

    # Ensure the total reference depth for other samples meets the threshold
    if other_ref_depth_sum <= z:
        continue

    # Print details of the variant that passed all conditions
    print(
        f"Variant Passed Filtering\n"
        f"Position: {v.CHROM}, {v.start}, {v.end}\n"
        f"Genotype: {passing_genotype} appears in {genotype_counts[passing_genotype]} samples\n"
        f"Laurel-6 Genotype: {laurel_genotype}\n"
        f"Genotypes (all samples): {genotypes}\n"
        f"Quality: {v.QUAL:.2f}\n"
        f"Reference: {v.REF}, Alternate: {v.ALT}\n"
        f"Laurel-6 Ref Depth: {laurel_ref_depth}, Laurel-6 Alt Depth: {laurel_alt_depth}\n"
        f"Total Ref Depth of Other Genotypes: {other_ref_depth_sum}\n"
        f"Total Alt Depth of Other Genotypes: {other_alt_depth_sum}\n"
    )


In [None]:
# Parameters for filtering
m = 3    # Laurel-6's genotype must be seen in more than `m` samples ...
n = 6    # ... and in fewer than `n` samples (the count includes Laurel-6)
x = 4    # Laurel-6 reference depth must be greater than this
y = 10   # Laurel-6 alternate depth must be greater than this
z = 100  # Combined reference depth of all other samples must be greater than this

# Same selection as the shared-genotype filter, followed by a per-sample
# genotype listing for each variant that passes.
#
# The group size is checked for Laurel-6's own genotype. Picking the first
# genotype whose count fits (HET before HOM_ALT) would drop a variant where
# Laurel-6 is HOM_ALT in a qualifying group whenever HET also qualifies.

# Column of Laurel-6 in the per-sample arrays
laurel_index = sample_names.index("GB115_Laurel-6")

for v in vcf:
    # Only process variants meeting basic filters on quality and sample count
    if not (v.QUAL > 100 and v.INFO.get("NS") == 10):
        continue

    genotypes = v.gt_types  # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3, UNKNOWN=2)

    # Count how many samples carry each genotype code
    genotype_counts = {0: 0, 1: 0, 3: 0, 2: 0}  # HOM_REF, HET, HOM_ALT, UNKNOWN
    for g in genotypes:
        if int(g) in genotype_counts:
            genotype_counts[int(g)] += 1

    # Laurel-6 must carry a non-reference, known genotype ...
    laurel_genotype = int(genotypes[laurel_index])
    if laurel_genotype not in (1, 3):
        continue
    # ... that appears in more than `m` but fewer than `n` samples
    if not (m < genotype_counts[laurel_genotype] < n):
        continue
    passing_genotype = laurel_genotype

    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample

    # Laurel-6 depth filters
    laurel_ref_depth = ref_depths[laurel_index]
    laurel_alt_depth = alt_depths[laurel_index]
    if laurel_ref_depth <= x or laurel_alt_depth <= y:
        continue

    # Pool the depths of every other sample. cyvcf2 marks a sample without
    # depth data with -1; max(..., 0) counts it as zero reads instead of
    # subtracting from the totals.
    other_ref_depth_sum = 0
    other_alt_depth_sum = 0
    for i in range(len(genotypes)):
        if i == laurel_index:
            continue
        other_ref_depth_sum += max(int(ref_depths[i]), 0)
        other_alt_depth_sum += max(int(alt_depths[i]), 0)

    # Ensure the total reference depth for other samples meets the threshold
    if other_ref_depth_sum <= z:
        continue

    # Print the variant details
    print(f"Variant Passed Filtering\n"
          f"Position: {v.CHROM}, {v.start}, {v.end}\n"
          f"Genotype: {passing_genotype} appears in {genotype_counts[passing_genotype]} samples\n"
          f"Laurel-6 Genotype: {laurel_genotype}\n"
          f"Genotypes (all samples): {genotypes}\n"
          f"Quality: {v.QUAL:.2f}\n"
          f"Reference: {v.REF}, Alternate: {v.ALT}\n"
          f"Laurel-6 Ref Depth: {laurel_ref_depth}, Laurel-6 Alt Depth: {laurel_alt_depth}\n"
          f"Total Ref Depth of Other Genotypes: {other_ref_depth_sum}\n"
          f"Total Alt Depth of Other Genotypes: {other_alt_depth_sum}\n")

    # Print genotype for each sample
    for i, sample_name in enumerate(sample_names):
        sample_genotype = genotypes[i]
        print(f"Sample {sample_name}: Genotype {sample_genotype}")


In [4]:
# Parameters for filtering
m = 3    # Laurel-6's genotype must be seen in more than `m` samples ...
n = 6    # ... and in fewer than `n` samples (the count includes Laurel-6)
x = 4    # Laurel-6 reference depth must be greater than this
y = 10   # Laurel-6 alternate depth must be greater than this
z = 100  # Combined reference depth of all other samples must be greater than this

# Shared-genotype selection restricted to variants where no sample has an
# UNKNOWN genotype.
#
# NOTE(review): `sample_names` lists 15 samples while the site filter requires
# NS == 10. If the samples without data are reported as UNKNOWN (2), the
# "no UNKNOWN genotype" rule and NS == 10 can never both hold and this cell
# prints nothing — confirm which of the two filters is intended.
#
# The group size is checked for Laurel-6's own genotype. Picking the first
# genotype whose count fits (HET before HOM_ALT) would drop a variant where
# Laurel-6 is HOM_ALT in a qualifying group whenever HET also qualifies.

# Column of Laurel-6 in the per-sample arrays
laurel_index = sample_names.index("GB115_Laurel-6")

for v in vcf:
    genotypes = v.gt_types  # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3, UNKNOWN=2)

    # Skip variants where any genotype is UNKNOWN (value 2)
    if 2 in genotypes:
        continue

    # Only process variants meeting basic filters on quality and sample count
    if not (v.QUAL > 100 and v.INFO.get("NS") == 10):
        continue

    # Count how many samples carry each genotype code
    genotype_counts = {0: 0, 1: 0, 3: 0}  # HOM_REF, HET, HOM_ALT
    for g in genotypes:
        if int(g) in genotype_counts:
            genotype_counts[int(g)] += 1

    # Laurel-6 must carry a non-reference genotype ...
    laurel_genotype = int(genotypes[laurel_index])
    if laurel_genotype not in (1, 3):
        continue
    # ... that appears in more than `m` but fewer than `n` samples
    if not (m < genotype_counts[laurel_genotype] < n):
        continue
    passing_genotype = laurel_genotype

    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample

    # Laurel-6 depth filters
    laurel_ref_depth = ref_depths[laurel_index]
    laurel_alt_depth = alt_depths[laurel_index]
    if laurel_ref_depth <= x or laurel_alt_depth <= y:
        continue

    # Pool the depths of every other sample. cyvcf2 marks a sample without
    # depth data with -1; max(..., 0) counts it as zero reads instead of
    # subtracting from the totals.
    other_ref_depth_sum = 0
    other_alt_depth_sum = 0
    for i in range(len(genotypes)):
        if i == laurel_index:
            continue
        other_ref_depth_sum += max(int(ref_depths[i]), 0)
        other_alt_depth_sum += max(int(alt_depths[i]), 0)

    # Ensure the total reference depth for other samples meets the threshold
    if other_ref_depth_sum <= z:
        continue

    # Print the variant details
    print(f"Variant Passed Filtering\n"
          f"Position: {v.CHROM}, {v.start}, {v.end}\n"
          f"Genotype: {passing_genotype} appears in {genotype_counts[passing_genotype]} samples\n"
          f"Laurel-6 Genotype: {laurel_genotype}\n"
          f"Genotypes (all samples): {genotypes}\n"
          f"Quality: {v.QUAL:.2f}\n")


In [None]:
# Parameters for filtering
m = 3    # Laurel-6's genotype must be seen in more than `m` samples ...
n = 6    # ... and in fewer than `n` samples (the count includes Laurel-6)
x = 4    # Laurel-6 reference depth must be greater than this
y = 10   # Laurel-6 alternate depth must be greater than this
z = 100  # Pooled reference depth of samples without the genotype must be greater than this
q = 2    # Pooled alternate depth of samples without the genotype must be less than this

# Shared-genotype selection with the other samples split into those that carry
# Laurel-6's genotype and those that do not. The group without the genotype
# must have enough reference reads and almost no alternate reads.
#
# The group size is checked for Laurel-6's own genotype. Picking the first
# genotype whose count fits (HET before HOM_ALT) would drop a variant where
# Laurel-6 is HOM_ALT in a qualifying group whenever HET also qualifies.

# Column of Laurel-6 in the per-sample arrays
laurel_index = sample_names.index("GB115_Laurel-6")

for v in vcf:
    # Only process variants meeting basic filters on quality and sample count
    if not (v.QUAL > 100 and v.INFO.get("NS") == 10):
        continue

    genotypes = v.gt_types  # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3, UNKNOWN=2)

    # Count how many samples carry each genotype code
    genotype_counts = {0: 0, 1: 0, 3: 0, 2: 0}  # HOM_REF, HET, HOM_ALT, UNKNOWN
    for g in genotypes:
        if int(g) in genotype_counts:
            genotype_counts[int(g)] += 1

    # Laurel-6 must carry a non-reference, known genotype ...
    laurel_genotype = int(genotypes[laurel_index])
    if laurel_genotype not in (1, 3):
        continue
    # ... that appears in more than `m` but fewer than `n` samples
    if not (m < genotype_counts[laurel_genotype] < n):
        continue
    passing_genotype = laurel_genotype

    alt_depths = v.gt_alt_depths  # Alternate allele depth per sample
    ref_depths = v.gt_ref_depths  # Reference allele depth per sample

    # Laurel-6 depth filters
    laurel_ref_depth = ref_depths[laurel_index]
    laurel_alt_depth = alt_depths[laurel_index]
    if laurel_ref_depth <= x or laurel_alt_depth <= y:
        continue

    # Pool depths separately for samples with and without the genotype.
    # cyvcf2 marks a sample without depth data with -1; max(..., 0) counts it
    # as zero reads. Left in, each missing sample would lower the "without"
    # alternate total by one and let real alternate reads slip under `q`.
    crypt_with_genotype_ref = 0
    crypt_with_genotype_alt = 0
    crypt_without_genotype_ref = 0
    crypt_without_genotype_alt = 0
    for i in range(len(genotypes)):
        if i == laurel_index:
            continue
        ref_depth = max(int(ref_depths[i]), 0)
        alt_depth = max(int(alt_depths[i]), 0)
        if genotypes[i] == passing_genotype:
            crypt_with_genotype_ref += ref_depth
            crypt_with_genotype_alt += alt_depth
        else:
            crypt_without_genotype_ref += ref_depth
            crypt_without_genotype_alt += alt_depth

    # Samples without the genotype need enough reference reads ...
    if crypt_without_genotype_ref <= z:
        continue
    # ... and fewer than `q` alternate reads in total
    if not (crypt_without_genotype_alt < q):
        continue

    # Print the variant details
    print(f"Variant Passed Filtering\n"
          f"Position: {v.CHROM}, {v.start}, {v.end}\n"
          f"Genotype: {passing_genotype} appears in {genotype_counts[passing_genotype]} samples\n"
          f"Laurel-6 Genotype: {laurel_genotype}\n"
          f"Genotypes (all samples): {genotypes}\n"
          f"Quality: {v.QUAL:.2f}\n"
          f"Reference: {v.REF}, Alternate: {v.ALT}\n"
          f"Laurel-6 Ref Depth: {laurel_ref_depth}, Laurel-6 Alt Depth: {laurel_alt_depth}\n"
          f"Crypt Samples With Genotype - Ref Depth: {crypt_with_genotype_ref}, Alt Depth: {crypt_with_genotype_alt}\n"
          f"Crypt Samples Without Genotype - Ref Depth: {crypt_without_genotype_ref}, Alt Depth: {crypt_without_genotype_alt}\n")

    # Print genotype for each sample
    for i, sample_name in enumerate(sample_names):
        sample_genotype = genotypes[i]
        print(f"Sample {sample_name}: Genotype {sample_genotype}")


In [None]:
import scipy.stats as stats

def calculate_min_coverage(mut_freq, required_reads, confidence, num_samples, max_coverage=29):
    """
    Calculate the minimum coverage needed per sample to detect a unique mutation
    in at least one sample with a given confidence.

    Reads supporting the mutation are modelled as Binomial(coverage, mut_freq)
    in each sample, and the samples are treated as independent.

    Parameters:
        mut_freq (float): Mutation frequency (e.g., 0.01 for 1%).
        required_reads (int): Minimum number of reads needed to detect the mutation.
        confidence (float): Desired confidence level (e.g., 0.95 for 95% confidence).
        num_samples (int): Number of samples from the same donor.
        max_coverage (int): Largest coverage depth to try (inclusive). The
            default of 29 keeps the original search range of 1..29.

    Returns:
        int or None: Minimum coverage depth required per sample, or None when
        no depth up to `max_coverage` reaches the requested confidence.
    """
    for coverage in range(1, max_coverage + 1):
        # Probability of seeing fewer than `required_reads` supporting reads
        # in one sample, i.e. P(X <= required_reads - 1)
        prob_miss_single = stats.binom.cdf(required_reads - 1, coverage, mut_freq)

        # Probability of missing the mutation in every sample
        prob_miss_all = prob_miss_single ** num_samples

        # Probability of detecting the mutation in at least one sample
        prob_detect = 1 - prob_miss_all

        if prob_detect >= confidence:
            return coverage
    return None  # No depth in the searched range satisfies the condition

# Scenario to evaluate
mutation_frequency = 0.1   # Frequency of the unique mutation (0.1 = 10%)
min_reads = 3              # Reads needed before the mutation is called
desired_confidence = 0.95  # Required probability of detection (95%)
num_samples = 5            # Samples taken from the same donor

# Smallest per-sample coverage that reaches the desired confidence
min_coverage = calculate_min_coverage(
    mutation_frequency,
    min_reads,
    desired_confidence,
    num_samples,
)

print(f"Minimum coverage required per sample: {min_coverage}")


In [4]:
# Samples that are expected to share a genotype no other sample has
laurel_samples = ["GB115_Laurel-2", "GB115_Laurel-1", "GB115_Laurel-3"]

# Positions of the samples of interest, and of everyone else, in the
# per-sample genotype array (looked up once rather than per variant)
laurel_positions = [sample_names.index(sample) for sample in laurel_samples]
other_positions = [
    position
    for position, sample_name in enumerate(sample_names)
    if sample_name not in laurel_samples
]

# Report every variant where the three samples agree on one genotype code and
# no other sample has that code.
for v in vcf:
    # Only process variants meeting basic filters on quality and sample count
    if not (v.QUAL > 100 and v.INFO.get("NS") == 10):
        continue

    genotypes = v.gt_types  # Genotype types (HOM_REF=0, HET=1, HOM_ALT=3, UNKNOWN=2)

    # Genotype of each sample of interest, keyed by sample name
    laurel_genotypes = {
        sample: genotypes[position]
        for sample, position in zip(laurel_samples, laurel_positions)
    }

    # All samples of interest must agree on a single genotype code
    if len(set(laurel_genotypes.values())) != 1:
        continue
    shared_genotype = list(laurel_genotypes.values())[0]

    # None of the remaining samples may carry that code
    other_genotypes = [genotypes[position] for position in other_positions]
    if any(g == shared_genotype for g in other_genotypes):
        continue

    print(f"Variant Passed Filtering\n"
          f"Position: {v.CHROM}, {v.start}, {v.end}\n"
          f"Shared Genotype Among Laurel Samples: {shared_genotype}\n"
          f"Genotypes of Laurel Samples: {laurel_genotypes}\n"
          f"Genotypes of Other Samples: {other_genotypes}\n"
          f"Quality: {v.QUAL:.2f}\n"
          f"Reference: {v.REF}, Alternate: {v.ALT}")


Variant Passed Filtering
Position: chr1, 2366020, 2366021
Shared Genotype Among Laurel Samples: 3
Genotypes of Laurel Samples: {'GB115_Laurel-2': np.int32(3), 'GB115_Laurel-1': np.int32(3), 'GB115_Laurel-3': np.int32(3)}
Genotypes of Other Samples: [np.int32(2), np.int32(2), np.int32(2), np.int32(1), np.int32(1), np.int32(1), np.int32(1), np.int32(2), np.int32(1), np.int32(1), np.int32(2), np.int32(1)]
Quality: 413.96
Reference: T, Alternate: ['C']
Variant Passed Filtering
Position: chr1, 2655346, 2655347
Shared Genotype Among Laurel Samples: 0
Genotypes of Laurel Samples: {'GB115_Laurel-2': np.int32(0), 'GB115_Laurel-1': np.int32(0), 'GB115_Laurel-3': np.int32(0)}
Genotypes of Other Samples: [np.int32(2), np.int32(2), np.int32(2), np.int32(1), np.int32(1), np.int32(1), np.int32(1), np.int32(2), np.int32(1), np.int32(1), np.int32(2), np.int32(1)]
Quality: 405.42
Reference: C, Alternate: ['T']
Variant Passed Filtering
Position: chr1, 2693112, 2693113
Shared Genotype Among Laurel Samples