In [None]:
import logging
import os
import shlex
import sys
import zipfile
from collections import Counter
from pathlib import Path

import urllib3
from dotenv import load_dotenv
from urllib3.util import Retry

Load your environment variables

In [None]:
def find_comp_gen_dir():
    """Locate the ``computational_genetic_genealogy`` directory.

    Starting at the current working directory, every directory on the path up
    to and including the filesystem root is checked for a child directory
    named ``computational_genetic_genealogy``.

    Returns:
        pathlib.Path: Path of the first match found while walking upwards.

    Raises:
        FileNotFoundError: If no directory on the path contains the target.
    """
    start = Path.cwd()

    # ``Path.parents`` ends at the filesystem root, so the root itself is
    # searched too (a ``while current != current.parent`` loop stops one
    # level short and never looks there).
    for ancestor in (start, *start.parents):
        target = ancestor / 'computational_genetic_genealogy'
        if target.is_dir():
            return target

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load the ``.env`` file that lives in the computational_genetic_genealogy directory.

    Returns:
        The path of the ``.env`` file that was loaded, or ``None`` when the
        project directory or the ``.env`` file could not be found. A message
        is printed in every case.
    """
    try:
        project_root = find_comp_gen_dir()
        dotenv_file = project_root / '.env'

        if dotenv_file.exists():
            # override=True: values from the file replace variables already set
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        print(f"Warning: No .env file found in {project_root}")
        return None

    except FileNotFoundError as missing:
        print(f"Error: {missing}")
        return None

# Load the project's .env file; env_path is None when it could not be found
env_path = load_env_file()

Tell bcftools where its plugins are installed by setting the `BCFTOOLS_PLUGINS` environment variable.

In [None]:
# Set BCFTOOLS_PLUGINS to a directory under the user's home ("~" is expanded by expanduser)
os.environ['BCFTOOLS_PLUGINS'] = os.path.expanduser('~/.local/libexec/bcftools')

In [None]:
# Project directories are read from the environment; each one is None when
# its variable is not set.
working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# Fail early with an actionable message instead of the opaque TypeError that
# os.chdir(None) raises when the variable is missing.
if working_directory is None:
    raise EnvironmentError(
        "PROJECT_WORKING_DIR is not set; check that the .env file was found and loaded."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a stdout handler to the root logger.

    The root logger itself is set to DEBUG so that each handler's own level
    decides what is emitted. Existing handlers are left in place.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Level name for the file handler
            (case-insensitive; unknown names fall back to INFO).
        console_debug_level (str): Level name for the console handler
            (case-insensitive; unknown names fall back to INFO).
    """
    message_format = '%(asctime)s - %(levelname)s - %(message)s'

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Resolve both level names before any handler (and thus the log file) is created
    file_level, console_level = (
        getattr(logging, level_name.upper(), logging.INFO)
        for level_name in (log_file_debug_level, console_debug_level)
    )

    # File handler first, console handler second
    handlers_with_levels = (
        (logging.FileHandler(log_filename), file_level),
        (logging.StreamHandler(sys.stdout), console_level),
    )
    for handler, level in handlers_with_levels:
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter(message_format))
        root.addHandler(handler)
    
def clear_logger():
    """Detach and close every handler attached to the root logger.

    Closing matters for file handlers: removing a handler without closing it
    leaves its log file open, so repeatedly reconfiguring logging in the same
    kernel would leak one file descriptor per call.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [None]:
log_filename = os.path.join(results_directory, "lab4_log.txt")
print(f"The Lab 4 log file is located at {log_filename}.")

# Ensure the results directory exists; exist_ok avoids the check-then-create race
os.makedirs(results_directory, exist_ok=True)

# Create the log file if it is missing. Append mode never truncates an
# existing log, so no separate existence check is needed.
with open(log_filename, 'a'):
    pass

In [None]:
# Remove any handlers left over from a previous run of this cell, then attach
# fresh file and console handlers, both at INFO level.
clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

# Genetic Maps

Genetic maps, also known as recombination maps, are essential tools that illustrate the relative positions of genetic markers (such as single nucleotide polymorphisms, or SNPs) along a chromosome. Unlike physical maps that measure distances in base pairs, genetic maps measure distances in centiMorgans (cM), where one centiMorgan represents a 1% probability of recombination between markers during meiosis.

## Key Components of Genetic Maps

- **Markers:**  
  Identifiable DNA sequences used as reference points on the genome.

- **Recombination Frequency:**  
  The probability of a recombination event occurring between markers, which informs the genetic distances.

- **Map Distance:**  
  Expressed in centiMorgans (cM), reflecting the likelihood of recombination rather than the physical distance.

## BEAGLE's Genetic Map

BEAGLE is a widely used software package for phasing, genotype imputation, and identity-by-descent (IBD) analysis. Its performance is closely tied to the use of high-resolution genetic maps. Here are some distinctive features of BEAGLE's genetic map:

- **High Marker Density:**  
  The genetic maps provided with BEAGLE include a dense array of markers. This density allows for the precise capture of fine-scale recombination events, which in turn improves the accuracy of haplotype phasing and genotype imputation.

- **Species and Population Specificity:**  
  The maps are often developed from extensive pedigree or population studies. For human genetic studies, they are constructed based on large-scale recombination data, ensuring relevance to the population under study.

- **Integration with Statistical Models:**  
  BEAGLE utilizes these maps within its statistical algorithms to model recombination events effectively. This integration is crucial for accurately inferring missing genotypes and detecting IBD segments.

- **Enhanced Analysis Accuracy:**  
  The detailed recombination information in BEAGLE's genetic maps allows for better adjustment for linkage disequilibrium and recombination rates, ultimately leading to more robust downstream genetic analyses.

## Benefits of Using BEAGLE's Genetic Map

- **Improved Phasing Accuracy:**  
  The high-resolution data facilitates precise haplotype reconstruction, reducing errors in phase determination.

- **Robust Genotype Imputation:**  
  Detailed recombination rate data enhances the accuracy of imputing missing genotypes, ensuring more reliable datasets.

- **Streamlined Analysis Workflow:**  
  The genetic map is specifically tailored to integrate seamlessly with BEAGLE’s algorithms, thereby optimizing the overall analysis process.

## References

1. Browning, B. L., & Browning, S. R. (2007). *Rapid and Accurate Haplotype Phasing and Missing-Data Inference for Whole-Genome Association Studies by Use of Localized Haplotype Clustering*. [American Journal of Human Genetics](https://www.cell.com/AJHG/fulltext/S0002-9297(07)63882-8)
2. Browning, B. L., Zhou, Y., & Browning, S. R. (2018). *A One-Penny Imputed Genome from Next-Generation Reference Panels*. [American Journal of Human Genetics](https://pubmed.ncbi.nlm.nih.gov/30100085/)
3. [BEAGLE Documentation](https://faculty.washington.edu/browning/beagle/beagle.html)
4. [NHGRI Glossary: Genetic Map](https://www.genome.gov/genetics-glossary/Genetic-Map)
5. Li, Y., Willer, C., Sanna, S., & Abecasis, G. (2009). *Genotype Imputation*. [Annual Review of Genomics and Human Genetics](https://www.annualreviews.org/content/journals/10.1146/annurev.genom.9.081307.164242)

### Download Genetic Maps (Beagle's plink version)

Define download helper functions

In [None]:
def create_session_with_retries():
    """Build a urllib3 ``PoolManager`` whose requests are retried automatically.

    The retry policy allows up to 5 retries with a backoff factor of 0.5 and
    also retries responses with status 500, 502, 503 or 504.

    Returns:
        urllib3.PoolManager: Pool manager configured with the retry policy.
    """
    return urllib3.PoolManager(
        retries=Retry(
            total=5,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
        )
    )

# Module-level pool manager used by download_with_progress below
session = create_session_with_retries()

def download_with_progress(url, output_path):
    """Stream ``url`` to ``output_path``, logging progress along the way.

    The body is read in 8 KiB blocks. When the server sends a
    ``content-length`` header, progress is logged roughly every 2% of the
    total size; without it the file is downloaded silently.

    Args:
        url (str): Address to fetch with an HTTP GET.
        output_path (str): File the response body is written to (overwritten).

    Raises:
        Exception: If the response status is not 200.
    """
    response = session.request('GET', url, preload_content=False)

    # preload_content=False keeps the connection checked out until it is
    # released, so release it on every exit path (bad status, write error, ...).
    try:
        if response.status != 200:
            raise Exception(f"HTTP error occurred: {response.status} {response.reason}")

        total_size = int(response.headers.get('content-length', 0))
        block_size = 8192
        progress_increment = max(1, total_size // 50) if total_size > 0 else block_size

        with open(output_path, 'wb') as f:
            downloaded = 0
            last_print = 0
            while True:
                chunk = response.read(block_size)
                if not chunk:
                    break

                f.write(chunk)
                downloaded += len(chunk)
                # Only log once at least one increment has arrived since the last message
                if total_size > 0 and downloaded - last_print >= progress_increment:
                    last_print = downloaded
                    progress = (downloaded / total_size) * 100
                    logging.info(f"Download progress: {progress:.1f}%")
    finally:
        response.release_conn()

Download the genetic map files from Beagle

In [None]:
def download_from_beagle(assembly, output_directory):
    """Download the Beagle PLINK-format genetic maps for one genome assembly.

    The ZIP archive for ``assembly`` is downloaded into ``output_directory``,
    integrity-checked, extracted in place and then deleted. On any failure
    the error is logged, a partially downloaded archive is removed, and the
    exception is re-raised.

    Args:
        assembly (str): One of "GRCh36", "GRCh37" or "GRCh38".
        output_directory (str): Directory receiving the extracted map files.

    Raises:
        ValueError: If ``assembly`` is not a supported assembly name.
        Exception: If the download fails or the archive is not a valid ZIP.
    """
    archives_by_assembly = {
        build: f"plink.{build}.map.zip" for build in ("GRCh36", "GRCh37", "GRCh38")
    }

    archive_name = archives_by_assembly.get(assembly)
    if archive_name is None:
        raise ValueError(f"Unsupported assembly '{assembly}'. Must be one of {list(archives_by_assembly)}.")

    url = f"https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/{archive_name}"
    archive_path = os.path.join(output_directory, archive_name)

    try:
        logging.info(f"Downloading Beagle map from {url}")
        download_with_progress(url, archive_path)

        try:
            with zipfile.ZipFile(archive_path, 'r') as archive:
                # testzip() returns the name of the first corrupt member, or None
                first_bad_member = archive.testzip()
                if first_bad_member is not None:
                    raise zipfile.BadZipFile(f"Corrupted file found in ZIP: {first_bad_member}")

                logging.info(f"Extracting files to {output_directory}")
                archive.extractall(output_directory)

                members = archive.namelist()
                logging.info(f"Extracted {len(members)} files: {', '.join(members)}")

        except zipfile.BadZipFile as bad_zip:
            raise Exception(f"Invalid ZIP file downloaded: {str(bad_zip)}")

        # The archive is no longer needed once its contents are extracted
        os.remove(archive_path)
        logging.info("ZIP file cleaned up")

    except Exception as failure:
        logging.error(f"Error processing Beagle files: {str(failure)}")
        if os.path.exists(archive_path):
            os.remove(archive_path)
        raise

In [None]:
# Output layout: <references>/genetic_maps/beagle_genetic_maps
genetic_maps_directory = os.path.join(references_directory, "genetic_maps")
beagle_genetic_maps = os.path.join(genetic_maps_directory, "beagle_genetic_maps")

# Create the parent first, then the Beagle-specific sub-directory
for _directory in (genetic_maps_directory, beagle_genetic_maps):
    os.makedirs(_directory, exist_ok=True)

# Fetch the maps for the chosen assembly straight into the Beagle directory
assembly = "GRCh38"
output_directory = beagle_genetic_maps
download_from_beagle(assembly, output_directory)

#### New Resource

Take a look at your `genetic_maps` directory. You should see the `beagle_genetic_maps` directory. Within `beagle_genetic_maps`, you should see your genetic map files, one for each chromosome. The naming convention is `plink.chr{chromosome_number}.GRCh38.map`.

# VCF Quality Control and Processing Pipeline

This script implements a comprehensive quality control (QC) and processing pipeline for merged VCF files, designed specifically for downstream genetic analyses (e.g., genetic genealogy). The pipeline integrates several tools (e.g., PLINK2, bcftools, Beagle) to perform quality control, filtering, and conversion of VCF files into other formats.

## Explore the Code Chunks

#### Validate the Merged VCF file as input

In [None]:
import subprocess

def _parse_count(counts_output, label):
    """Return the integer that follows ``label`` in the ``counts`` plugin output, or 0 if absent."""
    value = 0
    for line in counts_output.splitlines():
        if line.startswith(label):
            parts = line.split(":")
            if len(parts) == 2:
                value = int(parts[1].strip())
    return value


def validate_merged_vcf(vcf_path):
    """Validate a merged VCF with bcftools and summarise its contents.

    Runs the bcftools ``counts`` plugin, lists the chromosomes declared in the
    ``##contig`` header lines and those present in the CHROM column, and lists
    the sample IDs. Problems are logged as errors rather than raised; only a
    failing bcftools command raises.

    Args:
        vcf_path (str): Path to the VCF file to inspect.

    Returns:
        tuple: ``(num_samples, num_snps, chromosomes_field, sample_ids)`` where
        the counts are ints (0 when not reported), ``chromosomes_field`` is the
        sorted unique list of CHROM values and ``sample_ids`` is a list of str.

    Raises:
        subprocess.CalledProcessError: If a bcftools invocation exits non-zero.
    """
    cmd_counts = ["bcftools", "plugin", "counts", vcf_path]
    result_counts = subprocess.run(cmd_counts, capture_output=True, text=True, check=True)
    logging.info(f"Plugin 'counts' validation output for {vcf_path}:\n{result_counts.stdout}")
    if result_counts.stderr:
        logging.info(f"Plugin 'counts' validation errors:\n{result_counts.stderr}")

    num_samples = _parse_count(result_counts.stdout, "Number of samples:")
    if not num_samples:
        logging.error(f"No sample count found in VCF file: {vcf_path}")

    num_snps = _parse_count(result_counts.stdout, "Number of SNPs:")
    if not num_snps:
        logging.error(f"No SNP count found in VCF file: {vcf_path}")

    # The path is spliced into shell pipelines below, so quote it to keep
    # spaces and shell metacharacters from being interpreted by the shell.
    quoted_vcf_path = shlex.quote(vcf_path)

    logging.info("Extracting list of chromosomes from the VCF header.")
    cmd_chrom_contig = f"bcftools view -h {quoted_vcf_path} | grep '^##contig' | cut -d'=' -f3 | cut -d',' -f1"
    result_chrom_contig = subprocess.run(cmd_chrom_contig, shell=True, capture_output=True, text=True, check=True)
    chromosomes_contig = result_chrom_contig.stdout.splitlines()
    if not chromosomes_contig:
        logging.error(f"No chromosomes found in VCF file: {vcf_path}")
    else:
        logging.debug(f"Chromosomes found in VCF file header: {', '.join(chromosomes_contig)}")

    logging.info("Extracting a list of chromosomes from the CHROM column..")
    cmd_chrom_field = f"bcftools query -f '%CHROM\n' {quoted_vcf_path} | sort -u"
    result_chrom_field = subprocess.run(cmd_chrom_field, shell=True, capture_output=True, text=True, check=True)
    chromosomes_field = result_chrom_field.stdout.splitlines()
    if not chromosomes_field:
        logging.error(f"No chromosomes found in VCF file in the CHROM field: {vcf_path}")
    else:
        logging.debug(f"Chromosomes found in VCF file in the CHROM field: {', '.join(chromosomes_field)}")

    # Report (but do not fail on) any difference between the two lists
    if chromosomes_contig != chromosomes_field:
        logging.error("Mismatch between chromosomes in contig and field headers.")
        logging.error(f"Contig chromosomes: {chromosomes_contig}")
        logging.error(f"Field chromosomes: {chromosomes_field}")

    logging.info("Extracting sample IDs from the VCF file.")
    cmd_sample_list = ["bcftools", "query", "-l", vcf_path]
    result_sample_list = subprocess.run(cmd_sample_list, capture_output=True, text=True, check=True)
    sample_ids = result_sample_list.stdout.splitlines()

    if not sample_ids:
        logging.error(f"No sample IDs found in VCF file: {vcf_path}")
    else:
        logging.debug(f"Sample IDs found in VCF file: {', '.join(sample_ids)}")

    return num_samples, num_snps, chromosomes_field, sample_ids

In [None]:
# VCF created in Lab3 Get Raw DNA Profile
vcf_path = f"{data_directory}/class_data/merged_opensnps_data.vcf.gz"
print(vcf_path)
# sample_ids is reused further down when the sex files are written
num_samples, num_snps, chromosomes, sample_ids = validate_merged_vcf(vcf_path)
print(num_samples, num_snps, chromosomes, sample_ids)

**Note:** The ERROR reporting a mismatch between the Contig chromosomes and the Field chromosomes is okay for now. Try to work out why the two lists differ.

#### Prepare the Supplemental Data

In [None]:
def parse_sex_determination(determined_sex_file, failed_sex):
    """Build a user-ID -> sex-code mapping from the sex determination files.

    Both files are read as tab-separated text with the user ID in the first
    column; blank lines are skipped.

    Args:
        determined_sex_file (str): File of ``user_id<TAB>sex`` lines. ``Male``
            is coded "1"; every other value is coded "2".
        failed_sex (str): File listing users whose sex could not be
            determined. Only the first column is used, so lines with an empty
            or missing second column are accepted. These users are coded "0"
            and override any entry from ``determined_sex_file``.

    Returns:
        dict: Maps user ID (str) to "0" (unknown), "1" (male) or "2" (female).

    Raises:
        ValueError: If a non-blank line of ``determined_sex_file`` does not
            contain exactly two tab-separated fields.
    """
    sex_mapping = {}

    with open(determined_sex_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:  # Skip empty lines
                continue

            user_id, sex = line.split("\t")
            # NOTE(review): any value other than exactly "Male" is coded as
            # female - confirm the file only ever contains Male/Female.
            sex_mapping[user_id] = "1" if sex == "Male" else "2"

    with open(failed_sex, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Only the ID is needed here. Taking the first field keeps
            # ID-only lines (or "id<TAB>" lines, which strip() reduces to the
            # bare ID) from raising on tuple unpacking.
            user_id = line.split("\t", 1)[0]
            sex_mapping[user_id] = "0"  # Unknown sex

    # Count occurrences of each sex code
    counts = Counter(sex_mapping.values())

    logging.info(f"Count of SEX=0 (Unknown): {counts['0']}")
    logging.info(f"Count of SEX=1 (Male): {counts['1']}")
    logging.info(f"Count of SEX=2 (Female): {counts['2']}")

    return sex_mapping

def write_sex_files(sex_mapping, sample_ids, psam_file_all, psam_file_Y, sex_update_file):
    """Write the two .psam files and the sex update file, in ``sample_ids`` order.

    Args:
        sex_mapping (dict): User ID -> sex code ("0", "1" or "2").
        sample_ids (iterable): Sample IDs giving the output order; IDs missing
            from ``sex_mapping`` are treated as "0" (unknown).
        psam_file_all (str): Output .psam listing every sample of known sex.
        psam_file_Y (str): Output .psam listing samples coded "1" only.
        sex_update_file (str): Output ``#IID<TAB>SEX`` file listing every
            sample of known sex.
    """
    # A dict keeps first-seen order and collapses repeated sample IDs
    code_by_sample = {}
    for sample in sample_ids:
        code_by_sample[sample] = sex_mapping.get(sample, "0")

    known_sex = [(sample, code) for sample, code in code_by_sample.items() if code != "0"]
    males_only = [(sample, code) for sample, code in code_by_sample.items() if code == "1"]

    # Standard .psam: every sample whose sex is known
    with open(psam_file_all, 'w') as out:
        out.write("#FID\tIID\tSEX\n")
        out.writelines(f"{sample}\t{sample}\t{code}\n" for sample, code in known_sex)

    # Y-chromosome .psam: males only
    with open(psam_file_Y, 'w') as out:
        out.write("#FID\tIID\tSEX\n")
        out.writelines(f"{sample}\t{sample}\t{code}\n" for sample, code in males_only)

    # Sex update file: same samples as the standard .psam, without the FID column
    with open(sex_update_file, 'w') as out:
        out.write("#IID\tSEX\n")
        out.writelines(f"{sample}\t{code}\n" for sample, code in known_sex)

In [None]:
# Input files with the per-user sex calls
determined_sex_file = f"{data_directory}/class_data/determined_sex.txt"
failed_sex = f"{data_directory}/class_data/failed_sex.txt"

sex_mapping = parse_sex_determination(determined_sex_file, failed_sex)
# Output files are written next to the input, sharing its name without the extension
base_name = os.path.splitext(determined_sex_file)[0]
psam_file_all = f"{base_name}_all.psam"
psam_file_Y = f"{base_name}_Y.psam"
sex_update_file = f"{base_name}_update_sex.txt"
# sample_ids comes from the validate_merged_vcf cell above
write_sex_files(sex_mapping, sample_ids, psam_file_all, psam_file_Y, sex_update_file)

**Quality Control Filtering:**  
   Applies a series of default QC filters to the VCF file:
   - **Autosomal Filtering:** Only autosomal SNPs are retained (excludes X and Y chromosomes to simplify analysis).
   - **Duplicate Removal:** SNPs that share a duplicated variant ID are all removed (`--rm-dup exclude-all`), so no copy of a duplicated SNP is kept.
   - **VCF Half-Call Handling:** Half-calls are treated as missing.
   - **SNP Filtering:** Keeps only SNPs with nucleotide calls in {A, C, G, T} (ignoring case).
   - **Biallelic SNPs:** Filters for SNPs with exactly two alleles.
   - **Genotype Missingness (`--geno`):** Excludes SNPs with a missingness rate exceeding the specified threshold.
   - **Minor Allele Frequency (`--maf`):** Excludes SNPs with a frequency lower than the specified threshold.

**Data Conversion and Filtering:**  
   - **Step 1:** Converts the VCF file into PLINK format using `plink2` with QC parameters.
   - **Step 2:** Filters the PLINK data by genotype missingness and minor allele frequency.
   - **Step 3:** Splits the dataset by chromosome and exports each chromosome’s data as a VCF file. 

#### Quality Control Step 1

In [None]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# Step 1: import the merged VCF into PLINK2 format and apply the structural filters.
#
# - Autosomal filtering (--autosome): only autosomal variants are retained.
# - VCF half-call handling (--vcf-half-call missing): half-calls are treated as missing.
# - SNP filtering (--snps-only just-acgt): only SNPs with alleles in {A, C, G, T} are kept.
# - Duplicate removal (--rm-dup exclude-all): every variant whose ID is duplicated is dropped.
# - Biallelic SNPs (--min-alleles 2 --max-alleles 2): only variants with exactly two alleles are retained.
#
# Genotype missingness (--geno) and minor allele frequency (--maf) are applied in step 2.

# Change to the utils directory if necessary
cd "${utils_directory}"

# Paths are quoted so that directories containing spaces are passed as single arguments
plink2 --vcf "${data_directory}/class_data/merged_opensnps_data.vcf.gz" \
  --vcf-half-call missing \
  --autosome \
  --snps-only just-acgt \
  --rm-dup exclude-all \
  --min-alleles 2 \
  --max-alleles 2 \
  --make-pgen \
  --out "${results_directory}/merged_opensnps_autosomes_step1"


#### Quality Control Step 2

In [None]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# Step 2 thresholds: variants with a missingness rate above ${geno} or a
# minor allele frequency below ${maf} are excluded.
geno=0.05
maf=0.05

# Change to the utils directory if necessary
cd "${utils_directory}"

echo "Filtering by genotype missingness (geno=${geno}) and minor allele frequency (maf=${maf})..."
# Paths are quoted so that directories containing spaces are passed as single arguments
plink2 --pfile "${results_directory}/merged_opensnps_autosomes_step1" \
  --geno "${geno}" \
  --maf "${maf}" \
  --sort-vars \
  --make-pgen \
  --out "${results_directory}/merged_opensnps_autosomes_step2"

#### Quality Control Step 3

In [None]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

data_directory="$1"
utils_directory="$2"
results_directory="$3"
sample_file="merged_opensnps"

# The input prefix is the output from step 2
input_prefix="${results_directory}/merged_opensnps_autosomes_step2"

echo "Splitting by chromosome and exporting as VCF..."

# Loop over autosomal chromosomes 1 through 22
for chromosome in {1..22}; do
    echo "Processing chromosome ${chromosome}..."

    # Define the output prefix for the current chromosome
    output_prefix="${results_directory}/${sample_file}_qcstart_chr${chromosome}"

    # Export the chromosome-specific data as a VCF using plink2.
    # If the export fails there is no VCF for bcftools to read, so skip
    # the rest of this chromosome.
    if ! plink2 --pfile "$input_prefix" \
                --chr "${chromosome}" \
                --export vcf \
                --out "$output_prefix"; then
        echo "plink2 export failed for chromosome ${chromosome}. Skipping."
        continue
    fi

    # Define the input and output for bcftools processing
    bcftools_input="${output_prefix}.vcf"
    bcftools_output="${results_directory}/${sample_file}_qcfinished_chr${chromosome}.vcf.gz"

    echo "Filtering for biallelic variants on chromosome ${chromosome}..."
    # Filter for biallelic variants and compress the VCF using bcftools
    if ! bcftools view -m2 -M2 -Oz -o "$bcftools_output" "$bcftools_input"; then
        echo "bcftools filtering failed for chromosome ${chromosome}. Skipping."
        continue
    fi

    echo "Indexing the filtered VCF for chromosome ${chromosome}..."
    # Index the compressed VCF file
    bcftools index "$bcftools_output"
done


**Phasing:**  
   - After QC and filtering, the data are phased with Beagle. In this notebook the phasing runs inline in the next `%%bash` cell rather than through an external shell script such as `phase_chromosomes.sh`.  
   - For each chromosome, the cell derives the input file from the processed VCF prefix and passes it to the Beagle JAR together with the genetic map and, when one is available, a reference panel.

### Phase the data

In [None]:
%%bash -s "$data_directory" "$references_directory" "$results_directory" "$utils_directory"

data_directory="$1"
references_directory="$2"
results_directory="$3"
utils_directory="$4"
sample_file="merged_opensnps"
input_prefix="${sample_file}_qcfinished"
phased_directory="${results_directory}/phased_samples"
beagle="${utils_directory}/beagle.17Dec24.224.jar"

# Create the phased directory if it does not exist
mkdir -p "$phased_directory"

# Phase chromosomes using Beagle
for CHR in {1..22}; do
    echo "Processing chromosome $CHR"

    INPUT_VCF="${results_directory}/${input_prefix}_chr${CHR}.vcf.gz"
    REF_VCF="${references_directory}/onethousandgenomes_genotype/onethousandgenomes_genotyped_phased.chr${CHR}.vcf.gz"
    MAP_FILE="${references_directory}/genetic_maps/beagle_genetic_maps/plink.chr${CHR}.GRCh38.map"
    OUTPUT_PREFIX="${phased_directory}/merged_opensnps_phased_chr${CHR}"
    PHASED_VCF="${OUTPUT_PREFIX}.vcf.gz"

    # Check if input VCF exists
    if [ ! -f "$INPUT_VCF" ]; then
        echo "Input VCF file not found for chromosome $CHR. Skipping."
        echo "$INPUT_VCF"
        continue
    fi

    # The genetic map is passed to Beagle in both modes, so check it up front
    if [ ! -f "$MAP_FILE" ]; then
        echo "Genetic map file not found for chromosome $CHR. Skipping."
        echo "$MAP_FILE"
        continue
    fi

    # Arguments common to both modes; the reference panel is added only when present
    beagle_args=(gt="$INPUT_VCF" map="$MAP_FILE" out="$OUTPUT_PREFIX")
    if [ -f "$REF_VCF" ]; then
        beagle_args+=(ref="$REF_VCF")
    else
        echo "Note: The reference file does not exist; the file is phased based on no reference panel."
    fi

    # Test Beagle's exit status directly; the JAR path is quoted so a utils
    # directory containing spaces still works.
    if ! java -jar "${beagle}" "${beagle_args[@]}"; then
        echo "Beagle failed for chromosome $CHR. Skipping."
        continue
    fi

    if [ ! -f "$PHASED_VCF" ]; then
        echo "Phasing failed for chromosome $CHR. Skipping."
        continue
    fi

    # Sort and index the phased VCF
    echo "Sorting and indexing phased VCF for chromosome $CHR"
    tabix -p vcf "$PHASED_VCF"
    SORTED_VCF="${phased_directory}/merged_opensnps_phased_chr${CHR}_sorted.vcf.gz"
    bcftools sort -Oz -o "$SORTED_VCF" "$PHASED_VCF"
    tabix -p vcf "$SORTED_VCF"

    # Replace original VCF with sorted version
    mv "$SORTED_VCF" "$PHASED_VCF"
    mv "${SORTED_VCF}.tbi" "${PHASED_VCF}.tbi"
done

### Let's generate some stats on our files to manually inspect them.

In [None]:
%%bash -s "$data_directory" "$results_directory"

data_directory="$1"
results_directory="$2"
phased_directory="${results_directory}/phased_samples"

# Write one bcftools stats report per phased chromosome, next to the phased VCF
for chrom in {1..22}; do
    phased_vcf="${phased_directory}/merged_opensnps_phased_chr${chrom}.vcf.gz"

    # Nothing to summarise when phasing did not produce a file for this chromosome
    if [ ! -f "$phased_vcf" ]; then
        echo "Phased VCF not found for chromosome $chrom. Skipping stats generation."
        continue
    fi

    stats_file="${phased_directory}/merged_opensnps_phased_chr${chrom}_stats.vchk"
    # "-s -" includes per-sample statistics for all samples
    bcftools stats -s - "$phased_vcf" > "$stats_file"
    echo "Stats generated for chromosome $chrom. See: $stats_file"
done

When you are ready, run the following cell to delete the intermediary files that were created. The files in your `results/phased_samples` directory will remain.

In [None]:
%%bash -s "$data_directory" "$results_directory"

data_directory="$1"
results_directory="$2"

# Remove the intermediate QC files; rm -f ignores files that are already gone
echo "Cleaning up intermediate QC files"

# Per-chromosome files written by QC step 3
for chrom in {1..22}; do
    rm -f \
        "${results_directory}/merged_opensnps_qcstart_chr${chrom}.vcf" \
        "${results_directory}/merged_opensnps_qcstart_chr${chrom}.log" \
        "${results_directory}/merged_opensnps_qcfinished_chr${chrom}.vcf.gz" \
        "${results_directory}/merged_opensnps_qcfinished_chr${chrom}.log" \
        "${results_directory}/merged_opensnps_qcfinished_chr${chrom}.vcf.gz.csi"
done

# PLINK2 file sets written by QC steps 1 and 2
for step in step1 step2; do
    for extension in log pgen psam pvar; do
        rm -f "${results_directory}/merged_opensnps_autosomes_${step}.${extension}"
    done
done