In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import zipfile
import urllib3
from urllib3.util import Retry
from dotenv import load_dotenv

Load your environment variables

In [None]:
def find_comp_gen_dir():
    """Locate the computational_genetic_genealogy directory.

    Starting at the current working directory and walking towards the
    filesystem root, each visited directory is checked for a child directory
    named ``computational_genetic_genealogy``. The filesystem root itself is
    not examined.

    Returns:
        Path: The first matching directory found on the way up.

    Raises:
        FileNotFoundError: If none of the visited directories contains it.
    """
    start = Path.cwd()

    # [cwd, parent, grandparent, ..., root]; the final entry (root) is skipped.
    lineage = [start, *start.parents]
    for folder in lineage[:-1]:
        candidate = folder / 'computational_genetic_genealogy'
        if candidate.is_dir():
            return candidate

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load the .env file that lives in the computational_genetic_genealogy directory.

    Values from the file override variables already present in the
    environment (``override=True``).

    Returns:
        Path | None: Path of the .env file that was loaded, or None when the
        project directory or the .env file could not be found (a message is
        printed in either case).
    """
    loaded_from = None
    try:
        project_dir = find_comp_gen_dir()
        dotenv_file = project_dir / '.env'

        if dotenv_file.exists():
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            loaded_from = dotenv_file
        else:
            print(f"Warning: No .env file found in {project_dir}")
    except FileNotFoundError as e:
        print(f"Error: {e}")
        loaded_from = None

    return loaded_from

# Use the function
env_path = load_env_file()

# Project directories come from the PROJECT_* variables (normally supplied by
# the .env file loaded above).
working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

# (label, source variable, exported variable, value)
_directory_settings = [
    ("Working", "PROJECT_WORKING_DIR", "WORKING_DIRECTORY", working_directory),
    ("Data", "PROJECT_DATA_DIR", "DATA_DIRECTORY", data_directory),
    ("References", "PROJECT_REFERENCES_DIR", "REFERENCES_DIRECTORY", references_directory),
    ("Results", "PROJECT_RESULTS_DIR", "RESULTS_DIRECTORY", results_directory),
    ("Utils", "PROJECT_UTILS_DIR", "UTILS_DIRECTORY", utils_directory),
]

# os.environ only accepts strings, so an unset variable would otherwise fail
# with an opaque "str expected, not NoneType". Fail early and name the culprit.
_missing_settings = [source for _, source, _, value in _directory_settings if value is None]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in your .env file and re-run this cell."
    )

# Re-export under the shorter names that the %%bash cells read
# (e.g. UTILS_DIRECTORY, REFERENCES_DIRECTORY, RESULTS_DIRECTORY).
for _, _, exported_name, value in _directory_settings:
    os.environ[exported_name] = value

for label, _, _, value in _directory_settings:
    print(f"{label} Directory: {value}")

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

Let the environment know where the bcftools plugins are located.

In [None]:
# Directory exported as BCFTOOLS_PLUGINS; the validation cell later runs `bcftools plugin counts`.
os.environ.update(BCFTOOLS_PLUGINS="/usr/lib/x86_64-linux-gnu/bcftools")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a console (stdout) handler to the root logger.

    The root logger itself is set to DEBUG so that each handler's own
    threshold decides what is emitted. Handlers that are already attached
    are left in place, so call clear_logger() first to avoid duplicate output.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Level name for the file handler
            (case-insensitive; unrecognised names fall back to INFO).
        console_debug_level (str): Level name for the console handler
            (case-insensitive; unrecognised names fall back to INFO).
    """
    line_format = '%(asctime)s - %(levelname)s - %(message)s'

    def resolve_level(level_name):
        # Translate e.g. "debug" -> logging.DEBUG, defaulting to INFO.
        return getattr(logging, level_name.upper(), logging.INFO)

    def prepare(handler, threshold):
        # Give the handler its own threshold and the shared line format.
        handler.setLevel(threshold)
        handler.setFormatter(logging.Formatter(line_format))
        return handler

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)  # Let everything through; handlers do the filtering

    file_threshold = resolve_level(log_file_debug_level)
    console_threshold = resolve_level(console_debug_level)

    to_file = prepare(logging.FileHandler(log_filename), file_threshold)
    to_console = prepare(logging.StreamHandler(sys.stdout), console_threshold)

    for handler in (to_file, to_console):
        root.addHandler(handler)
    
def clear_logger():
    """Remove and close every handler attached to the root logger.

    Closing matters for file handlers: a detached FileHandler that is never
    closed keeps its log file open, so every reconfiguration would otherwise
    leak one open file.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler() mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [None]:
# configure_logging() and clear_logger() are defined in the previous cell.
# They are intentionally not redefined here, so there is a single definition
# to maintain and no silent shadowing.

log_filename = os.path.join(results_directory, "lab3_log.txt")
print(f"The Lab 3 log file is located at {log_filename}.")

# Ensure the results_directory exists
os.makedirs(results_directory, exist_ok=True)

# Check if the file exists; if not, create it (empty)
if not os.path.exists(log_filename):
    with open(log_filename, 'w'):
        pass

clear_logger()  # Drop handlers from earlier runs so messages are not emitted twice
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

In [None]:
# NOTE(review): this cell repeats the last two statements of the previous cell;
# presumably kept as a quick way to reset logging on its own - confirm or remove.
clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

# Genetic Maps

Genetic maps, also known as recombination maps, are essential tools that illustrate the relative positions of genetic markers (such as single nucleotide polymorphisms, or SNPs) along a chromosome. Unlike physical maps that measure distances in base pairs, genetic maps measure distances in centiMorgans (cM), where one centiMorgan represents a 1% probability of recombination between markers during meiosis.

## Key Components of Genetic Maps

- **Markers:**  
  Identifiable DNA sequences used as reference points on the genome.

- **Recombination Frequency:**  
  The probability of a recombination event occurring between markers, which informs the genetic distances.

- **Map Distance:**  
  Expressed in centiMorgans (cM), reflecting the likelihood of recombination rather than the physical distance.

## BEAGLE's Genetic Map

BEAGLE is a widely used software package for phasing, genotype imputation, and identity-by-descent (IBD) analysis. Its performance is closely tied to the use of high-resolution genetic maps. Here are some distinctive features of BEAGLE's genetic map:

- **High Marker Density:**  
  The genetic maps provided with BEAGLE include a dense array of markers. This density allows for the precise capture of fine-scale recombination events, which in turn improves the accuracy of haplotype phasing and genotype imputation.

- **Species and Population Specificity:**  
  The maps are often developed from extensive pedigree or population studies. For human genetic studies, they are constructed based on large-scale recombination data, ensuring relevance to the population under study.

- **Integration with Statistical Models:**  
  BEAGLE utilizes these maps within its statistical algorithms to model recombination events effectively. This integration is crucial for accurately inferring missing genotypes and detecting IBD segments.

- **Enhanced Analysis Accuracy:**  
  The detailed recombination information in BEAGLE's genetic maps allows for better adjustment for linkage disequilibrium and recombination rates, ultimately leading to more robust downstream genetic analyses.

## Benefits of Using BEAGLE's Genetic Map

- **Improved Phasing Accuracy:**  
  The high-resolution data facilitates precise haplotype reconstruction, reducing errors in phase determination.

- **Robust Genotype Imputation:**  
  Detailed recombination rate data enhances the accuracy of imputing missing genotypes, ensuring more reliable datasets.

- **Streamlined Analysis Workflow:**  
  The genetic map is specifically tailored to integrate seamlessly with BEAGLE’s algorithms, thereby optimizing the overall analysis process.

## References

1. Browning, B. L., & Browning, S. R. (2007). *Rapid and Accurate Haplotype Phasing and Missing-Data Inference for Whole-Genome Association Studies by Use of Localized Haplotype Clustering*. [American Journal of Human Genetics](https://www.cell.com/AJHG/fulltext/S0002-9297(07)63882-8)
2. Browning, B. L., Zhou, Y., & Browning, S. R. (2018). *A One-Penny Imputed Genome from Next-Generation Reference Panels*. [American Journal of Human Genetics](https://pubmed.ncbi.nlm.nih.gov/30100085/)
3. [BEAGLE Documentation](https://faculty.washington.edu/browning/beagle/beagle.html)
4. [NHGRI Glossary: Genetic Map](https://www.genome.gov/genetics-glossary/Genetic-Map)
5. Li, Y., Willer, C., Sanna, S., & Abecasis, G. (2009). *Genotype Imputation*. [Annual Review of Genomics and Human Genetics](https://www.annualreviews.org/content/journals/10.1146/annurev.genom.9.081307.164242)

### Download Genetic Maps (Beagle's plink version)

Define download helper functions

In [None]:
def create_session_with_retries():
    """Build a urllib3 PoolManager whose requests are retried automatically.

    The retry policy allows up to 5 retries with a backoff factor of 0.5 and
    also retries responses with status 500, 502, 503 or 504.

    Returns:
        urllib3.PoolManager: The configured connection pool manager.
    """
    return urllib3.PoolManager(
        retries=Retry(
            total=5,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
        )
    )

# Shared pool manager used by download_with_progress()
session = create_session_with_retries()

def download_with_progress(url, output_path):
    """Stream a URL to a local file, logging progress along the way.

    The body is read in 8192-byte chunks. When the server reports a
    content-length, progress is logged roughly every 2% of the total size;
    otherwise no progress is logged.

    Args:
        url (str): Address to download with a GET request.
        output_path (str): File to write; an existing file is overwritten.

    Raises:
        Exception: If the response status is not 200.
    """
    response = session.request('GET', url, preload_content=False)

    try:
        if response.status != 200:
            raise Exception(f"HTTP error occurred: {response.status} {response.reason}")

        total_size = int(response.headers.get('content-length', 0))
        block_size = 8192
        progress_increment = max(1, total_size // 50) if total_size > 0 else block_size

        with open(output_path, 'wb') as f:
            downloaded = 0
            last_print = 0
            while True:
                chunk = response.read(block_size)
                if not chunk:
                    break

                f.write(chunk)
                downloaded += len(chunk)
                if total_size > 0 and downloaded - last_print >= progress_increment:
                    last_print = downloaded
                    progress = (downloaded / total_size) * 100
                    logging.info(f"Download progress: {progress:.1f}%")
    except BaseException:
        # The body was not read to the end, so close the connection rather
        # than handing a half-consumed one back to the pool for reuse.
        response.close()
        raise
    finally:
        # Always give the connection back, including on errors
        # (previously it was only released after a successful download).
        response.release_conn()

Download the genetic map files from Beagle

⚠️ Long-Running Operation: Download genetic maps for PLINK

The next cell downloads genetic maps for multiple chromosomes. This operation:

- Downloads approximately 120 MB of genetic map data
- May take 5-10 minutes depending on your internet connection
- Creates a set of genetic map files required for IBD detection

If you already have these files in your references directory, this step will verify and skip files that already exist.

In [None]:
# Set up output directories
genetic_maps_directory = os.path.join(references_directory, "genetic_maps")
os.makedirs(genetic_maps_directory, exist_ok=True)

beagle_genetic_maps = os.path.join(genetic_maps_directory, "beagle_genetic_maps")
os.makedirs(beagle_genetic_maps, exist_ok=True)

# Download BEAGLE genetic maps
!poetry run python -m scripts_support.genetic_maps_download --data-source BEAGLE --assembly GRCh38

#### New Resource

Take a look at your `genetic_maps` directory. You should see the `beagle_genetic_maps` directory. Within `beagle_genetic_maps`, you should see your genetic map files, one for each chromosome. The naming convention is `plink.chr{chromosome_number}.GRCh38.map`.

# VCF Quality Control and Processing Pipeline

This script implements a comprehensive quality control (QC) and processing pipeline for merged VCF files, designed specifically for downstream genetic analyses (e.g., genetic genealogy). The pipeline integrates several tools (e.g., PLINK2, bcftools, Beagle) to perform quality control, filtering, and conversion of VCF files into other formats.

## Explore the Code Chunks

#### Validate the Merged VCF file as input

If you are returning to this point after running subsequent labs, you need the results from the ped-sim notebook. Alternatively (e.g., if you don't have those results), run the following cell to copy prepared results from the instructor's run of the ped-sim notebook.

In [None]:
# Copy required files from class_data to results directory
import os
import shutil

# List of files to copy
files_to_copy = [
    "ped_sim_run2-everyone.fam",
    "ped_sim_run2.seg",
    "ped_sim_run2.seg_dict.txt",
    "pedigree.fam"
]

# The destination must exist before anything can be copied into it
os.makedirs(results_directory, exist_ok=True)

# Copy each file. A missing source is reported and skipped so that one absent
# file does not abort the loop half-way with a traceback.
for file in files_to_copy:
    source = os.path.join(data_directory, "class_data", file)
    destination = os.path.join(results_directory, file)
    if not os.path.isfile(source):
        print(f"⚠️ Warning: {source} does not exist; skipping {file}")
        continue
    shutil.copy2(source, destination)
    print(f"Copied {file} to {results_directory}")

**Select your VCF file**

In the next cell, uncomment the file you want to use.

In [None]:
# Specify the VCF file and directory to use
# Uncomment the VCF file you want to use:

# vcf_file = os.path.join(results_directory, "merged_sample_autosomes_unphased.vcf.gz")
# vcf_directory = os.path.join(results_directory, "real_data_autosomes")

vcf_file = os.path.join(data_directory, "class_data", "merged_opensnps_data.vcf.gz")
vcf_directory = os.path.join(data_directory, "class_data", "merged_opensnps_data_autosomes")

# vcf_file = os.path.join(results_directory, "ped_sim_run2.vcf.gz")
# vcf_directory = os.path.join(results_directory, "ped_sim_run2_autosomes")

# Warn (without stopping) when the selected VCF is missing
if not os.path.exists(vcf_file):
    print(
        f"⚠️ Warning: The VCF file does not exist at {vcf_file}",
        "Please check the path or run the previous labs to generate this file.",
        sep="\n",
    )

# Create the output directory the first time it is needed
if not os.path.exists(vcf_directory):
    print(f"Creating output directory: {vcf_directory}")
    os.makedirs(vcf_directory, exist_ok=True)

In [None]:
import shlex
import subprocess


def _parse_count(counts_output, label):
    """Return the integer that follows ``label:`` in bcftools 'counts' output, or 0 if absent."""
    count = 0
    prefix = f"{label}:"
    for line in counts_output.splitlines():
        if line.startswith(prefix):
            parts = line.split(":")
            if len(parts) == 2:
                count = int(parts[1].strip())
    return count


def validate_merged_vcf(vcf_path):
    """Validate merged VCF and extract available chromosomes.

    Runs bcftools to collect the sample count, the SNP count, the contig
    names declared in the header, the distinct values of the CHROM column and
    the sample IDs. Problems are logged rather than raised.

    Args:
        vcf_path (str): Path to the VCF file to inspect.

    Returns:
        tuple: ``(num_samples, num_snps, chromosomes, sample_ids)`` where
        ``chromosomes`` is the list taken from the CHROM column. On any
        failure (missing file, bcftools error, unexpected exception) the
        result is ``(0, 0, [], [])``.
    """
    if not os.path.exists(vcf_path):
        logging.error(f"VCF file does not exist: {vcf_path}")
        return 0, 0, [], []

    # The path is interpolated into shell pipelines below; quote it so that
    # spaces or shell metacharacters in the path cannot break the command.
    quoted_path = shlex.quote(vcf_path)

    try:
        cmd_counts = ["bcftools", "plugin", "counts", vcf_path]
        result_counts = subprocess.run(cmd_counts, capture_output=True, text=True, check=True)
        logging.info(f"Plugin 'counts' validation output for {vcf_path}:\n{result_counts.stdout}")
        if result_counts.stderr:
            logging.info(f"Plugin 'counts' validation errors:\n{result_counts.stderr}")

        num_samples = _parse_count(result_counts.stdout, "Number of samples")
        if not num_samples:
            logging.error(f"No sample count found in VCF file: {vcf_path}")

        num_snps = _parse_count(result_counts.stdout, "Number of SNPs")
        if not num_snps:
            logging.error(f"No SNP count found in VCF file: {vcf_path}")

        logging.info("Extracting list of chromosomes from the VCF header.")
        cmd_chrom_contig = f"bcftools view -h {quoted_path} | grep '^##contig' | cut -d'=' -f3 | cut -d',' -f1"
        result_chrom_contig = subprocess.run(cmd_chrom_contig, shell=True, capture_output=True, text=True, check=True)
        chromosomes_contig = result_chrom_contig.stdout.splitlines()
        if not chromosomes_contig:
            logging.warning(f"No chromosomes found in VCF file contig headers: {vcf_path}")
        else:
            logging.debug(f"Chromosomes found in VCF file header: {', '.join(chromosomes_contig)}")

        logging.info("Extracting a list of chromosomes from the CHROM column..")
        cmd_chrom_field = f"bcftools query -f '%CHROM\n' {quoted_path} | sort -u"
        result_chrom_field = subprocess.run(cmd_chrom_field, shell=True, capture_output=True, text=True, check=True)
        chromosomes_field = result_chrom_field.stdout.splitlines()
        if not chromosomes_field:
            logging.error(f"No chromosomes found in VCF file in the CHROM field: {vcf_path}")
        else:
            logging.debug(f"Chromosomes found in VCF file in the CHROM field: {', '.join(chromosomes_field)}")

        # Order matters in this comparison: the header list keeps header order,
        # the CHROM list is in `sort -u` order.
        if chromosomes_contig and chromosomes_field and chromosomes_contig != chromosomes_field:
            logging.warning("Mismatch between chromosomes in contig and field headers.")
            logging.warning(f"Contig chromosomes: {chromosomes_contig}")
            logging.warning(f"Field chromosomes: {chromosomes_field}")

        logging.info("Extracting sample IDs from the VCF file.")
        cmd_sample_list = ["bcftools", "query", "-l", vcf_path]
        result_sample_list = subprocess.run(cmd_sample_list, capture_output=True, text=True, check=True)
        sample_ids = result_sample_list.stdout.splitlines()

        if not sample_ids:
            logging.error(f"No sample IDs found in VCF file: {vcf_path}")
        else:
            logging.debug(f"Sample IDs found in VCF file: {', '.join(sample_ids)}")

        return num_samples, num_snps, chromosomes_field, sample_ids

    except subprocess.CalledProcessError as e:
        logging.error(f"Error validating VCF file {vcf_path}: {e}")
        logging.error(f"Command output: {e.stdout}")
        logging.error(f"Command error: {e.stderr}")
        return 0, 0, [], []
    except Exception as e:
        logging.error(f"Unexpected error validating VCF file {vcf_path}: {e}")
        return 0, 0, [], []


# VCF created in Lab3 Get Raw DNA Profile
if not os.path.exists(vcf_file):
    print(f"⚠️ Cannot validate VCF file: {vcf_file} does not exist")
    print("Please check the file path or run the previous labs to generate this file.")
else:
    num_samples, num_snps, chromosomes, sample_ids = validate_merged_vcf(vcf_file)

    # List the IDs for small cohorts; otherwise just report how many there are
    if len(sample_ids) < 10:
        sample_summary = ', '.join(sample_ids)
    else:
        sample_summary = f'{len(sample_ids)} samples'

    print(f"VCF summary: {num_samples} samples, {num_snps} SNPs")
    print(f"Chromosomes: {chromosomes}")
    print(f"Sample IDs: {sample_summary}")

**Note:** The warning about a mismatch between the contig chromosomes and the field chromosomes is okay for now. Try to work out why the two lists differ.

#### Prepare the Supplemental Data

In [None]:
def _read_tab_rows(path):
    """Yield (line_number, fields) for every non-blank line of a tab-separated file."""
    with open(path, 'r') as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            stripped = raw_line.strip()
            if stripped:
                yield line_number, stripped.split("\t")


def parse_sex_determination(determined_sex_file, failed_sex):
    """Parse the sex determination log and create a mapping of user IDs to sexes.

    Args:
        determined_sex_file (str): Tab-separated file with a user ID in the
            first column and the sex label ("Male" or "Female",
            case-insensitive) in the second.
        failed_sex (str): Tab-separated file whose first column holds the IDs
            of users whose sex could not be determined. Any further columns
            are ignored.

    Returns:
        dict: user ID -> sex code as a string: "1" male, "2" female,
        "0" unknown. Entries from ``failed_sex`` override entries from
        ``determined_sex_file``.
    """
    sex_codes = {"male": "1", "female": "2"}
    sex_mapping = {}

    for line_number, fields in _read_tab_rows(determined_sex_file):
        if len(fields) < 2:
            logging.warning(
                f"Skipping line {line_number} of {determined_sex_file}: expected 'user_id<TAB>sex'"
            )
            continue

        user_id, sex = fields[0], fields[1]
        sex_code = sex_codes.get(sex.strip().lower())
        if sex_code is None:
            # Previously every label other than the exact string "Male" was
            # coded as female; unrecognised labels are now coded as unknown.
            logging.warning(f"Unrecognised sex '{sex}' for {user_id}; recording it as unknown")
            sex_code = "0"
        sex_mapping[user_id] = sex_code

    for _, fields in _read_tab_rows(failed_sex):
        # Only the ID is needed, so a line without a second column is fine.
        sex_mapping[fields[0]] = "0"  # Unknown sex

    # Count occurrences of each sex code
    counts = Counter(sex_mapping.values())

    # Print results
    logging.info(f"Count of SEX=0 (Unknown): {counts['0']}")
    logging.info(f"Count of SEX=1 (Male): {counts['1']}")
    logging.info(f"Count of SEX=2 (Female): {counts['2']}")

    return sex_mapping

def write_sex_files(sex_mapping, sample_ids, psam_file_all, psam_file_Y, sex_update_file):
    """Write the two .psam files and the sex update file.

    Samples are written in ``sample_ids`` order (each distinct ID once); IDs
    missing from ``sex_mapping`` are treated as sex code "0".

    Args:
        sex_mapping (dict): user ID -> sex code ("0", "1" or "2").
        sample_ids (list): Sample IDs that define which rows are written and in what order.
        psam_file_all (str): Output .psam listing every sample whose code is not "0".
        psam_file_Y (str): Output .psam listing only samples whose code is "1".
        sex_update_file (str): Output two-column file (IID, SEX) for samples whose code is not "0".
    """
    psam_header = "#FID\tIID\tSEX\n"

    # Sex code per sample, following sample_ids order
    code_by_sample = {sid: sex_mapping.get(sid, "0") for sid in sample_ids}
    known = [(sid, code) for sid, code in code_by_sample.items() if code != "0"]
    males = [(sid, code) for sid, code in code_by_sample.items() if code == "1"]

    # .psam for all chromosomes: unknown sexes are left out
    with open(psam_file_all, 'w') as out:
        out.write(psam_header)
        out.writelines(f"{sid}\t{sid}\t{code}\n" for sid, code in known)

    # .psam for the Y chromosome: males only
    with open(psam_file_Y, 'w') as out:
        out.write(psam_header)
        out.writelines(f"{sid}\t{sid}\t{code}\n" for sid, code in males)

    # Sex update file: unknown sexes are left out
    with open(sex_update_file, 'w') as out:
        out.write("#IID\tSEX\n")
        out.writelines(f"{sid}\t{code}\n" for sid, code in known)
            

# Inputs: per-user sex calls, and the users whose sex is recorded as unknown
determined_sex_file = f"{data_directory}/class_data/determined_sex.txt"
failed_sex = f"{data_directory}/class_data/failed_sex.txt"

# Output names share determined_sex.txt's path minus its extension
base_name = os.path.splitext(determined_sex_file)[0]
psam_file_all = base_name + "_all.psam"
psam_file_Y = base_name + "_Y.psam"
sex_update_file = base_name + "_update_sex.txt"

sex_mapping = parse_sex_determination(determined_sex_file, failed_sex)
write_sex_files(sex_mapping, sample_ids, psam_file_all, psam_file_Y, sex_update_file)

⚠️ Long-Running Operation: Perform quality control on VCF files

The next cell performs extensive quality control operations on the VCF files, including:

- Filtering SNPs based on various criteria
- Removing duplicate variants
- Filtering on minor allele frequency
- Sorting and indexing variants

This process runs on **all chromosomes** and may take **15-30 minutes** to complete depending on your system performance. You'll see progress updates for each chromosome as it processes.

In [None]:
%%bash -s "$vcf_file" "$vcf_directory"

# Arguments: $1 = merged VCF (.vcf.gz), $2 = directory receiving the per-chromosome outputs
vcf_file="$1"
vcf_directory="$2"

# A pipeline fails when any stage fails, not only the last one
set -o pipefail

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
base_name=$(basename "$output_prefix")

# Decompress and re-compress the input with bgzip
# (presumably to guarantee BGZF compression for indexing - confirm).
bgzip -d "$vcf_file" && bgzip "${output_prefix}.vcf" || {
    echo "Could not re-compress ${vcf_file} with bgzip." >&2
    exit 1
}

unphased_samples_directory="${vcf_directory}/unphased_samples"
mkdir -p "$unphased_samples_directory"

# Define input file and index it; the per-chromosome region queries below need the index
input_vcf="${vcf_file}"
bcftools index -t -f "$input_vcf" || {
    echo "Could not index ${input_vcf}." >&2
    exit 1
}

for chromosome in {1..22}; do
    echo "Processing chromosome ${chromosome}..."

    output_vcf="${unphased_samples_directory}/${base_name}_qcfinished_chr${chromosome}.vcf.gz"

    # Extended QC pipeline:
    # 1. Select autosomal chromosome
    # 2. Keep only biallelic SNPs with single-base REF and ALT
    #    -m2 keeps only variants with at least 2 alleles
    #    -M2 keeps only variants with at most 2 alleles
    # 3. Remove exact duplicate variants
    # 4. Filter on MAF and missing data
    # 5. Sort variants
    if ! bcftools view "$input_vcf" \
            --regions "${chromosome}" \
            --types snps \
            -m2 -M2 \
            -i 'strlen(REF)=1 && strlen(ALT)=1' | \
        bcftools norm --rm-dup exact | \
        bcftools view \
            -q 0.05:minor \
            -i 'F_MISSING < 0.05' | \
        bcftools sort -Oz -o "$output_vcf"; then
        echo "QC failed for chromosome ${chromosome}. Skipping."
        # Do not leave a partial file behind for the phasing step to pick up
        rm -f "$output_vcf"
        continue
    fi

    # Index the final VCF with force flag
    bcftools index -f "$output_vcf"

    # Report number of variants
    echo "Number of variants in chromosome ${chromosome} after QC:"
    bcftools index -n "$output_vcf"
    echo
done

### Phase the data

In [None]:
%%bash -s "$vcf_file" "$vcf_directory"

# Arguments: $1 = merged VCF (.vcf.gz), $2 = directory holding unphased_samples/ and phased_samples/
vcf_file="$1"
vcf_directory="$2"
echo "vcf_file: $vcf_file"
echo "vcf_directory: $vcf_directory"

# A pipeline fails when any stage fails, not only the last one
set -o pipefail

# Extract the file prefix (removing .vcf.gz extension)
output_prefix="${vcf_file%.vcf.gz}"
# Get base name of the VCF file
base_name=$(basename "$output_prefix")
echo "base_name: ${base_name}"

unphased_directory="${vcf_directory}/unphased_samples"
phased_directory="${vcf_directory}/phased_samples"
beagle="${UTILS_DIRECTORY}/beagle.17Dec24.224.jar"

# Create the phased directory if it does not exist
mkdir -p "$phased_directory"

# Phase chromosomes using Beagle
for chr in {1..22}; do
    echo "Processing chromosome ${chr}"

    INPUT_VCF="${unphased_directory}/${base_name}_qcfinished_chr${chr}.vcf.gz"
    REF_VCF="${REFERENCES_DIRECTORY}/onethousandgenomes_genotype/onethousandgenomes_genotyped_phased.chr${chr}.vcf.gz"
    MAP_FILE="${REFERENCES_DIRECTORY}/genetic_maps/beagle_genetic_maps/plink.chr${chr}.GRCh38.map"
    OUTPUT_PREFIX="${phased_directory}/${base_name}_phased_chr${chr}_temp"
    PHASED_VCF="${OUTPUT_PREFIX}.vcf.gz"
    SORTED_VCF="${phased_directory}/${base_name}_phased_chr${chr}.vcf.gz"

    # Check if input VCF exists
    if [ ! -f "${INPUT_VCF}" ]; then
        echo "Input VCF file not found for chromosome ${chr}. Skipping."
        echo "${INPUT_VCF}"
        continue
    fi

    # Run Beagle phasing; the reference panel is used only when it is available
    beagle_args=(gt="${INPUT_VCF}" map="${MAP_FILE}" out="${OUTPUT_PREFIX}")
    if [ -f "${REF_VCF}" ]; then
        echo "Running Beagle with reference panel for chromosome ${chr}"
        beagle_args+=(ref="${REF_VCF}")
    else
        echo "Running Beagle without reference panel for chromosome ${chr}"
    fi

    java -jar "${beagle}" "${beagle_args[@]}" || {
        echo "Beagle failed for chromosome ${chr}. Skipping."
        continue
    }

    if [ ! -f "${PHASED_VCF}" ]; then
        echo "Phasing failed for chromosome ${chr}. Output file not found. Skipping."
        continue
    fi

    # Index the file
    tabix -f -p vcf "${PHASED_VCF}"

    # Add INFO field definition and sort
    echo "Sorting VCF for chromosome ${chr}"
    bcftools annotate --header-lines <(echo '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">') "${PHASED_VCF}" | \
    bcftools sort -Oz -o "${SORTED_VCF}" || {
        echo "Sorting failed for chromosome ${chr}"
        continue
    }

    # Index the sorted file
    tabix -f -p vcf "${SORTED_VCF}"

    # If the sorted vcf and index exist, remove Beagle's intermediate outputs.
    # Beagle names its log after the out= prefix, i.e. ${OUTPUT_PREFIX}.log.
    if [ -f "${SORTED_VCF}" ] && [ -f "${SORTED_VCF}.tbi" ]; then
        rm -f "${PHASED_VCF}"
        rm -f "${PHASED_VCF}.tbi"
        rm -f "${OUTPUT_PREFIX}.log"
    fi
done

### Let's generate some stats on our files to manually inspect them.

In [None]:
%%bash -s "$vcf_file" "$vcf_directory"

# Arguments: $1 = merged VCF (.vcf.gz), $2 = directory that contains phased_samples/
vcf_file="$1"
vcf_directory="$2"
echo "vcf_file: $vcf_file"
echo "vcf_directory: $vcf_directory"

# File name without its directory and without the .vcf.gz extension
base_name=$(basename "${vcf_file%.vcf.gz}")
echo "base_name: ${base_name}"

phased_directory="${vcf_directory}/phased_samples"

# Write one bcftools stats report per phased chromosome
for chr in $(seq 1 22); do
    PHASED_VCF="${phased_directory}/${base_name}_phased_chr${chr}.vcf.gz"
    echo "PHASED_VCF: ${PHASED_VCF}"

    if [ ! -f "$PHASED_VCF" ]; then
        echo "Phased VCF not found for chromosome $chr. Skipping stats generation."
        continue
    fi

    STATS_OUTPUT="${phased_directory}/${base_name}_phased_chr${chr}_stats.vchk"
    bcftools stats -s - "$PHASED_VCF" > "$STATS_OUTPUT"
    echo "Stats generated for chromosome $chr. See: $STATS_OUTPUT"
done

Concatenate the per-chromosome phased files into a single autosomal VCF.

In [None]:
%%bash -s "$vcf_file" "$vcf_directory"

# Arguments: $1 = merged VCF (.vcf.gz), $2 = directory that contains phased_samples/
vcf_file="$1"
vcf_directory="$2"
echo "vcf_file: $vcf_file"
echo "vcf_directory: $vcf_directory"

# File name without its directory and without the .vcf.gz extension
base_name=$(basename "${vcf_file%.vcf.gz}")
echo "base_name: ${base_name}"

phased_directory="${vcf_directory}/phased_samples"
merged_vcf="${RESULTS_DIRECTORY}/${base_name}_autosomes.vcf.gz"

# Collect the phased per-chromosome VCFs that exist, in chromosome order
vcf_list=()
for chr in $(seq 1 22); do
    SORTED_VCF="${phased_directory}/${base_name}_phased_chr${chr}.vcf.gz"
    if [ ! -f "$SORTED_VCF" ]; then
        echo "Missing sorted VCF for chromosome ${chr}, skipping."
        echo "Missing $SORTED_VCF"
        continue
    fi
    vcf_list+=("$SORTED_VCF")
done

# Nothing to merge: report it and stop
if [ ${#vcf_list[@]} -eq 0 ]; then
    echo "No VCFs available for merging."
    exit 0
fi

# Merge all VCFs
echo "Merging phased VCFs into a single autosomal file..."
if ! bcftools concat -Oz -o "${merged_vcf}" "${vcf_list[@]}"; then
    echo "Merging failed."
    exit 1
fi

# Index the merged VCF
tabix -f -p vcf "${merged_vcf}"
echo "Merged VCF created at ${merged_vcf}"
