In [23]:
import os
from collections import Counter
import logging
import sys

# Load environment variables from .env file
from dotenv import load_dotenv
# Locate the project's .env file. The layout assumed here is that the notebook
# starts in a sub-directory of the project, so the file is expected one level
# above the current working directory.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
env_path = os.path.join(project_root, '.env')

# Re-run safety: a later cell changes the working directory to the project
# root. Running this cell again afterwards would look one level too high and
# load nothing, so fall back to a .env in the current directory when the
# parent directory has none.
if not os.path.isfile(env_path) and os.path.isfile(os.path.join(notebook_dir, '.env')):
    project_root = notebook_dir
    env_path = os.path.join(project_root, '.env')

# Last expression of the cell, so load_dotenv's boolean result is displayed.
# override=True lets values from the file replace variables already set.
load_dotenv(env_path, override=True)

False

In [18]:
# Read the project directory layout from environment variables.
# os.getenv returns None for any variable that is not set.
working_directory = os.getenv('PROJECT_WORKING_DIR')
data_directory = os.getenv('PROJECT_DATA_DIR')
references_directory = os.getenv('PROJECT_REFERENCES_DIR')
results_directory = os.getenv('PROJECT_RESULTS_DIR')
utils_directory = os.getenv('PROJECT_UTILS_DIR')

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# Fail with an actionable message instead of the opaque TypeError that
# os.chdir(None) raises when the variable is missing. The values are printed
# first so the reader can see which ones were (not) picked up.
if not working_directory:
    raise RuntimeError(
        "PROJECT_WORKING_DIR is not set; check that the .env file was found and loaded."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

Working Directory: /home/ubuntu/computational_genetic_genealogy
Data Directory: /home/ubuntu/computational_genetic_genealogy/data
References Directory: /home/ubuntu/computational_genetic_genealogy/references
Results Directory: /home/ubuntu/computational_genetic_genealogy/results
Utils Directory: /home/ubuntu/computational_genetic_genealogy/utils
The current directory is /home/ubuntu/computational_genetic_genealogy


In [24]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Configure logging for both file and console handlers.

    Two handlers are added to the root logger on every call: a FileHandler
    writing to ``log_filename`` and a StreamHandler writing to ``sys.stdout``.
    Existing handlers are left in place, so call ``clear_logger()`` first when
    reconfiguring to avoid duplicated log lines.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str | int): Logging level for the file handler,
            as a level name (case-insensitive, e.g. "debug") or a numeric
            level (e.g. ``logging.DEBUG``).
        console_debug_level (str | int): Logging level for the console
            handler, in the same forms as ``log_file_debug_level``.

    Unrecognized level names fall back to ``logging.INFO``.
    """
    def _to_level(level):
        """Translate a level name or number into a numeric logging level."""
        if isinstance(level, int):
            return level
        resolved = getattr(logging, str(level).upper(), None)
        # Only accept attributes that really are levels; other upper-case
        # attributes of the logging module (e.g. BASIC_FORMAT) are strings.
        return resolved if isinstance(resolved, int) else logging.INFO

    # The root logger passes everything through; each handler applies its own threshold.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Convert level names to numeric levels
    file_level = _to_level(log_file_debug_level)
    console_level = _to_level(console_debug_level)

    # File handler: Logs messages at file_level and above to the file
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(file_level)
    file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(file_formatter)

    # Console handler: Logs messages at console_level and above to the console
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(console_level)
    console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(console_formatter)

    # Add handlers to the root logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    
def clear_logger():
    """Remove and close every handler attached to the root logger.

    Closing matters for file handlers: a handler that is only detached keeps
    its log file open for the lifetime of the kernel.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [25]:
log_filename = os.path.join(results_directory, "lab4_log.txt")
print(f"The Lab 4 log file is located at {log_filename}.")

# Ensure the results_directory exists. exist_ok=True makes this a single
# call with no separate existence check beforehand.
os.makedirs(results_directory, exist_ok=True)

# Create the log file if it is missing. Append mode creates an absent file
# and leaves the contents of an existing one untouched.
with open(log_filename, 'a'):
    pass

The Lab 4 log file is located at /home/ubuntu/computational_genetic_genealogy/results/lab4_log.txt.


In [26]:
# configure_logging() adds a new file and console handler on every call, so
# the handlers from any earlier run of this cell are dropped first; otherwise
# each message would be emitted once per accumulated handler.
clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

# Genetic Maps

Genetic maps, also known as recombination maps, are essential tools that illustrate the relative positions of genetic markers (such as single nucleotide polymorphisms, or SNPs) along a chromosome. Unlike physical maps that measure distances in base pairs, genetic maps measure distances in centiMorgans (cM), where one centiMorgan represents a 1% probability of recombination between markers during meiosis.

## Key Components of Genetic Maps

- **Markers:**  
  Identifiable DNA sequences used as reference points on the genome.

- **Recombination Frequency:**  
  The probability of a recombination event occurring between markers, which informs the genetic distances.

- **Map Distance:**  
  Expressed in centiMorgans (cM), reflecting the likelihood of recombination rather than the physical distance.

## BEAGLE's Genetic Map

BEAGLE is a widely used software package for phasing, genotype imputation, and identity-by-descent (IBD) analysis. Its performance is closely tied to the use of high-resolution genetic maps. Here are some distinctive features of BEAGLE's genetic map:

- **High Marker Density:**  
  The genetic maps provided with BEAGLE include a dense array of markers. This density allows for the precise capture of fine-scale recombination events, which in turn improves the accuracy of haplotype phasing and genotype imputation.

- **Species and Population Specificity:**  
  The maps are often developed from extensive pedigree or population studies. For human genetic studies, they are constructed based on large-scale recombination data, ensuring relevance to the population under study.

- **Integration with Statistical Models:**  
  BEAGLE utilizes these maps within its statistical algorithms to model recombination events effectively. This integration is crucial for accurately inferring missing genotypes and detecting IBD segments.

- **Enhanced Analysis Accuracy:**  
  The detailed recombination information in BEAGLE's genetic maps allows for better adjustment for linkage disequilibrium and recombination rates, ultimately leading to more robust downstream genetic analyses.

## Benefits of Using BEAGLE's Genetic Map

- **Improved Phasing Accuracy:**  
  The high-resolution data facilitates precise haplotype reconstruction, reducing errors in phase determination.

- **Robust Genotype Imputation:**  
  Detailed recombination rate data enhances the accuracy of imputing missing genotypes, ensuring more reliable datasets.

- **Streamlined Analysis Workflow:**  
  The genetic map is specifically tailored to integrate seamlessly with BEAGLE’s algorithms, thereby optimizing the overall analysis process.

## References

1. Browning, B. L., & Browning, S. R. (2007). *Rapid and Accurate Haplotype Phasing and Missing-Data Inference for Whole-Genome Association Studies by Use of Localized Haplotype Clustering*. [American Journal of Human Genetics](https://www.cell.com/AJHG/fulltext/S0002-9297(07)63882-8)
2. Browning, B. L., Zhou, Y., & Browning, S. R. (2018). *A One-Penny Imputed Genome from Next-Generation Reference Panels*. [American Journal of Human Genetics](https://pubmed.ncbi.nlm.nih.gov/30100085/)
3. [BEAGLE Documentation](https://faculty.washington.edu/browning/beagle/beagle.html)
4. [NHGRI Glossary: Genetic Map](https://www.genome.gov/genetics-glossary/Genetic-Map)
5. Li, Y., Willer, C., Sanna, S., & Abecasis, G. (2009). *Genotype Imputation*. [Annual Review of Genomics and Human Genetics](https://www.annualreviews.org/content/journals/10.1146/annurev.genom.9.081307.164242)

In [3]:
%%bash -s "$data_directory"

# The notebook's data_directory value arrives as the first positional argument.
# NOTE(review): data_directory is assigned here but not referenced by the
# command below.
data_directory="$1"

# For Beagle data:
# Runs the project's genetic-map download module for the BEAGLE source and the GRCh38 assembly.
poetry run python -m scripts_support.genetic_maps_download --data-source BEAGLE --assembly GRCh38

  machar = _get_machar(dtype)


2025-02-06 12:55:52,089 - INFO - Results Directory: /home/ubuntu/computational_genetic_genealogy/results
2025-02-06 12:55:52,089 - INFO - Data Directory: /home/ubuntu/computational_genetic_genealogy/data
2025-02-06 12:55:52,090 - INFO - References Directory: /home/ubuntu/computational_genetic_genealogy/references
2025-02-06 12:55:52,090 - INFO - Utils Directory: /home/ubuntu/computational_genetic_genealogy/utils
2025-02-06 12:55:52,093 - INFO - Downloading Beagle map from https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip
2025-02-06 12:55:52,836 - INFO - Download progress: 2.0%
2025-02-06 12:55:52,896 - INFO - Download progress: 4.0%
2025-02-06 12:55:52,947 - INFO - Download progress: 6.0%
2025-02-06 12:55:52,952 - INFO - Download progress: 8.0%
2025-02-06 12:55:52,955 - INFO - Download progress: 10.0%
2025-02-06 12:55:53,007 - INFO - Download progress: 12.1%
2025-02-06 12:55:53,020 - INFO - Download progress: 14.1%
2025-02-06 12:55:53,035 - INFO - Downl

## Run Quality Control

This pipeline is designed so that a single code block does everything: it takes your VCF data as input, filters it according to the chosen QC thresholds, phases it, and writes the phased data, one file per chromosome, to the results directory.

In [None]:
%%bash -s "$data_directory"

# The notebook's data_directory value arrives as the first positional argument.
data_directory="$1"

# Run the project's quality-control module on the merged VCF.
# --geno / --maf are the genotype-missingness and minor-allele-frequency thresholds.
# Every path is quoted so that a data directory containing spaces is still
# passed to the script as a single argument.
poetry run python -m scripts_work.quality_control_vcf \
    --vcf_file "${data_directory}/merged_opensnps_data.vcf.gz" \
    --determined_sex_file "${data_directory}/class_data/determined_sex.txt" \
    --failed_sex "${data_directory}/class_data/failed_sex.txt" \
    --geno 0.05 --maf 0.05

That's it. You're now ready to run the IBD detection algorithm. 

## What happened?

Before moving on, let's pause for a moment and look at what that single command actually did.

# VCF Quality Control and Processing Pipeline

This script implements a comprehensive quality control (QC) and processing pipeline for merged VCF files, designed specifically for downstream genetic analyses (e.g., genetic genealogy). The pipeline integrates several tools (e.g., PLINK2, bcftools, Beagle) to perform quality control, filtering, and conversion of VCF files into other formats.

---

## Overview

The pipeline performs the following major steps:

1. **Quality Control Filtering:**  
   Applies a series of default QC filters to the VCF file:
   - **Autosomal Filtering:** Only autosomal SNPs are retained (excludes X and Y chromosomes to simplify analysis).
   - **Duplicate Removal:** Duplicated SNPs are removed (keeps the first occurrence).
   - **VCF Half-Call Handling:** Half-calls are treated as missing.
   - **SNP Filtering:** Keeps only SNPs with nucleotide calls in {A, C, G, T} (ignoring case).
   - **Biallelic SNPs:** Filters for SNPs with exactly two alleles.
   - **Genotype Missingness (`--geno`):** Excludes SNPs with a missingness rate exceeding the specified threshold.
   - **Minor Allele Frequency (`--maf`):** Excludes SNPs with a frequency lower than the specified threshold.

2. **Validation and Dependency Checks:**  
   - Validates the merged VCF file by counting samples, SNPs, and verifying chromosome headers.
   - Checks that required tools (e.g., `bcftools`, `bgzip`, `plink2`) are installed and attempts to install missing components like the Beagle JAR dynamically.

3. **Sex Determination Processing:**  
   - Parses input files containing sex determination results.
   - Writes PLINK2-compatible sex files (including `.psam` and sex update files) for further analysis.

4. **Data Conversion and Filtering:**  
   - **Step 1:** Converts the VCF file into PLINK format using `plink2` with QC parameters.
   - **Step 2:** Filters the PLINK data by genotype missingness and minor allele frequency.
   - **Step 3:** Splits the dataset by chromosome and exports each chromosome’s data as a VCF file.  
     This step also includes filtering for biallelic variants, compressing, and indexing the output VCF files.

5. **Processing Special Chromosomes:**  
   - Dedicated routines handle chromosomes X, Y, and MT (mitochondrial) separately, as they may require additional processing (e.g., PAR splitting for chromosome X).

6. **Phasing:**  
   - After QC and filtering, the script calls an external shell script (`phase_chromosomes.sh`) to phase chromosomes using Beagle.  
   - The script passes an input file prefix (derived from the processed VCF file) along with directory paths and the Beagle JAR file for further phasing operations.

7. **Logging and Error Handling:**  
   - A robust logging mechanism is set up to capture both console and file outputs, enabling detailed tracking of the pipeline’s progress and errors.
   - The script employs exception handling to catch failures at different steps, ensuring the pipeline stops if critical issues are detected.

---

## Key Functions and Their Roles

- **`configure_logging()`**  
  Sets up file and console logging with configurable debug levels.

- **`parse_arguments()`**  
  Parses command-line arguments to customize parameters such as the input VCF file, QC thresholds, and sex determination files.

- **`check_dependencies()`**  
  Verifies that external tools (`bcftools`, `bgzip`, `plink2`) are available. It dynamically locates (or installs) the Beagle JAR file if it is not found.

- **`validate_merged_vcf()` and `validate_vcf()`**  
  Validate the input VCF file by:
  - Counting the number of samples and SNPs.
  - Extracting and comparing chromosome lists from the VCF header and the CHROM field.
  - Extracting sample IDs.

- **`parse_sex_determination()` and `write_sex_files()`**  
  Process sex determination outputs to generate files compatible with PLINK2, ensuring that downstream analyses correctly interpret the sex of each sample.

- **Conversion and Filtering Steps (`step_1_convert_vcf_to_plink()`, `step_2_filter_genotype_and_maf()`, `step_3_split_by_chromosome()`)**  
  Convert VCF data to PLINK format and apply additional QC filters. The pipeline then splits the data by chromosome and generates per-chromosome VCF outputs.

- **Special Chromosome Processing (`step_process_X()`, `step_process_Y()`, `step_process_MT()`)**  
  Handle conversion, filtering, and exporting for chromosomes X, Y, and MT. Each function is tailored to address the unique challenges associated with these chromosomes (e.g., sex update for chromosome X).

- **`run_command()`**  
  A helper function to execute shell commands, suppressing or capturing output as necessary, with error handling.

- **`main()`**  
  The main entry point that ties all the steps together. It orchestrates:
  - Validation of the VCF file.
  - QC and filtering processes.
  - Dependency checks and installation steps.
  - Execution of the phasing script.

---

## Usage Example

To run the script using Poetry, one might call:

```bash
poetry run python -m scripts_work.quality_control_vcf \
    --vcf_file data/open_snps_data/opensnps.vcf.gz \
    --determined_sex_file data/open_snps_data/determined_sex.txt \
    --failed_sex data/open_snps_data/failed_sex.txt \
    --geno 0.05 --maf 0.05
```

This command initiates the QC pipeline with specified thresholds for genotype missingness (--geno) and minor allele frequency (--maf), along with input files for VCF and sex determination.

## Conclusion
This pipeline is designed to provide a robust, modular framework for processing VCF files in genetic studies. By integrating multiple tools and configurable QC parameters, it balances stringency and flexibility, ensuring a high-quality dataset for downstream analyses in genetic genealogy and population genetics.

In [27]:
import subprocess

def validate_merged_vcf(vcf_path):
    """Validate a merged VCF with bcftools and collect a basic inventory of it.

    Runs four bcftools commands against ``vcf_path``:

    1. ``bcftools plugin counts`` to read the sample and SNP counts.
    2. ``bcftools view -h`` to list the contigs declared in the header.
    3. ``bcftools query -f '%CHROM\\n'`` to list the chromosomes that actually
       occur in the CHROM column (unique, sorted by ``sort -u``).
    4. ``bcftools query -l`` to list the sample IDs.

    Problems (missing counts, no chromosomes, no samples, or header and CHROM
    column naming different sets of chromosomes) are logged as errors; they do
    not raise.

    Args:
        vcf_path (str): Path to the VCF file to inspect.

    Returns:
        tuple: ``(num_samples, num_snps, chromosomes_field, sample_ids)`` where
        the counts are ints (0 when not reported), ``chromosomes_field`` is the
        list of chromosomes from the CHROM column and ``sample_ids`` is the
        list of sample IDs.

    Raises:
        subprocess.CalledProcessError: If a bcftools invocation exits non-zero.
    """
    import shlex  # local import: only needed to quote the path for the shell pipelines

    cmd_counts = ["bcftools", "plugin", "counts", vcf_path]
    result_counts = subprocess.run(cmd_counts, capture_output=True, text=True, check=True)
    logging.info(f"Plugin 'counts' validation output for {vcf_path}:\n{result_counts.stdout}")
    if result_counts.stderr:
        logging.info(f"Plugin 'counts' validation errors:\n{result_counts.stderr}")

    # Single pass over the plugin report; lines look like "Number of SNPs:    732979".
    num_samples = 0
    num_snps = 0
    for line in result_counts.stdout.splitlines():
        parts = line.split(":")
        if len(parts) != 2:
            continue
        if line.startswith("Number of samples:"):
            num_samples = int(parts[1].strip())
        elif line.startswith("Number of SNPs:"):
            num_snps = int(parts[1].strip())
    if not num_samples:
        logging.error(f"No sample count found in VCF file: {vcf_path}")
    if not num_snps:
        logging.error(f"No SNP count found in VCF file: {vcf_path}")

    # The two pipelines below run through the shell, so the path must be quoted
    # to survive spaces and shell metacharacters.
    quoted_vcf_path = shlex.quote(str(vcf_path))

    logging.info("Extracting list of chromosomes from the VCF header.")
    # For '##contig=<ID=1,length=...>' the third '='-separated field is '1,length';
    # cutting that at the first ',' leaves the contig ID.
    cmd_chrom_contig = f"bcftools view -h {quoted_vcf_path} | grep '^##contig' | cut -d'=' -f3 | cut -d',' -f1"
    result_chrom_contig = subprocess.run(cmd_chrom_contig, shell=True, capture_output=True, text=True, check=True)
    chromosomes_contig = result_chrom_contig.stdout.splitlines()
    if not chromosomes_contig:
        logging.error(f"No chromosomes found in VCF file: {vcf_path}")
    else:
        logging.debug(f"Chromosomes found in VCF file header: {', '.join(chromosomes_contig)}")

    logging.info("Extracting a list of chromosomes from the CHROM column..")
    cmd_chrom_field = f"bcftools query -f '%CHROM\n' {quoted_vcf_path} | sort -u"
    result_chrom_field = subprocess.run(cmd_chrom_field, shell=True, capture_output=True, text=True, check=True)
    chromosomes_field = result_chrom_field.stdout.splitlines()
    if not chromosomes_field:
        logging.error(f"No chromosomes found in VCF file in the CHROM field: {vcf_path}")
    else:
        logging.debug(f"Chromosomes found in VCF file in the CHROM field: {', '.join(chromosomes_field)}")

    # Compare as sets: the header lists contigs in file order while the CHROM
    # list is sorted, so comparing the lists directly would report a mismatch
    # even when both name exactly the same chromosomes.
    if set(chromosomes_contig) != set(chromosomes_field):
        logging.error("Mismatch between chromosomes in contig and field headers.")
        logging.error(f"Contig chromosomes: {chromosomes_contig}")
        logging.error(f"Field chromosomes: {chromosomes_field}")

    logging.info("Extracting sample IDs from the VCF file.")
    cmd_sample_list = ["bcftools", "query", "-l", vcf_path]
    result_sample_list = subprocess.run(cmd_sample_list, capture_output=True, text=True, check=True)
    sample_ids = result_sample_list.stdout.splitlines()

    if not sample_ids:
        logging.error(f"No sample IDs found in VCF file: {vcf_path}")
    else:
        logging.debug(f"Sample IDs found in VCF file: {', '.join(sample_ids)}")

    return num_samples, num_snps, chromosomes_field, sample_ids

In [28]:
# Validate the merged VCF and unpack the tuple returned by validate_merged_vcf:
# sample count, SNP count, chromosomes seen in the CHROM column, and sample IDs.
# sample_ids is reused further down when the sex files are written.
vcf_path = f"{data_directory}/merged_opensnps_data.vcf.gz"
num_samples, num_snps, chromosomes, sample_ids = validate_merged_vcf(vcf_path)
print(num_samples, num_snps, chromosomes, sample_ids)

2025-02-06 13:50:47,555 - INFO - Plugin 'counts' validation output for /home/ubuntu/computational_genetic_genealogy/data/merged_opensnps_data.vcf.gz:
Number of samples: 10
Number of SNPs:    732979
Number of INDELs:  0
Number of MNPs:    0
Number of others:  0
Number of sites:   1052288

2025-02-06 13:50:47,556 - INFO - Extracting list of chromosomes from the VCF header.
2025-02-06 13:50:47,724 - INFO - Extracting a list of chromosomes from the CHROM column..
2025-02-06 13:50:48,416 - ERROR - Mismatch between chromosomes in contig and field headers.
2025-02-06 13:50:48,417 - ERROR - Contig chromosomes: ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y', 'KI270728.1', 'KI270727.1', 'KI270442.1', 'KI270729.1', 'GL000225.1', 'KI270743.1', 'GL000008.2', 'GL000009.2', 'KI270747.1', 'KI270722.1', 'GL000194.1', 'KI270742.1', 'GL000205.2', 'GL000195.1', 'KI270736.1', 'KI270733.1', 'GL000224.1', 'GL000219.1'

In [29]:
def parse_sex_determination(determined_sex_file, failed_sex):
    """Parse the sex determination files into a mapping of user IDs to sex codes.

    Both inputs are tab-separated text files with the user ID in the first
    column; blank lines are ignored. The codes used are "1" (male),
    "2" (female) and "0" (unknown).

    Args:
        determined_sex_file (str): Path to the file of ``user_id<TAB>sex`` rows.
            The sex is matched case-insensitively against Male/M/1 and
            Female/F/2. A missing or unrecognized value is recorded as "0"
            and logged as a warning rather than being silently coded female.
        failed_sex (str): Path to the file listing users whose sex determination
            failed. Only the first column is read; every user listed is set to
            "0", overriding any entry from ``determined_sex_file``.

    Returns:
        dict: Mapping of user ID (str) to sex code ("0", "1" or "2").
    """
    sex_codes = {
        "male": "1", "m": "1", "1": "1",
        "female": "2", "f": "2", "2": "2",
    }

    sex_mapping = {}
    with open(determined_sex_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:  # Skip empty lines
                continue

            # strip() also removes a trailing tab, so a row whose sex column is
            # empty arrives here as a single field; treat that as "no value".
            fields = line.split("\t")
            user_id = fields[0]
            sex = fields[1].strip() if len(fields) > 1 else ""

            sex_code = sex_codes.get(sex.lower())
            if sex_code is None:
                logging.warning(
                    f"Unrecognized sex {sex!r} for sample {user_id}; recording it as unknown (0)."
                )
                sex_code = "0"
            sex_mapping[user_id] = sex_code

    with open(failed_sex, 'r') as p:
        for raw_line in p:
            line = raw_line.strip()
            if not line:
                continue

            # Only the ID matters here; any further columns are ignored.
            user_id = line.split("\t")[0]
            sex_mapping[user_id] = "0" # Unknown sex

    # Count occurrences of each sex code
    counts = Counter(sex_mapping.values())

    # Print results
    logging.info(f"Count of SEX=0 (Unknown): {counts['0']}")
    logging.info(f"Count of SEX=1 (Male): {counts['1']}")
    logging.info(f"Count of SEX=2 (Female): {counts['2']}")

    return sex_mapping

def write_sex_files(sex_mapping, sample_ids, psam_file_all, psam_file_Y, sex_update_file):
    """Emit the two .psam files and the sex update file for PLINK2.

    Samples are taken in ``sample_ids`` order; a sample missing from
    ``sex_mapping`` gets code "0". Three tab-separated files are written:

    - ``psam_file_all``: ``#FID IID SEX`` rows for every sample whose code is not "0".
    - ``psam_file_Y``: ``#FID IID SEX`` rows for samples whose code is "1" only.
    - ``sex_update_file``: ``#IID SEX`` rows for every sample whose code is not "0".

    The sample ID is written as both FID and IID.
    """
    # Look up each sample's code in sample order, defaulting to "0".
    code_by_sample = {}
    for sample in sample_ids:
        code_by_sample[sample] = sex_mapping.get(sample, "0")

    known_sex = [(sample, code) for sample, code in code_by_sample.items() if code != "0"]
    males_only = [(sample, code) for sample, code in code_by_sample.items() if code == "1"]

    # All chromosomes: every sample with a known sex.
    with open(psam_file_all, 'w') as out:
        out.write("#FID\tIID\tSEX\n")
        out.writelines(f"{sample}\t{sample}\t{code}\n" for sample, code in known_sex)

    # Y chromosome: males only.
    with open(psam_file_Y, 'w') as out:
        out.write("#FID\tIID\tSEX\n")
        out.writelines(f"{sample}\t{sample}\t{code}\n" for sample, code in males_only)

    # Input for PLINK2 --update-sex: every sample with a known sex.
    with open(sex_update_file, 'w') as out:
        out.write("#IID\tSEX\n")
        out.writelines(f"{sample}\t{code}\n" for sample, code in known_sex)

In [30]:
# Inputs: the per-sample sex calls and the list of samples whose call failed.
determined_sex_file = f"{data_directory}/class_data/determined_sex.txt"
failed_sex = f"{data_directory}/class_data/failed_sex.txt"

sex_mapping = parse_sex_determination(determined_sex_file, failed_sex)
# Output files are placed next to determined_sex.txt: its path minus the
# extension is used as the common prefix for the three generated files.
base_name = os.path.splitext(determined_sex_file)[0]
psam_file_all = f"{base_name}_all.psam"
psam_file_Y = f"{base_name}_Y.psam"
sex_update_file = f"{base_name}_update_sex.txt"
# sample_ids comes from the validate_merged_vcf() call in an earlier cell.
write_sex_files(sex_mapping, sample_ids, psam_file_all, psam_file_Y, sex_update_file)

2025-02-06 13:51:04,926 - INFO - Count of SEX=0 (Unknown): 0
2025-02-06 13:51:04,927 - INFO - Count of SEX=1 (Male): 4
2025-02-06 13:51:04,929 - INFO - Count of SEX=2 (Female): 6


In [31]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# Step 1: convert the merged VCF to PLINK2 (pgen) format while applying the
# variant-level filters selected by the flags below:
#
# - Autosomal filtering (--autosome): only autosomal variants are retained.
# - SNP filtering (--snps-only just-acgt): only SNPs whose alleles are in {A, C, G, T} are kept.
# - Duplicate removal (--rm-dup exclude-all): every variant whose ID occurs more
#   than once is removed; no copy is kept.
# - Biallelic SNPs (--min-alleles 2 --max-alleles 2): only variants with exactly
#   two alleles are retained.
#
# The genotype-missingness (--geno) and minor-allele-frequency (--maf) filters
# are NOT applied here; they are applied in the next step.
# NOTE(review): the pipeline description mentions treating half-calls as
# missing, but no --vcf-half-call option is passed here; confirm whether it is needed.

# Change to the utils directory if necessary
cd "${utils_directory}"

# Paths are quoted so directories containing spaces are passed as single arguments.
plink2 --vcf "${data_directory}/merged_opensnps_data.vcf.gz" \
  --autosome \
  --snps-only just-acgt \
  --rm-dup exclude-all \
  --min-alleles 2 \
  --max-alleles 2 \
  --make-pgen \
  --out "${results_directory}/opensnps_autosomes_step1"


PLINK v2.0.0-a.6.4LM 64-bit Intel (6 Dec 2024)     cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step1.log.
Options in effect:
  --autosome
  --make-pgen
  --max-alleles 2
  --min-alleles 2
  --out /home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step1
  --rm-dup exclude-all
  --snps-only just-acgt
  --vcf /home/ubuntu/computational_genetic_genealogy/data/merged_opensnps_data.vcf.gz

Start time: Thu Feb  6 13:51:13 2025
16165 MiB RAM detected; reserving 8082 MiB for main workspace.
Using up to 6 compute threads.
--vcf: 1018862 variants scanned (33426 skipped).
--vcf: 983k variants converted. 
/home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step1-temporary.pgen
+
/home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step1-temporary.pvar.zst
+
/home/ubuntu/computational_genetic_gene

In [32]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# QC thresholds for this step. They are defined once so that the progress
# message and the plink2 call cannot disagree.
geno=".05"   # maximum per-variant genotype missingness
maf=".05"    # minimum minor allele frequency

# Change to the utils directory if necessary
cd "${utils_directory}"

# Step 2: filter the step-1 dataset by missingness and allele frequency,
# sort the variants (--sort-vars) and write a new pgen fileset.
echo "Filtering by genotype missingness (geno=${geno}) and minor allele frequency (maf=${maf})..."
plink2 --pfile "${results_directory}/opensnps_autosomes_step1" \
  --geno "${geno}" \
  --maf "${maf}" \
  --sort-vars \
  --make-pgen \
  --out "${results_directory}/opensnps_autosomes_step2"

Filtering by genotype missingness (geno=) and minor allele frequency (maf=)...
PLINK v2.0.0-a.6.4LM 64-bit Intel (6 Dec 2024)     cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step2.log.
Options in effect:
  --geno .05
  --maf .05
  --make-pgen
  --out /home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step2
  --pfile /home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step1
  --sort-vars

Start time: Thu Feb  6 13:57:06 2025
16165 MiB RAM detected; reserving 8082 MiB for main workspace.
Using up to 6 compute threads.
10 samples (0 females, 0 males, 10 ambiguous; 10 founders) loaded from
/home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step1.psam.
711576 variants loaded from
/home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step1.pvar.
Note: No phenotype data 

In [None]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

# NOTE(review): placeholder cell. It only copies the three positional
# arguments into variables and runs no command; consider removing it.
data_directory="$1"
utils_directory="$2"
results_directory="$3"

In [33]:
%%bash -s "$data_directory" "$utils_directory" "$results_directory"

data_directory="$1"
utils_directory="$2"
results_directory="$3"

# Derive the sample base name from the merged VCF file name: everything before
# the first "." (i.e. "merged_opensnps_data" for merged_opensnps_data.vcf.gz)
sample_file=$(basename "${data_directory}/merged_opensnps_data.vcf.gz" | cut -d. -f1)

# The input prefix is the output from step 2
input_prefix="${results_directory}/opensnps_autosomes_step2"

# Set to 1 as soon as any chromosome fails, so the cell's exit status reflects
# every chromosome rather than only the last command that ran.
status=0

echo "Splitting by chromosome and exporting as VCF..."

# Loop over autosomal chromosomes 1 through 22
for chromosome in {1..22}; do
    echo "Processing chromosome ${chromosome}..."

    # Define the output prefix for the current chromosome
    output_prefix="${results_directory}/${sample_file}_qc_chr${chromosome}"

    # Export the chromosome-specific data as a VCF using plink2.
    # On failure, skip the remaining steps: they would otherwise run on a
    # missing VCF or on one left over from an earlier run.
    if ! plink2 --pfile "$input_prefix" \
           --chr "${chromosome}" \
           --export vcf \
           --out "$output_prefix"; then
        echo "plink2 export failed for chromosome ${chromosome}; skipping it." >&2
        status=1
        continue
    fi

    # Define the input and output for bcftools processing
    bcftools_input="${output_prefix}.vcf"
    bcftools_output="${results_directory}/${sample_file}_qcfinished_chr${chromosome}.vcf.gz"

    echo "Filtering for biallelic variants on chromosome ${chromosome}..."
    # Filter for biallelic variants (-m2 -M2) and write bgzip-compressed VCF (-Oz)
    if ! bcftools view -m2 -M2 -Oz -o "$bcftools_output" "$bcftools_input"; then
        echo "bcftools view failed for chromosome ${chromosome}; skipping it." >&2
        status=1
        continue
    fi

    echo "Indexing the filtered VCF for chromosome ${chromosome}..."
    # Index the compressed VCF file; -f overwrites an index left by a previous run
    if ! bcftools index -f "$bcftools_output"; then
        echo "bcftools index failed for chromosome ${chromosome}." >&2
        status=1
    fi
done

exit "$status"


Splitting by chromosome and exporting as VCF...
Processing chromosome 1...
PLINK v2.0.0-a.6.4LM 64-bit Intel (6 Dec 2024)     cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/ubuntu/computational_genetic_genealogy/results/merged_opensnps_data_qc_chr1.log.
Options in effect:
  --chr 1
  --export vcf
  --out /home/ubuntu/computational_genetic_genealogy/results/merged_opensnps_data_qc_chr1
  --pfile /home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step2

Start time: Thu Feb  6 13:58:52 2025
16165 MiB RAM detected; reserving 8082 MiB for main workspace.
Using up to 6 compute threads.
10 samples (0 females, 0 males, 10 ambiguous; 10 founders) loaded from
/home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step2.psam.
28154 out of 350277 variants loaded from
/home/ubuntu/computational_genetic_genealogy/results/opensnps_autosomes_step2.pvar.
Note: No phenotype data present.
--