In [None]:
import os
import sys
import pandas as pd
import ipywidgets as widgets
from lineage import Lineage
from tqdm import tqdm
import logging

from collections import Counter
from pathlib import Path
from dotenv import load_dotenv

from IPython.display import display, HTML, Javascript

In [None]:
def find_comp_gen_dir():
    """Locate the ``computational_genetic_genealogy`` directory.

    Starting at the current working directory, every directory on the way up
    to (and including) the filesystem root is checked for a child directory
    named ``computational_genetic_genealogy``.

    Returns:
        pathlib.Path: The first matching directory found while walking upward.

    Raises:
        FileNotFoundError: If no directory on the way up contains the target.
    """
    start = Path.cwd()

    # ``Path.parents`` ends with the filesystem root, so the root itself is
    # examined as well (a ``while current != current.parent`` walk stops
    # just before checking it).
    for candidate in (start, *start.parents):
        target = candidate / 'computational_genetic_genealogy'
        if target.is_dir():
            return target

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load the ``.env`` file that lives in the computational_genetic_genealogy directory.

    Returns:
        The path of the ``.env`` file that was loaded, or ``None`` when the
        project directory or the ``.env`` file could not be found (a message
        is printed in either case).
    """
    try:
        project_dir = find_comp_gen_dir()
        dotenv_file = project_dir / '.env'

        if dotenv_file.exists():
            # override=True lets values from the file replace variables
            # that are already set in the environment.
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            outcome = dotenv_file
        else:
            print(f"Warning: No .env file found in {project_dir}")
            outcome = None

    except FileNotFoundError as missing:
        print(f"Error: {missing}")
        outcome = None

    return outcome

# Use the function
env_path = load_env_file()

In [None]:
# Project locations are read from environment variables; a variable that is
# not set yields None.
working_directory = os.environ.get('PROJECT_WORKING_DIR')
data_directory = os.environ.get('PROJECT_DATA_DIR')
references_directory = os.environ.get('PROJECT_REFERENCES_DIR')
results_directory = os.environ.get('PROJECT_RESULTS_DIR')
utils_directory = os.environ.get('PROJECT_UTILS_DIR')

# One line per directory so a missing setting (None) is easy to spot.
print(
    f"Working Directory: {working_directory}",
    f"Data Directory: {data_directory}",
    f"References Directory: {references_directory}",
    f"Results Directory: {results_directory}",
    f"Utils Directory: {utils_directory}",
    sep="\n",
)

## Setup Logging

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a console (stdout) handler to the root logger.

    The root logger is set to DEBUG so that each handler's own level decides
    what it emits. Handlers already on the root logger are left in place, so
    call ``clear_logger()`` first when reconfiguring.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Level name for the file handler
            (case-insensitive; unknown names fall back to INFO).
        console_debug_level (str): Level name for the console handler
            (case-insensitive; unknown names fall back to INFO).
    """
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)  # let the handlers do the filtering

    # Resolve both level names up front; unknown names fall back to INFO
    file_level, console_level = (
        getattr(logging, name.upper(), logging.INFO)
        for name in (log_file_debug_level, console_debug_level)
    )

    layout = '%(asctime)s - %(levelname)s - %(message)s'

    to_file = logging.FileHandler(log_filename)
    to_console = logging.StreamHandler(sys.stdout)

    # File handler first, console handler second
    for handler, level in ((to_file, file_level), (to_console, console_level)):
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter(layout))

    for handler in (to_file, to_console):
        root.addHandler(handler)
    
def clear_logger():
    """Detach and close every handler on the root logger.

    Closing matters for file handlers: it flushes and releases the log file
    so the file can be deleted or reopened. Closing a ``StreamHandler`` does
    not close the stream it writes to.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler() mutates logger.handlers
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

Notice that `log_file_debug_level` and `console_debug_level` are both set to `INFO` in the following cell.

In [None]:
# Set up logging for testing
log_filename_test = os.path.join(results_directory, "test_log.txt")
print(f"The test log file is located at {log_filename_test}.")

# Ensure the results_directory exists (exist_ok avoids a check-then-create race)
os.makedirs(results_directory, exist_ok=True)

# Create the file if it is missing; append mode never truncates an existing log
with open(log_filename_test, 'a'):
    pass

In [None]:
# Test case 1: INFO level for both file and console

clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename_test, log_file_debug_level="INFO", console_debug_level="INFO")

In [None]:
# This is how to set messages within your script.
# See https://docs.python.org/3/library/logging.html for more information.
logging.debug("DEBUG message: This should NOT appear when set to INFO level.")
logging.info("INFO message: This should appear in both the log file and console.")
logging.warning("WARNING message: This should appear in both the log file and console.")

# To verify:
# 1. Check the console output. Only INFO and WARNING messages should be printed.
# 2. Open 'test_log.txt' and verify that only INFO and WARNING messages are logged.

Notice that `log_file_debug_level` and `console_debug_level` are both set to `DEBUG` in the following cell.

In [None]:
# Test case 2: DEBUG level for both file and console

clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename_test, log_file_debug_level="DEBUG", console_debug_level="DEBUG")

In [None]:
# These are the exact same log messages as before

logging.debug("DEBUG message: This should NOT appear when set to INFO level.")
logging.info("INFO message: This should appear in both the log file and console.")
logging.warning("WARNING message: This should appear in both the log file and console.")

# Verify that now all three message types are logged in both the console and the log file.

In [None]:
# Delete the test log file if it exists.
# First detach and close the handlers still attached to the root logger: the
# FileHandler from the last test case keeps the test log open, which blocks
# deletion on Windows and would otherwise keep writing to a removed file.
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
    root_logger.removeHandler(handler)
    handler.close()

if os.path.exists(log_filename_test):
    os.remove(log_filename_test)
    print(f"{log_filename_test} has been deleted.")
else:
    print(f"{log_filename_test} does not exist.")

Now that we have tested our logger, let's set our real log file for this lab.

In [None]:
log_filename = os.path.join(results_directory, "lab3_log.txt")
print(f"The Lab 3 log file is located at {log_filename}.")

# Ensure the results_directory exists (exist_ok avoids a check-then-create race)
os.makedirs(results_directory, exist_ok=True)

# Create the file if it is missing; append mode never truncates an existing log
with open(log_filename, 'a'):
    pass

In [None]:
clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

When you check your new lab log file, nothing will be there at the moment. You have only created the log file and reconfigured your logger. After you run cells with the `logging` command, you will see new logs in your file.  

## Step 1: Parsing Genotype Files

# Parsing Genotype Files

The following code block is responsible for parsing and processing raw genotype files located within a specified target subdirectory. Its primary objectives are to standardize the genotype data, ensure alignment to the GRCh38 (Build 38) reference genome, determine biological sex based on SNP data, and convert the processed information into TSV files for subsequent VCF conversion.

## Key Features

- **Directory Setup**
  - Creates a subdirectory (`parsed_tsv_files`) within the target directory to store processed TSV files.
  
- **Lineage Object Initialization**
  - Instantiates a `Lineage` object using the output directory, reference resources, and parallel processing options (10 processes) for efficient data handling.

- **File Iteration and User Identification**
  - Iterates over all files in the target subdirectory, filtering for files that start with `"user"`.
  - Aggregates file paths by user identifier to manage cases where a single user may have multiple genotype files.

- **Profile Creation and Data Standardization**
  - Creates an individual profile for each user using the `Lineage` object.
  - Validates the profile and checks whether it is mapped to Build 38. If not, attempts to remap and logs the outcome.
  
- **Sex Determination**
  - Determines the sex of each individual based on specific thresholds for heterozygous SNPs on the X chromosome and the presence of Y chromosome SNPs.
  - Logs successful sex determinations as well as any failures for further review.

- **Progress Tracking and Logging**
  - Utilizes `tqdm` to display a progress bar during the file processing.
  - Maintains comprehensive logging, including error handling for remapping and sex determination issues.
  
- **Output Generation**
  - Writes consolidated logs to files within the target directory (e.g., general errors, determined sex, and failed sex determinations).
  - Returns the directory containing the parsed TSV files, which are used in later steps of the pipeline.
  
This module is critical for ensuring that raw genotype data is consistently processed and standardized, laying a robust foundation for accurate VCF conversion and further genetic analysis.

#### Check the data

This lab assumes that you have at least one appropriate data file in your data directory. It also assumes that the file(s) are from OpenSNP, which has a naming convention of starting each file with "user" and a user ID. If this is not your data structure, you will need to adjust the code for your situation.

Let's check our data.

In [None]:
def check_data_files(raw_dna_profiles):
    """Find the genotype files in a directory that follow the ``user*`` naming convention.

    Args:
        raw_dna_profiles (str): Directory expected to contain the raw genotype files.

    Returns:
        tuple[list[str], int]: The matching file names (names only, sorted for
        a reproducible order) and how many there are. A missing directory is
        reported with ``logging.error`` and yields ``([], 0)`` instead of
        raising, so callers can handle "no data" in one place.
    """
    if not os.path.isdir(raw_dna_profiles):
        logging.error(f"Data directory {raw_dna_profiles} does not exist or is not a directory.")
        return [], 0

    # Keep only entries that follow the naming convention (start with "user")
    data_files = sorted(f for f in os.listdir(raw_dna_profiles) if f.startswith("user"))
    file_count = len(data_files)

    if file_count == 0:
        logging.error(f"No data files found in {raw_dna_profiles} that follow the naming convention.")
    else:
        logging.info(f"Found {file_count} data file(s) in {raw_dna_profiles}.")

    return data_files, file_count

In [None]:
# Check for data files
# NOTE: class_data_directory is also read as a global inside
# parse_genotype_files(), so this cell must run before that function is called.
class_data_directory = os.path.join(data_directory, "class_data")
raw_dna_profiles = os.path.join(class_data_directory, "raw_dna_profiles")
data_files, count = check_data_files(raw_dna_profiles)

# If this code snippet was part of a full script, you could insert checks such as this
# to gracefully exit the script when needed.
# In the notebook, raising stops execution here so that later cells are not
# run against an empty data set.
if count == 0:
    raise ValueError("No valid data files found. Aborting parsing.")
else:
    print(f"Found {count} data files in {raw_dna_profiles}.")

In [None]:
def get_user_file_paths(raw_dna_profiles):
    """Group the genotype file paths in a directory by user ID.

    Only entries whose names start with ``"user"`` are considered. The user ID
    is taken to be the part of the file name before the first underscore.

    Args:
        raw_dna_profiles (str): Directory containing the raw genotype files.

    Returns:
        dict[str, list[str]]: Maps each user ID to the list of full paths of
        that user's files.
    """
    grouped = {}

    for entry in os.listdir(raw_dna_profiles):
        # Anything that does not follow the naming convention is ignored
        if entry.startswith("user"):
            full_path = os.path.join(raw_dna_profiles, entry)
            logging.info(f"Processing file {entry} at {full_path}...")

            owner = entry.split('_')[0]
            logging.info(f"Extracted User ID: {owner}")

            # A user may have several files; collect them under one key
            grouped.setdefault(owner, []).append(full_path)

    return grouped

In [None]:
# Extract user IDs and their corresponding file paths
user_files = get_user_file_paths(raw_dna_profiles)

The output of the previous cell should have shown you the INFO messages as the code ran. With a small number of files, this output method is okay. Imagine if there were hundreds or thousands of files. Think about how you might change the logger levels in `get_user_file_paths()`.

The previous function gave us a dictionary in the format of `key: value`, which, in this case, is `userID: filepath`. Let's take a look at it.

In [None]:
user_files

In [None]:
def _process_user(lineages, user_id, file_paths, failed_sample, failed_files_remapping,
                  determined_sex_entries, failed_sex_entries):
    """Load, remap to Build 38, save, and sex-type one user; outcomes are appended to the given lists."""
    logging.info("\n")
    logging.info(f"Processing user {user_id} with file(s): {file_paths}")

    # Attempt to create an individual profile for the user
    try:
        profile = lineages.create_individual(user_id, file_paths)
    except Exception as e:
        logging.error(f"Failed to process files for user {user_id}: {e}")
        failed_files_remapping.append(user_id)
        return

    # A profile without any SNPs cannot be processed further
    if profile.count == 0:
        failed_sample.append(user_id)
        logging.error(f"Failed to process files for user {user_id}")
        return

    # Ensure Build 38
    try:
        if not profile.build_detected or profile.build != 38:
            logging.info(f"{user_id}: Current build is {profile.build}. Attempting to remap to Build 38...")
            profile.remap(38)

            if profile.build != 38:
                logging.error(f"{user_id}: Remapping failed. Still not in Build 38.")
                failed_files_remapping.append(user_id)
                return  # Skip further processing for this user
            logging.info(f"{user_id}: Successfully remapped to Build 38.")
        else:
            logging.info(f"{user_id}: Already in Build 38.")
    except Exception as e:
        logging.error(f"{user_id}: Error during remapping to Build 38: {e}")
        failed_files_remapping.append(user_id)
        logging.info("\n")
        return

    logging.debug(f"Saving profile {user_id}")
    profile.save(user_id + ".tsv")

    # Determine sex; a falsy result is recorded as "Low Confidence"
    try:
        sex = profile.determine_sex(
            heterozygous_x_snps_threshold=0.03,
            y_snps_not_null_threshold=0.3,
            chrom='X'
        )
        logging.info(sex)
        if sex:
            determined_sex_entries.append(f"{user_id}\t{sex}")
            logging.info(f"Determined sex for {user_id}: {sex}")
        else:
            failed_sex_entries.append(f"{user_id}\tLow Confidence")
            logging.warning(f"Failed to determine sex for {user_id}: Low Confidence")
    except Exception as e:
        failed_sex_entries.append(f"{user_id}\tError: {e}")
        logging.error(f"Error determining sex for {user_id}: {e}")


def parse_genotype_files(raw_dna_profiles, references_directory, user_files):
    """
    Parse each user's genotype files, ensure Build 38, save a TSV, and determine sex.

    For every user a ``Lineage`` individual is created from that user's files,
    remapped to Build 38 when necessary, saved as ``<user_id>.tsv``, and
    sex-typed. Three summary files are written afterwards:
    ``parse_genotype_files.log`` (users whose profile contained no SNPs),
    ``determined_sex.txt`` and ``failed_sex.txt``.

    Note:
        All outputs are placed under the notebook-level global
        ``class_data_directory``, which must be defined before this function
        is called.

    Args:
        raw_dna_profiles (str): Directory holding the raw genotype files.
            Only used in the start-up log message; the files that are actually
            processed come from ``user_files``.
        references_directory (str): Passed to ``Lineage`` as ``resources_dir``.
        user_files (dict[str, list[str]]): Maps each user ID to the paths of
            that user's genotype files.

    Returns:
        str: Path of the ``parsed_tsv_files`` directory used as the ``Lineage`` output directory.
    """
    logging.info(f"Parsing and processing genotype files in {raw_dna_profiles}...")

    # Create a directory to store the processed TSV files
    tsv_dir = os.path.join(class_data_directory, "parsed_tsv_files")
    os.makedirs(tsv_dir, exist_ok=True)

    # Initialize the Lineage object with parallel processing options
    lineages = Lineage(
        output_dir=tsv_dir,
        resources_dir=references_directory,
        parallelize=True,
        processes=10
    )

    # Initialize logs and tracking lists
    failed_sample = []           # Users whose profile contained no SNPs
    failed_files_remapping = []  # Users that failed loading or remapping
    determined_sex_entries = []  # Successful sex determinations
    failed_sex_entries = []      # Failed sex determinations

    # Define file paths for logging and results
    log_file_path = os.path.join(class_data_directory, "parse_genotype_files.log")
    determined_sex_file_path = os.path.join(class_data_directory, "determined_sex.txt")
    failed_sex_file_path = os.path.join(class_data_directory, "failed_sex.txt")

    total_users = len(user_files)
    logging.info(f"Found {total_users} user(s) to process.")

    # Process each user based on the extracted file paths
    with tqdm(total=total_users, desc="Processing users", file=sys.stdout) as pbar:
        for user_id, file_paths in user_files.items():
            try:
                _process_user(
                    lineages, user_id, file_paths,
                    failed_sample, failed_files_remapping,
                    determined_sex_entries, failed_sex_entries,
                )
            finally:
                # Advance exactly once per user, including users that were skipped
                pbar.update(1)

    # Loading/remapping failures are not part of any output file, so surface them here
    if failed_files_remapping:
        logging.warning(
            f"{len(failed_files_remapping)} user(s) failed loading or remapping: {failed_files_remapping}"
        )

    # Write consolidated logs
    with open(determined_sex_file_path, "w") as log_file:
        log_file.write("\n".join(determined_sex_entries) + "\n\n")

    with open(failed_sex_file_path, "w") as log_file:
        log_file.write("\n".join(failed_sex_entries) + "\n")

    with open(log_file_path, "w") as log_file:
        log_file.write("\n".join(failed_sample) + "\n")

    return tsv_dir

Note: When using the lineage package, it will download a file to help convert files to build 38. For example, you will see `Downloading ../references/GRCh37_GRCh38.tar.gz` in the log outputs used to convert the files from build 37 to build 38. If you go to your references directory, you'll see this file.

In [None]:
# Pass the directory that holds the raw genotype files, matching the
# function's first parameter (it appears in the start-up log message).
parse_genotype_files(raw_dna_profiles, references_directory, user_files)

### What happened?

The `parse_genotype_files()` function created a subdirectory in `data/class_data` called `parsed_tsv_files`. In the `parsed_tsv_files` subdirectory, you will see a new genotype profile for each user. Take a look and see that the files are there. Look at the contents of at least one of the files. You're looking at a person's genotype data!

The `parse_genotype_files()` function also created three files in the `data/class_data` directory based on the following code snippet in the function.

```
    log_file_path = os.path.join(class_data_directory, "parse_genotype_files.log")
    determined_sex_file_path = os.path.join(class_data_directory, "determined_sex.txt")
    failed_sex_file_path = os.path.join(class_data_directory, "failed_sex.txt")
```

Take a look at the contents of these files (some may be empty). It's important to understand the structure of your data for proper computations.

# What Makes a FASTA File the "Reference" Genome for Humans?

A **FASTA file** is a widely used text-based format for representing nucleotide (DNA/RNA) or protein sequences. It is structured to contain sequence data along with an identifying **header**.

A **reference genome** is a **high-quality, curated DNA sequence** that serves as a **standard** for comparing and analyzing other genomes. The **FASTA file** used as the **human reference genome** contains the **consensus sequence of human DNA**, which researchers use as a **baseline** for mapping and identifying genetic variations.

---

## 1️⃣ What Defines a Reference Genome?
A **reference genome** is:
✅ **Assembled from multiple human samples** → It does not represent a single individual’s genome but an **aggregate "best guess"** of the human genome.  
✅ **Organized by chromosomes** → The sequence is **divided into chromosomes** (chr1, chr2, ..., chrX, chrY, chrMT for mitochondria).  
✅ **Continuously updated** → It is revised over time as sequencing technology improves.  
✅ **Labeled with precise coordinates** → Every base pair position is assigned a fixed **genomic coordinate** (e.g., `chr1:1000000`).  

---

## 2️⃣ Why Is It Called a "Reference" Genome?
- It provides a **consistent framework** for genetic studies.
- Variations (SNPs, insertions, deletions, structural variants) are identified **relative** to this reference.
- It is **not representative of all human diversity**, but it serves as a standardized **comparison point**.

---

## 3️⃣ Sources of the Human Reference Genome
The most widely used human reference genomes are:

| **Version** | **Source** | **Features** |
|-------------|-----------|--------------|
| **GRCh38/hg38** | Genome Reference Consortium (GRC) | The most up-to-date, widely used reference genome. |
| **GRCh37/hg19** | UCSC Genome Browser, Ensembl | Older but still used for compatibility with legacy datasets. |
| **T2T-CHM13** | Telomere-to-Telomere Consortium | A complete reference with full centromeres and telomeres. |

---

## 4️⃣ How Is a Reference Genome Stored in a FASTA File?
A **reference genome FASTA file** contains:
1. **Chromosome names** (headers starting with `>chrN`).
2. **DNA sequences** (A, T, G, C, and N for unknown regions).

### Example (GRCh38 reference FASTA snippet):
```
>chr1
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
AGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAG
TTCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
>chr2
AGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAG
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
```

🔹 **Why the "N" Bases?**  
- `N` means "unknown base" due to **low sequencing coverage** or **complex repetitive regions**.

---

## 5️⃣ How the Reference Genome Is Used
🔬 **Mapping Reads:** Aligning sequencing data (FASTQ) to the reference to identify variations.  
🧬 **Variant Calling:** Identifying SNPs, insertions, deletions, and structural variants.  
📍 **Genome Annotation:** Identifying genes, exons, and regulatory elements.  
🧪 **Disease Research:** Comparing patient genomes to the reference to find disease-associated mutations.

### Example command to **align sequencing data to a reference genome**:
```bash
bwa mem Homo_sapiens.GRCh38.dna.allchromosomes.fa sample_reads.fastq > aligned.sam
```

### **Key Features of FASTA Files**
- **Simple format:** Readable by humans and bioinformatics tools.
- **Supports large datasets:** Used for entire genomes and protein databases.
- **Compatible with major bioinformatics tools:** Used in `samtools`, `bcftools`, `BLAST`, and sequence alignment programs.

### **Common Uses of FASTA Files**
✅ **Reference Genomes:** FASTA files are used as reference sequences in **genome alignment** and **variant calling**.  
✅ **BLAST Searches:** Querying DNA or protein databases for sequence similarity.  
✅ **Multiple Sequence Alignments:** Used in phylogenetics and evolutionary analysis.  
✅ **Genome Annotation:** Identifying genes and functional elements in DNA sequences.

### **How to Work with FASTA Files**
- **View a FASTA file**:
    ```
    head -n 20 genome.fa
    ```
- **Extract a specific region of a chromosome** (here, positions 100,000–200,000 of `chr1`):
    ```
    samtools faidx genome.fa chr1:100000-200000
    ```
- **Search for a sequence within a FASTA file**:
    ```
    grep -A 2 "AGCTAGCTAGCT" genome.fa
    ```
FASTA files are essential in bioinformatics for storing and analyzing genetic data. 🚀

Download the FASTA file.

In [None]:
%%bash -s "$references_directory"

# Download the Ensembl GRCh38 primary-assembly FASTA, decompress it, and build
# the samtools index. Steps that already completed are skipped, so the cell
# can be re-run without downloading the file again.
set -euo pipefail

references_directory="${1:?references_directory argument is required}"
FASTA_DIR="${references_directory}/fasta/GRCh38"
FASTA="${FASTA_DIR}/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

mkdir -p "${FASTA_DIR}"

# Source Ensembl

if [ -s "${FASTA}" ]; then
    echo "Reference FASTA already present at ${FASTA}; skipping download."
else
    wget --continue --retry-connrefused --timeout=60 --waitretry=60 --tries=3 \
        ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz \
        --output-document="${FASTA}.gz"

    # -f replaces an empty leftover .fa instead of refusing to overwrite it
    gunzip -f "${FASTA}.gz"
fi

# (Re)build the index when it is missing or older than the FASTA
if [ ! -s "${FASTA}.fai" ] || [ "${FASTA}" -nt "${FASTA}.fai" ]; then
    samtools faidx "${FASTA}"
fi

# # FASTA file sources
# # following the suggestion from Heng Li (with chr prefix)
# wget --continue --retry-connrefused --timeout=60 --waitretry=60 --tries=3 \
#     ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz \
#     --output-document=${references_directory}/fasta/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
# # UCSC (with chr prefix)
# wget -O ${references_directory}/fasta/GRCh38/GRCh38.fa.gz \
#     ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
# # Ensembl (without chr prefix)
# wget -O ${references_directory}/fasta/GRCh38/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz \
#     ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
    

## Step 2: Converting TSV to VCF

In [None]:
%%bash -s "$references_directory" "$data_directory"

# Convert every parsed TSV genotype file into a bgzip-compressed, tabix-indexed VCF.

references_directory="$1"
data_directory="$2"
TSV_DIR="${data_directory}/class_data/parsed_tsv_files" # Directory containing the TSV files generated from the parsing step
VCF_DIR="${data_directory}/class_data/converted_vcf_files"  # Directory where converted VCF files will be stored
REFERENCE_FASTA="${references_directory}/fasta/GRCh38/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

# Create the output directory if it doesn't exist
mkdir -p "${VCF_DIR}"

# Fail early with a clear message instead of once per file inside the loop
if [ ! -s "${REFERENCE_FASTA}" ]; then
    echo "Error: reference FASTA not found at ${REFERENCE_FASTA}" >&2
    exit 1
fi

# Without nullglob an unmatched pattern is passed through literally and the
# loop would run once on a file named '*.tsv'
shopt -s nullglob
tsv_files=("${TSV_DIR}"/*.tsv)
if [ ${#tsv_files[@]} -eq 0 ]; then
    echo "Error: No TSV files found in ${TSV_DIR}" >&2
    exit 1
fi

# Number of available CPU threads, computed once (with fallbacks for systems without nproc)
NUM_THREADS=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
echo "Using ${NUM_THREADS} threads for bcftools conversion."

failed=0

# Loop through each TSV file in the TSV_DIR
for TSV_FILE in "${tsv_files[@]}"; do
    # Extract the base name (user_id) from the TSV file name (assumes filename format: userID.tsv)
    USER_ID=$(basename "${TSV_FILE}" .tsv)

    # Define the output VCF filename based on the user_id
    OUTPUT_VCF="${VCF_DIR}/${USER_ID}.vcf.gz"

    echo "Converting ${TSV_FILE} for user ${USER_ID}..."

    # Run bcftools to convert the TSV file to a compressed VCF file
    # Explanation of options:
    # --haploid2diploid: Convert haploid genotypes to diploid.
    # --tsv2vcf: Specify conversion from TSV to VCF.
    # --columns: Map the TSV columns to VCF fields (ID, CHROM, POS, AA).
    # --fasta-ref: Provide the reference genome.
    # --samples: Name the sample using the user ID.
    # --threads: Use all available CPU threads for faster processing.
    # --output-type: Output as a compressed VCF (bgzip).
    # --output: Specify the output VCF file path.
    if ! bcftools convert \
        --haploid2diploid \
        --tsv2vcf "${TSV_FILE}" \
        --columns ID,CHROM,POS,AA \
        --fasta-ref "${REFERENCE_FASTA}" \
        --samples "${USER_ID}" \
        --threads "${NUM_THREADS}" \
        --output-type z \
        --output "${OUTPUT_VCF}"; then
        echo "Error: VCF conversion failed for ${USER_ID}" >&2
        # Remove the partial output so the merge step does not pick it up
        rm -f "${OUTPUT_VCF}" "${OUTPUT_VCF}.tbi"
        failed=$((failed + 1))
        continue  # Move to the next TSV file if conversion fails
    fi

    # Index the newly created VCF file using tabix
    # (-f overwrites an index left behind by a previous run)
    if ! tabix -f -p vcf "${OUTPUT_VCF}"; then
        echo "Error: VCF indexing failed for ${USER_ID}" >&2
        failed=$((failed + 1))
        continue  # Move to the next file if indexing fails
    fi

    echo "Conversion and indexing successful for ${USER_ID}"
done

if [ "${failed}" -gt 0 ]; then
    echo "VCF conversion finished with ${failed} failure(s) out of ${#tsv_files[@]} file(s)." >&2
    exit 1
fi

echo "VCF conversion process completed for all TSV files."


# **Understanding VCF Conversion Summary Metrics**

During the conversion of genotype TSV files to VCF format, several statistics are generated for each file. These statistics provide insight into the **quality**, **completeness**, and **content** of the converted data.

---

## **1. Rows Total**
- This represents the **total number of rows** (or SNP records) present in the input TSV file.
- Each row corresponds to a **single variant (SNP)** for the individual.

**Example Output:**
> Rows total: **649855**

---

## **2. Rows Skipped**
- This indicates the number of rows in the TSV file that were **skipped** during conversion.
- Possible reasons for skipping rows include:
  - **Missing data**
  - **Chromosome mismatch**
  - **Invalid genotype format**
  - **Sites that do not map correctly to the reference genome**

**Example Output:**
> Rows skipped: **8827**

🔹 **Interpretation:**  
If a large number of rows are skipped, it may indicate **data formatting issues** or an **incorrect reference genome**.

---

## **3. Sites Written**
- This is the number of **variant sites successfully written** to the VCF file.
- It is calculated as:

  **Sites written = Rows total - Rows skipped**

**Example Output:**
> Sites written: **668568**

🔹 **Interpretation:**  
A high value here means that **most SNPs were successfully converted** into the VCF file.

---

## **4. Missing GTs (Missing Genotypes)**
- Represents the number of **missing genotype calls** (denoted as `--` in the TSV file).
- This happens when:
  - The genotyping process **fails to detect an allele**.
  - There is **low sequencing coverage**, meaning this position wasn't confidently called.
  - The reference panel used for imputation did not include a likely genotype.

**Example Output:**
> Missing GTs: **331**

🔹 **Interpretation:**  
- A high **Missing GTs** count suggests **poor-quality SNP calls** or potential **data loss**.

---

## **5. Hom RR (Homozygous Reference)**
- This is the count of sites where the individual is **homozygous for the reference allele**.
- Example: If the reference genome has **A** at a position, and the individual's genotype is **AA**, it counts as **Hom RR**.

**Example Output:**
> Hom RR: **350352**

🔹 **Interpretation:**  
- A high **Hom RR** count means the individual has **many sites matching the reference genome**.

---

## **6. Het RA (Heterozygous Reference/Alternative)**
- This is the count of sites where the individual carries **one reference allele and one alternative allele**.
- Example: If the reference genome has **A**, and the individual's genotype is **AG**, this is a **heterozygous site**.

**Example Output:**
> Het RA: **199609**

🔹 **Interpretation:**  
- This value represents **genetic variation**—the higher it is, the **more heterozygous sites** the individual has.

---

## **7. Hom AA (Homozygous Alternative)**
- This is the count of sites where the individual is **homozygous for the alternative allele**.
- Example: If the reference genome has **A**, but the individual has **GG**, it is counted as **Hom AA**.

**Example Output:**
> Hom AA: **118818**

🔹 **Interpretation:**  
- A high **Hom AA** count suggests that the **individual carries many variations from the reference genome**.

---

## **8. Het AA (Heterozygous Alternative/Alternative)**
- This is a **rare case** where an individual has **two different alternative alleles at the same site**.
- Example: At a multi-allelic site where the reference genome has **C**, one chromosome carries **G** and the other carries **A** → **GA**.

**Example Output:**
> Het AA: **149**

🔹 **Interpretation:**  
- This generally happens in **multi-allelic sites**, which are less common than standard biallelic SNPs.

---

## **Final Interpretation: What These Numbers Mean**
These metrics help assess:
1. **Data quality**: 
   - High **Rows Skipped** → Possible **format issues** or **reference mismatch**.
   - High **Missing GTs** → Potential **poor-quality genotype calls**.
  
2. **Variant composition**:
   - High **Hom RR** → Many SNPs match the reference genome.
   - High **Het RA** → More **genetic diversity** in the sample.

3. **Potential issues in reference genome compatibility**:
   - If **Rows Skipped is high**, verify that the **TSV file matches the reference genome** used in `bcftools`.
   - If the **Het RA to Hom AA ratio** is significantly skewed, consider whether the dataset contains **multi-allelic sites**.
---

## **Example Summary**
Here’s how a typical conversion result looks:

- **Rows total:** `701039` → **Total SNP sites in the TSV file**  
- **Rows skipped:** `1` → **Only 1 site was skipped (good quality data)**  
- **Sites written:** `701038` → **Nearly all SNPs were successfully written to VCF**  
- **Missing GTs:** `16262` → **16,262 sites have missing genotype data**  
- **Hom RR:** `350727` → **Half of the sites match the reference genome**  
- **Het RA:** `202225` → **202,225 heterozygous sites (one ref, one alt allele)**  
- **Hom AA:** `131708` → **131,708 sites where both alleles are alternative**  
- **Het AA:** `116` → **Very rare cases where two alternative alleles are present**  

## **Potential Errors and Warnings**
- **Too many rows skipped?**  
  - Double-check the **TSV file format** and **reference genome compatibility**.  
  - Ensure that chromosomes are properly labeled (`1-22, X, Y`).  
  - If using a liftover tool, verify that the conversion was **successful**.

- **Too many missing genotypes (`--`)?**  
  - This could indicate **poor-quality sequencing or genotyping**.  
  - If filtering, consider setting a **minimum call rate threshold**.  
  - If working with ancient DNA or degraded samples, missingness may be **expected**.

## Step 3: Merging VCF Files

In [None]:
%%bash -s "$data_directory"

# Merge the per-user VCF files into one multi-sample VCF, index it, and
# verify that the number of samples matches the number of input files.

data_directory="$1"
VCF_DIR="${data_directory}/class_data/converted_vcf_files"  # Directory where converted VCF files will be stored
MERGED_VCF="${data_directory}/class_data/merged_opensnps_data.vcf.gz"  # Final merged VCF file

# Collect all VCF files to merge.
# nullglob makes an unmatched pattern expand to an empty array; without it
# the array would hold the literal pattern and the check below never fires.
shopt -s nullglob
vcf_files=("${VCF_DIR}"/*.vcf.gz)

# Check if VCF files exist
if [ ${#vcf_files[@]} -eq 0 ]; then
    echo "Error: No VCF files found in ${VCF_DIR}" >&2
    exit 1
fi

merge_status=0
if [ ${#vcf_files[@]} -eq 1 ]; then
    # bcftools merge expects at least two inputs; with a single sample the
    # file is simply re-written as the "merged" VCF
    echo "Only one VCF file found; writing it as the merged VCF..."
    bcftools view -O z -o "${MERGED_VCF}" "${vcf_files[0]}" || merge_status=$?
else
    # Merge all VCF files
    echo "Merging ${#vcf_files[@]} VCF files..."
    bcftools merge -O z -o "${MERGED_VCF}" "${vcf_files[@]}" || merge_status=$?
fi

# Check if merging was successful
if [ "${merge_status}" -ne 0 ]; then
    echo "Error: VCF merging failed." >&2
    exit 1
fi

# Index the merged VCF file (-f overwrites an index left by a previous run)
echo "Indexing the merged VCF file..."
if ! bcftools index -f -t "${MERGED_VCF}"; then
    echo "Error: indexing the merged VCF failed." >&2
    exit 1
fi

expected_num_samples=${#vcf_files[@]}
num_samples=$(bcftools query -l "${MERGED_VCF}" | wc -l)

if [ "$num_samples" -ne "$expected_num_samples" ]; then
    echo "Warning: Sample count mismatch in ${MERGED_VCF} - Expected: $expected_num_samples, Found: $num_samples" >&2
else
    echo "Merged VCF file created and Validation successful. Sample count matches: $num_samples"
fi

## Explore your Data

In [None]:
%%bash -s "$data_directory"

# Quick look at the merged VCF: the sample count, then the complete header.
merged="${1}/class_data/merged_opensnps_data.vcf.gz"  # Final merged VCF file

echo "Get the number of samples"
bcftools query --list-samples "${merged}" | wc -l
echo

echo "Displaying the full VCF header:"
bcftools view --header-only "${merged}"

In [None]:
%%bash -s "$data_directory"

# Print the bcftools summary statistics for the merged VCF.
merged="${1}/class_data/merged_opensnps_data.vcf.gz"  # Final merged VCF file

echo "Get the stats"
bcftools stats "${merged}"