In [None]:
import os
from collections import Counter
import logging
import sys
from IPython.display import Markdown, display
import subprocess
import json
import zipfile
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import os
import shutil

from pathlib import Path
from dotenv import load_dotenv

# Resolve the project root as the parent of the directory the notebook is
# running in, then load environment variables from the .env file expected there.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
env_path = project_root.joinpath('.env')
# override=True is passed so values from .env replace variables already set.
load_dotenv(env_path, override=True)
print(f"Looking for .env at: {env_path}")

In [None]:
# Project directory layout, read from the environment (loaded from .env above).
# os.getenv returns None for any variable that is not set.
working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# Point out unset variables now; later cells build paths from these values and
# would otherwise fail with an unrelated-looking TypeError.
_unset_directories = [
    name for name, value in (
        ('PROJECT_DATA_DIR', data_directory),
        ('PROJECT_REFERENCES_DIR', references_directory),
        ('PROJECT_RESULTS_DIR', results_directory),
        ('PROJECT_UTILS_DIR', utils_directory),
    ) if not value
]
if _unset_directories:
    print(f"WARNING: environment variable(s) not set: {', '.join(_unset_directories)}")

# Fail early with an actionable message: os.chdir(None) raises an opaque TypeError.
if not working_directory:
    raise RuntimeError(
        "PROJECT_WORKING_DIR is not set. Define it in the .env file (or the "
        "environment) before running this notebook."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a stdout handler to the root logger.

    The root logger is set to DEBUG so that each handler's own level decides
    what it emits. Handlers are added on every call; handlers that are already
    attached are left in place.

    Args:
        log_filename (str): Path of the file the file handler writes to.
        log_file_debug_level (str): Level name for the file handler
            (case-insensitive); unknown names fall back to INFO.
        console_debug_level (str): Level name for the console handler
            (case-insensitive); unknown names fall back to INFO.
    """
    line_format = '%(asctime)s - %(levelname)s - %(message)s'

    # The root logger lets everything through; filtering happens per handler.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # Resolve both level names up front (same order as they are used below).
    resolved_levels = [
        getattr(logging, level_name.upper(), logging.INFO)
        for level_name in (log_file_debug_level, console_debug_level)
    ]

    # Destination 1: the log file.
    to_file = logging.FileHandler(log_filename)
    to_file.setLevel(resolved_levels[0])
    to_file.setFormatter(logging.Formatter(line_format))

    # Destination 2: the console (stdout rather than the default stderr).
    to_console = logging.StreamHandler(sys.stdout)
    to_console.setLevel(resolved_levels[1])
    to_console.setFormatter(logging.Formatter(line_format))

    # File handler first, console handler second.
    for destination in (to_file, to_console):
        root_logger.addHandler(destination)
    
def clear_logger():
    """
    Remove and close every handler attached to the root logger.

    Closing matters for file handlers: removing a FileHandler without closing
    it leaves its log file open, so each re-run of the logging setup would
    leak another open file.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

In [None]:
# Path of this notebook's log file inside the results directory.
log_filename = os.path.join(results_directory, "lab0_download_1kgnomes_log.txt")
print(f"The log file is located at {log_filename}.")

# Create the results directory on first use.
results_directory_missing = not os.path.exists(results_directory)
if results_directory_missing:
    os.makedirs(results_directory)

# Create an empty log file if there is none yet; an existing file is left untouched.
log_file_missing = not os.path.exists(log_filename)
if log_file_missing:
    open(log_filename, 'w').close()

In [None]:
# Reset the root logger before configuring it, so that re-running this cell does
# not stack additional handlers (configure_logging adds two on every call).
clear_logger() # Clear the logger before reconfiguring it
# Both the log file and the console receive INFO and above.
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

# Download 1000 Genomes Project Data

### Create the 1000 Genomes reference directory

In [None]:
try:
    # Download target for the raw 1000 Genomes VCFs, inside the references directory.
    onethousandgenomes_seq = Path(references_directory) / "onethousandgenomes_seq"

    # Create directory (and parents if needed); an existing directory is fine.
    onethousandgenomes_seq.mkdir(parents=True, exist_ok=True)

    # Display result with markdown for better visibility
    display(Markdown("**Reference Directory Setup:**"))
    display(Markdown(f"✅ 1000 Genomes sequence directory: `{onethousandgenomes_seq}`"))

    # Log for record-keeping
    logging.info(f"1000 Genomes directory configured at: {onethousandgenomes_seq}")

except Exception as e:
    # Visual error display
    display(Markdown(f"❌ **ERROR:** Failed to create reference directory: {str(e)}"))
    logging.error(f"Reference directory setup failed: {str(e)}")
    # Re-raise after reporting: the download cell needs this directory (and the
    # onethousandgenomes_seq variable), so continuing would only fail later
    # with a less informative error.
    raise

### Chromosome Selection

In [None]:
def prompt_chromosome_selection_jupyter():
    """
    Interactive chromosome selection using IPython widgets for Jupyter.

    Displays radio buttons with preset choices, two dropdowns for a custom
    range (enabled only while 'Custom range' is selected), a 'Select' button
    and an output area.

    Returns:
        callable: A zero-argument function returning the tuple
        (selection_code, chromosome_list). Until the 'Select' button is
        clicked it returns the default ('c', [1, 20]); afterwards it returns
        the most recently confirmed choice. Chromosomes are ints, plus the
        string "X" where selected.
    """
    from IPython.display import display
    import ipywidgets as widgets

    # Canonical chromosome order, shared by the dropdowns and the custom range.
    chromosome_order = list(range(1, 23)) + ["X"]
    dropdown_labels = [str(chrom) for chrom in chromosome_order]

    # Preset choices offered by the radio buttons (everything except 'f').
    presets = {
        "a": list(range(1, 23)),          # Sequential range 1 to 22
        "b": list(range(1, 23)) + ["X"],  # Range 1 to 22 plus X
        "c": [1, 20],                     # Default - chromosomes 1 and 20
        "d": [20],                        # Single chromosome
        "e": ["X"],                       # X chromosome
    }

    # Set up the radio buttons for selection
    selection_widget = widgets.RadioButtons(
        options=[
            ('Range: 1 to 22', 'a'),
            ('Range: 1 to 22 plus X', 'b'),
            ('Multiple: 1 and 20 (default)', 'c'),
            ('Single: chromosome 20', 'd'),
            ('Single: chromosome X', 'e'),
            ('Custom range', 'f')
        ],
        value='c',
        description='Chromosomes:',
        layout={'width': 'max-content'}
    )

    # Custom range inputs (disabled until 'Custom range' is chosen)
    start_dropdown = widgets.Dropdown(
        options=list(dropdown_labels),
        value='1',
        description='Start:',
        disabled=True
    )

    end_dropdown = widgets.Dropdown(
        options=list(dropdown_labels),
        value='22',
        description='End:',
        disabled=True
    )

    # Status output
    output = widgets.Output()

    # Store the final selection
    result = {'selection': 'c', 'chromosomes': [1, 20]}

    def custom_range(start_label, end_label):
        """Return every chromosome between the two labels, inclusive.

        The bounds may be given in either order. X sorts after 22, so a range
        that starts at X and ends at 5 yields 5..22 plus X instead of
        silently ignoring the End value.
        """
        first = dropdown_labels.index(start_label)
        last = dropdown_labels.index(end_label)
        if first > last:
            first, last = last, first
        return chromosome_order[first:last + 1]

    def update_custom_fields(change):
        """Enable/disable custom fields based on selection"""
        custom_chosen = change['new'] == 'f'
        start_dropdown.disabled = not custom_chosen
        end_dropdown.disabled = not custom_chosen

    def process_selection(btn):
        """Process the chromosome selection and update result"""
        with output:
            output.clear_output()

            selection = selection_widget.value
            if selection == 'f':
                chromosomes = custom_range(start_dropdown.value, end_dropdown.value)
            else:
                # Copy so callers that modify the list cannot alter the presets.
                chromosomes = list(presets[selection])

            result['chromosomes'] = chromosomes
            result['selection'] = selection
            print(f"Selected option: {selection}")
            print(f"Chromosomes to process: {result['chromosomes']}")

    # Button to confirm selection
    confirm_btn = widgets.Button(
        description='Select',
        button_style='success',
        tooltip='Click to confirm your chromosome selection'
    )
    confirm_btn.on_click(process_selection)

    # Update custom fields when selection changes
    selection_widget.observe(update_custom_fields, names='value')

    # Display the widgets
    display(selection_widget)
    display(widgets.HBox([start_dropdown, end_dropdown]))
    display(confirm_btn)
    display(output)

    # Return a function that gets the current selection
    def get_result():
        return result['selection'], result['chromosomes']

    return get_result

# Render the selection widgets. The call returns a getter right away; the stored
# choice stays at the default (option 'c', chromosomes 1 and 20) until the
# "Select" button is clicked, so confirm a choice before running the download cell.
get_selection = prompt_chromosome_selection_jupyter()

### Download and index selected chromosomes

In [None]:
import os
import subprocess
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm  # Make sure to install tqdm with `pip install tqdm`

def index_downloaded_files(vcf_path):
    """
    Build a tabix index for a VCF file, logging the outcome.

    Runs `tabix -f -p vcf <vcf_path>`. A missing file or a non-zero tabix
    exit status is logged as an error; nothing is raised for either case and
    nothing is returned.

    Args:
        vcf_path (str): Path to the VCF file to index.
    """
    # Guard clause: nothing to index.
    if not os.path.isfile(vcf_path):
        logging.error(f"File {vcf_path} not found.")
        return

    logging.info(f"Indexing {vcf_path}...")
    tabix_command = ["tabix", "-f", "-p", "vcf", vcf_path]
    completed = subprocess.run(
        tabix_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    if completed.returncode == 0:
        logging.info(f"Successfully indexed {vcf_path}.")
        return

    # Surface tabix's own stderr text in the log.
    stderr_text = completed.stderr.decode().strip()
    logging.error(f"Failed to index {vcf_path}: {stderr_text}")

def process_chromosome(chromosome, base_url, destination_dir):
    """
    Download the phased-panel VCF for one chromosome and index it.

    The file is fetched with wget, stored under a shorter local name
    (onethousandgenomes_sequenced_phased.chr<chromosome>.vcf.gz) and indexed
    via index_downloaded_files. Failures are logged and the function returns
    early; the success message is logged only when both the VCF and its
    .tbi index exist.

    Args:
        chromosome (int | str): Chromosome identifier, e.g. 20 or "X".
        base_url (str): URL of the remote directory holding the VCF files.
        destination_dir (str | os.PathLike): Local directory to download into.
    """
    logging.info(f"Downloading chromosome {chromosome}...")

    # The chrX file carries an extra ".v2" in its remote name.
    if chromosome == "X":
        vcf_filename = f"1kGP_high_coverage_Illumina.chr{chromosome}.filtered.SNV_INDEL_SV_phased_panel.v2.vcf.gz"
    else:
        vcf_filename = f"1kGP_high_coverage_Illumina.chr{chromosome}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

    new_vcf_filename = f"onethousandgenomes_sequenced_phased.chr{chromosome}.vcf.gz"
    vcf_url = f"{base_url}/{vcf_filename}"

    logging.info(f"Downloading {vcf_filename} from {vcf_url}...")
    vcf_path = os.path.join(destination_dir, new_vcf_filename)

    # No check=True: a failed download is handled through the return code so
    # the error branch below is actually reachable.
    result = subprocess.run(["wget", "-O", vcf_path, vcf_url])
    if result.returncode != 0:
        logging.error(f"Failed to download {vcf_filename}. Skipping...")
        # wget -O creates/truncates the output file before transferring, so a
        # failed download leaves an empty or partial file; remove it so it is
        # not mistaken for a valid VCF later.
        if os.path.isfile(vcf_path):
            os.remove(vcf_path)
        return

    if not os.path.isfile(vcf_path):
        logging.error(f"{vcf_filename} was not downloaded.")
        return

    index_downloaded_files(vcf_path)
    if not os.path.isfile(vcf_path + ".tbi"):
        logging.error(f"{vcf_filename} was not indexed")
        # Do not report overall success when the index is missing.
        return
    logging.info(f"Successfully indexed {vcf_filename}")

    logging.info(f"Successfully downloaded and verified files for chromosome {chromosome}.")

def download_index_files_parallel(base_url, chromosomes, destination_dir, max_workers=4):
    """
    Download and index several chromosomes concurrently.

    One process_chromosome task per chromosome is submitted to a thread pool,
    and a tqdm progress bar advances as tasks finish. An exception raised by
    a task is logged together with its chromosome; the other tasks continue.

    Args:
        base_url (str): URL of the remote directory holding the VCF files.
        chromosomes (iterable): Chromosome identifiers to process.
        destination_dir (str | os.PathLike): Local directory to download into.
        max_workers (int): Size of the thread pool.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which chromosome each future belongs to, for error reporting.
        chromosome_by_task = {}
        for chrom in chromosomes:
            task = pool.submit(process_chromosome, chrom, base_url, destination_dir)
            chromosome_by_task[task] = chrom

        finished_tasks = as_completed(chromosome_by_task)
        progress = tqdm(finished_tasks, total=len(chromosome_by_task), desc="Processing chromosomes")
        for task in progress:
            chrom = chromosome_by_task[task]
            try:
                # result() re-raises whatever the worker raised.
                task.result()
            except Exception as e:
                logging.error(f"Error processing chromosome {chrom}: {e}")
                

# Remote directory the per-chromosome VCFs are downloaded from; the individual
# file names are built inside process_chromosome.
base_url = "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV"
# Read the chromosome choice from the widget cell above. If "Select" was never
# clicked, this is the widget's default selection.
selection, chromosomes = get_selection()
# onethousandgenomes_seq is the download directory created in the
# "Create the 1000 Genomes reference directory" cell.
download_index_files_parallel(base_url, chromosomes, onethousandgenomes_seq)


# Process the 1000 Genomes Project Data

### Download the Array Manifest file

In [None]:
def download_array_manifest(references_dir):
    """
    Download the Illumina Global Diversity Array manifest ZIP and extract it.

    The archive is fetched with wget into references_dir, fully extracted
    there and then deleted. The archive must contain exactly one .csv member.

    Parameters:
    - references_dir (str): Directory that receives the ZIP and its contents.

    Returns:
    - str: Path to the extracted manifest CSV.

    Raises:
    - subprocess.CalledProcessError: wget exited with a non-zero status.
    - zipfile.BadZipFile: the downloaded file is not a valid ZIP archive.
    - ValueError: the archive does not contain exactly one CSV file.
    """
    print("Downloading and extracting Illumina manifest file...")
    # Product pages for the array:
    # https://support.illumina.com/array/array_kits/infinium-global-diversity-array.html
    # https://support.illumina.com/downloads/infinium-global-diversity-array-v1-product-files.html

    manifest_url = "https://webdata.illumina.com/downloads/productfiles/global-diversity-array/infinium-global-diversity-array-8-v1-0-D2-manifest-file-csv.zip"
    archive_path = os.path.join(references_dir, "infinium-global-diversity-array.zip")

    # Step 1: fetch the archive.
    print(f"Downloading manifest file from {manifest_url}...")
    try:
        subprocess.run(["wget", manifest_url, "-O", archive_path], check=True)
    except subprocess.CalledProcessError as download_error:
        print(f"Error downloading the manifest file: {download_error}")
        raise
    print(f"Download complete: {archive_path}")

    # Step 2: unpack it and locate the single CSV inside.
    try:
        print("Extracting manifest file...")
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(references_dir)
            member_names = archive.namelist()

        csv_members = [member for member in member_names if member.endswith('.csv')]
        if len(csv_members) != 1:
            raise ValueError("Unexpected number of CSV files found in the ZIP archive.")

        manifest_file = os.path.join(references_dir, csv_members[0])
        print(f"Manifest file located: {manifest_file}")
        # The archive is only removed once the manifest has been found.
        print(f"Deleting ZIP file: {archive_path}")
        os.remove(archive_path)
        return manifest_file
    except zipfile.BadZipFile as bad_zip:
        print(f"Error extracting ZIP file: {bad_zip}")
        raise
    except Exception as unexpected:
        print(f"Unexpected error during extraction: {unexpected}")
        raise
    
# Fetch the array manifest into the references directory; manifest_file is the
# path of the CSV extracted from the downloaded ZIP.
manifest_file = download_array_manifest(references_directory)

### Prepare the SNP subset file for use in subsetting the 1000 Genomes data

In [None]:
def get_snp_set(manifest_file):
    """
    Load the SNP table from an Illumina manifest CSV into a DataFrame.

    The 8th line of the file is taken as the comma-separated column header;
    everything after it is parsed as data rows.

    Parameters:
    - manifest_file (str): Path to the manifest CSV.

    Returns:
    - pd.DataFrame: Manifest rows, with columns named after the 8th line.

    Raises:
    - FileNotFoundError: manifest_file does not exist.
    - Any parsing error is printed and re-raised.
    """
    print("Reading SNP set from manifest file...")

    # Validate file existence
    if not os.path.exists(manifest_file):
        raise FileNotFoundError(f"Manifest file not found: {manifest_file}")

    header_line_count = 8  # preamble lines up to and including the header row

    try:
        print("Extracting column headers...")
        with open(manifest_file, 'r') as handle:
            leading_lines = [handle.readline() for _ in range(header_line_count)]
        # The last of the leading lines holds the column names.
        column_headers = leading_lines[header_line_count - 1].strip().split(',')

        print("Parsing SNP data...")
        snp_set = pd.read_csv(
            manifest_file,
            skiprows=header_line_count,
            header=None,
            low_memory=False,
        )
        snp_set.columns = column_headers
        print(f"SNP set successfully parsed with {len(snp_set)} entries.")
        return snp_set
    except pd.errors.ParserError as parse_error:
        print(f"Error parsing the SNP manifest file: {parse_error}")
        raise
    except Exception as unexpected:
        print(f"Unexpected error: {unexpected}")
        raise

def prepare_snp_subset_file(snp_set, references_directory):
    """
    Prepares a SNP subset file for bcftools based on the SNP set from the Illumina manifest.

    Rows without a chromosome or position are dropped. The remaining rows are
    written as two tab-separated columns with no header: the chromosome
    prefixed with "chr", and the position as an integer. The input DataFrame
    is not modified.

    Parameters:
    - snp_set (pd.DataFrame): DataFrame containing SNPs with columns 'Chr' and 'MapInfo'.
    - references_directory (str): Directory to save the SNP subset file.

    Returns:
    - str: Path to the SNP subset file.
    """
    # Drop rows that cannot be turned into a CHROM/POS pair.
    located = snp_set.dropna(subset=['Chr', 'MapInfo'])

    chrom_labels = located['Chr']
    # Integer chromosome labels become float when the column once held NaN;
    # convert back so 1.0 is written as "chr1", not "chr1.0".
    if pd.api.types.is_float_dtype(chrom_labels):
        chrom_labels = chrom_labels.astype(int)

    # Build fresh columns instead of assigning through .loc on the filtered
    # frame: in-place assignment can keep the old float dtype, which would
    # write positions such as "100.0" that are not valid coordinates.
    formatted_df = pd.DataFrame({
        'CHROM': 'chr' + chrom_labels.astype(str),
        'POS': located['MapInfo'].astype(int),
    })

    # Save the SNP subset file
    output_path = os.path.join(references_directory, "snp_file.txt")
    print(f"Saving SNP subset file to {output_path}...")
    formatted_df.to_csv(output_path, header=False, index=False, sep='\t')

    print(f"SNP subset file saved: {output_path}")
    return output_path

# Load the manifest rows into a DataFrame.
snp_set = get_snp_set(manifest_file)
logging.info(f"SNP set loaded with {len(snp_set)} entries.")

# Write the chromosome/position file that is passed to bcftools in the
# subsetting step; snp_subset_path is where it was saved.
snp_subset_path = prepare_snp_subset_file(snp_set, references_directory)
logging.info(f"SNP subset file ready at {snp_subset_path}.")

### Subset the 1000 Genomes Project data

In [None]:
import os
import subprocess
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor

def subset_1000_genomes(references_directory, snp_file_path):
    """
    Subsets 1000 Genomes VCF files using Illumina SNPs and saves results in a new directory.

    For every onethousandgenomes_sequenced_phased.chr<N>.vcf.gz found in
    <references_directory>/onethousandgenomes_seq the function:
      1. keeps only the sites listed in snp_file_path (bcftools view -R),
      2. renames contigs from "chrN" to "N" (bcftools annotate --rename-chrs),
      3. indexes the result with tabix, and
      4. prints the number of SNP records in the result.
    Output goes to <references_directory>/onethousandgenomes_genotype.
    Chromosomes are processed concurrently in a thread pool, and the final
    message names any chromosome that was skipped or failed.

    Parameters:
    - references_directory (str): Path to the directory containing 1000 Genomes VCF files.
    - snp_file_path (str): Path to the SNP subset file for bcftools.

    Returns:
    - None. Progress and errors are reported with print().
    """
    # Define directories
    input_dir = os.path.join(references_directory, "onethousandgenomes_seq")
    # Exit if input directory doesn't exist
    if not os.path.exists(input_dir):
        print(f"Input directory not found: {input_dir}")
        return

    output_dir = os.path.join(references_directory, "onethousandgenomes_genotype")
    os.makedirs(output_dir, exist_ok=True)

    # Identify chromosomes by matching the exact file name pattern that the
    # per-chromosome step below opens, rather than splitting on ".".
    print("Scanning input directory for VCF files...")
    input_prefix = "onethousandgenomes_sequenced_phased.chr"
    input_suffix = ".vcf.gz"
    found = {
        name[len(input_prefix):-len(input_suffix)]
        for name in os.listdir(input_dir)
        if name.startswith(input_prefix) and name.endswith(input_suffix)
    }
    found.discard("")
    # Numeric chromosomes first in numeric order, then the rest (e.g. X).
    chromosomes = sorted(
        found,
        key=lambda c: (not c.isdecimal(), int(c) if c.isdecimal() else 0, c),
    )

    if not chromosomes:
        print("No valid VCF files found in the input directory. Exiting...")
        return

    print(f"Found VCF files for chromosomes: {', '.join(chromosomes)}")

    # Create chr-to-num mapping file (old name <TAB> new name per line)
    mapping_file = os.path.join(output_dir, "chr_to_num.txt")
    with open(mapping_file, 'w') as f:
        for i in range(1, 23):
            f.write(f"chr{i}\t{i}\n")
        f.write("chrX\tX\n")
        f.write("chrY\tY\n")

    def _subset_one(chromosome):
        """Subset, rename, index and count one chromosome; return True on success."""
        print(f"Starting process for chromosome {chromosome} at {datetime.now()}")

        input_vcf = os.path.join(input_dir, f"onethousandgenomes_sequenced_phased.chr{chromosome}.vcf.gz")
        output_vcf = os.path.join(output_dir, f"onethousandgenomes_genotyped_phased.chr{chromosome}.vcf.gz")
        temp_vcf = os.path.join(output_dir, f"temp.chr{chromosome}.vcf.gz")

        # Check if input VCF and index files exist
        if not os.path.exists(input_vcf) or not os.path.exists(input_vcf + ".tbi"):
            print(f"Required files missing for chromosome {chromosome}: VCF or its index. Skipping...")
            return False

        # First subset, then rename chromosomes
        try:
            subprocess.run(
                ["bcftools", "view", "-R", snp_file_path, input_vcf, "-Oz", "-o", temp_vcf],
                check=True
            )
            subprocess.run([
                "bcftools", "annotate",
                "--rename-chrs", mapping_file,
                "-Oz", "-o", output_vcf,
                temp_vcf
            ], check=True)
            print(f"Subsetted and renamed VCF saved for chromosome {chromosome}.")
        except subprocess.CalledProcessError as e:
            print(f"Error processing VCF file for chromosome {chromosome}: {e}")
            return False
        finally:
            # The intermediate file is never needed afterwards, whatever happened.
            if os.path.exists(temp_vcf):
                os.remove(temp_vcf)

        # Index the subset VCF file. -f overwrites an index left by an earlier
        # run; without it tabix refuses to run when the .tbi already exists.
        try:
            subprocess.run(["tabix", "-f", "-p", "vcf", output_vcf], check=True)
            print(f"Indexing completed for chromosome {chromosome}.")
        except subprocess.CalledProcessError as e:
            print(f"Failed to index VCF file for chromosome {chromosome}: {e}")
            return False

        # Count SNPs in the subsetted file
        try:
            result = subprocess.run(
                ["bcftools", "view", "-v", "snps", output_vcf],
                stdout=subprocess.PIPE,
                text=True,
                check=True
            )
            snp_count = sum(1 for line in result.stdout.splitlines() if not line.startswith("#"))
            print(f"Number of SNPs in chromosome {chromosome}: {snp_count}")
        except subprocess.CalledProcessError as e:
            print(f"Failed to count SNPs in chromosome {chromosome}: {e}")
            return False

        print(f"Finished processing for chromosome {chromosome} at {datetime.now()}")
        return True

    # Run the per-chromosome step concurrently for all chromosomes
    max_workers = min(len(chromosomes), os.cpu_count() or 1)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map preserves input order, so outcomes line up with chromosomes.
        outcomes = list(executor.map(_subset_one, chromosomes))

    # Report honestly: only claim success when every chromosome succeeded.
    not_completed = [chrom for chrom, ok in zip(chromosomes, outcomes) if not ok]
    if not_completed:
        print(f"Processing finished with problems; chromosomes not completed: {', '.join(not_completed)}")
    else:
        print("All chromosomes processed successfully.")

# Subset every downloaded chromosome to the sites listed in the SNP subset file;
# results are written to the onethousandgenomes_genotype directory.
subset_1000_genomes(references_directory, snp_subset_path)

# Cleanup 

Delete the onethousandgenomes_seq directory. The subsetted data is in the onethousandgenomes_genotype directory.

In [None]:
# Remove the directory holding the full downloaded VCFs and their indexes.
# NOTE(review): this runs unconditionally — it does not check that the
# subsetting step produced its outputs, so a failed subset run followed by
# this cell means downloading everything again. Confirm that is acceptable.
directory_name = os.path.join(references_directory, "onethousandgenomes_seq")
if os.path.exists(directory_name):
    shutil.rmtree(directory_name)

NOTE: With these new subsetted genome VCF files, we should add code to revise the headers so that they reflect the actual data.