In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3

from dotenv import load_dotenv

In [None]:
def find_comp_gen_dir():
    """Locate the ``computational_genetic_genealogy`` directory.

    Starting at the current working directory, each directory on the way up
    to (and including) the filesystem root is checked for a child directory
    named ``computational_genetic_genealogy``. The nearest match wins.

    Returns:
        pathlib.Path: Path of the first matching directory found.

    Raises:
        FileNotFoundError: If no directory on the path contains the target.
    """
    target_name = 'computational_genetic_genealogy'
    start = Path.cwd()

    # ``Path.parents`` runs all the way to the filesystem root, so the root
    # itself is searched too (a parent-chasing loop that stops when
    # ``current == current.parent`` would exit before checking it).
    for base in (start, *start.parents):
        candidate = base / target_name
        if candidate.is_dir():
            return candidate

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Locate the project's ``.env`` file and load it into the environment.

    The file is looked up directly inside the directory returned by
    ``find_comp_gen_dir()`` and passed to ``load_dotenv`` with
    ``override=True``.

    Returns:
        pathlib.Path | None: Path of the ``.env`` file that was loaded, or
        ``None`` when either the project directory or the ``.env`` file is
        missing (a message is printed in both cases).
    """
    try:
        project_dir = find_comp_gen_dir()
        dotenv_file = project_dir / '.env'

        if dotenv_file.exists():
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        # Project directory exists but holds no .env file.
        print(f"Warning: No .env file found in {project_dir}")
        return None

    except FileNotFoundError as missing:
        # Raised by find_comp_gen_dir() when the project directory is absent.
        print(f"Error: {missing}")
        return None

# Load the project's .env when this cell runs; env_path is the path that was
# loaded, or None if the project directory or the .env file was not found.
env_path = load_env_file()

In [None]:
# Project directory locations are read from environment variables; any
# variable that is not set comes back as None.
working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# Stop here with an actionable message if a directory this notebook relies on
# is missing. Otherwise os.chdir(None) fails with an opaque TypeError, and an
# f-string such as f"{data_directory}/..." silently produces a path under a
# literal "None" folder.
_required_dirs = {
    'PROJECT_WORKING_DIR': working_directory,
    'PROJECT_DATA_DIR': data_directory,
    'PROJECT_RESULTS_DIR': results_directory,
}
_missing_dirs = [name for name, value in _required_dirs.items() if not value]
if _missing_dirs:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_dirs)
        + ". Check that the .env file was found and defines them."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a stdout handler to the root logger.

    The root logger is set to DEBUG so that every record reaches the
    handlers; each handler then filters by its own level. Level names are
    matched case-insensitively against the ``logging`` module and fall back
    to INFO when the name is not found there. Handlers already attached to
    the root logger are left in place.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Logging level name for the file handler.
        console_debug_level (str): Logging level name for the console handler.
    """
    line_format = '%(asctime)s - %(levelname)s - %(message)s'

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Resolve both level names up front, before any handler is created.
    file_level, console_level = (
        getattr(logging, level_name.upper(), logging.INFO)
        for level_name in (log_file_debug_level, console_debug_level)
    )

    to_file = logging.FileHandler(log_filename)
    to_file.setLevel(file_level)
    to_file.setFormatter(logging.Formatter(line_format))

    to_console = logging.StreamHandler(sys.stdout)
    to_console.setLevel(console_level)
    to_console.setFormatter(logging.Formatter(line_format))

    # File handler first, console handler second.
    for handler in (to_file, to_console):
        root.addHandler(handler)
    
def clear_logger():
    """Detach and close every handler attached to the root logger.

    Closing matters for file handlers: a handler that is only removed keeps
    its log file open, so re-running the logging setup would leak one open
    file handle per run.
    """
    root = logging.getLogger()
    # Iterate over a copy because removeHandler() mutates root.handlers.
    for handler in list(root.handlers):
        root.removeHandler(handler)
        handler.close()

In [None]:
log_filename = os.path.join(results_directory, "lab_get_opensnp_data.log")
print(f"The log file is located at {log_filename}.")

# Ensure the results directory exists. exist_ok=True makes this safe to
# re-run and avoids a separate "does it exist?" check that can race.
os.makedirs(results_directory, exist_ok=True)

# Make sure the log file exists before logging is configured. Append mode
# creates the file when it is missing and leaves existing content untouched.
with open(log_filename, 'a'):
    pass

clear_logger()  # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

### Get The Data

# 📥 How to Download OpenSNP Data

There are **two ways** to download OpenSNP data. You only need to select **one** method. **The preferred method is using `boto3`.**

---

## **✅ Option 1: Download Using `boto3` (Preferred)**
With `boto3` installed, you can use it to efficiently download files from the OpenSNP public S3 bucket. This method is recommended for bulk downloading and better reliability. However, how to set up `boto3` will not be covered until later in the semester. For now, use Option 2.

---

## **✅ Option 2: Download Using `requests` (No AWS Setup Required)**
You can use the `requests` library to download the files directly from public S3 URLs. This method is easier to use but may be slower for large downloads.

---

## **📌 Manually Downloading Files**
If you prefer to manually download individual files:
1. Open your web browser and go to:
   ```
   https://opensnpdata.s3.us-east-2.amazonaws.com/[FILENAME]
   ```
   Replace `[FILENAME]` with the exact filename from `opensnp_file_list.txt`.

2. Example:

   https://opensnpdata.s3.us-east-2.amazonaws.com/user1001_file496_yearofbirth_unknown_sex_unknown.ancestry.txt

3. **Right-click → Save As...** to download the file.

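4. Move the file to the following folder (here `data_directory` stands for the path in your `PROJECT_DATA_DIR` setting):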
   ```
   data_directory/class_data/raw_dna_profiles/
   ```

---

📌 **Choose the one method that works best for you.** If `boto3` is already set up on your machine, use it for better performance; otherwise use **Option 2** (`requests`) for now.

🚀 Happy downloading! 🚀

**Option 1: boto3 version**

In [None]:
# Define constants
# BUCKET_NAME: S3 bucket that the boto3 cells list and download from.
# SAVE_DIR: local folder (under data_directory) that downloads are written to.
BUCKET_NAME = "opensnpdata"
SAVE_DIR = f"{data_directory}/class_data/raw_dna_profiles"

# Ensure the save directory exists
os.makedirs(SAVE_DIR, exist_ok=True)

# Create the S3 client, requesting Signature Version 4 ("s3v4") signing.
# NOTE(review): this client was previously described as "anonymous", but
# "s3v4" selects signed requests, which presumably require AWS credentials to
# be configured. Truly anonymous access would use
# signature_version=botocore.UNSIGNED. Confirm which is intended.
s3 = boto3.client("s3", config=boto3.session.Config(signature_version="s3v4"))

def count_files_in_bucket():
    """Count the total number of objects in the OpenSNP bucket.

    A single ``list_objects_v2`` call returns at most 1,000 keys, so the
    listing is read page by page through a paginator and the per-page counts
    are summed.

    Returns:
        int: Number of objects in ``BUCKET_NAME`` (0 when the bucket is empty).
    """
    paginator = s3.get_paginator("list_objects_v2")
    return sum(
        # Pages of an empty listing carry no "Contents" entry.
        len(page.get("Contents", []))
        for page in paginator.paginate(Bucket=BUCKET_NAME)
    )

# Query the bucket when this cell runs and report the resulting object count.
num_files = count_files_in_bucket()
print(f"Total files in bucket: {num_files}")

In [None]:
def download_files(limit=None):
    """
    Download a specified number of files (or all files) from the OpenSNP bucket.

    The bucket listing is read page by page, because a single
    ``list_objects_v2`` call returns at most 1,000 keys. Keys whose final path
    component is empty (folder placeholders ending in "/") are skipped, since
    they have no file name to save under. Each object is saved in ``SAVE_DIR``
    under the base name of its key.

    Parameters:
        limit (int or None): Number of files to download. If None, downloads all files.
    """
    paginator = s3.get_paginator("list_objects_v2")

    keys = []
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        keys.extend(
            obj["Key"]
            for obj in page.get("Contents", [])
            if os.path.basename(obj["Key"])
        )
        # Stop listing as soon as enough keys are known for a non-negative limit.
        if limit is not None and 0 <= limit <= len(keys):
            break

    if not keys:
        print("No files found in the OpenSNP bucket.")
        return

    # Apply limit if specified (plain slice semantics).
    if limit is not None:
        keys = keys[:limit]

    print(f"Downloading {len(keys)} files...")

    for file_key in keys:
        local_path = os.path.join(SAVE_DIR, os.path.basename(file_key))

        print(f"Downloading: {file_key} -> {local_path}")
        s3.download_file(BUCKET_NAME, file_key, local_path)

    print("Download completed.")


In [None]:
# Trial run: download only the first 5 files from the bucket listing to
# confirm that access and the save directory work before fetching more.
download_files(limit=5)

In [None]:
# Download every file that download_files() lists when no limit is applied.
# Files are written to SAVE_DIR under the base name of their key.
download_files(limit=None)

**Option 2: non-boto3 version (`requests`)**

In [None]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm  # Changed to notebook version of tqdm
from tqdm.auto import tqdm as tqdm_auto  # For auto-detection of environment
import signal
import sys
from dataclasses import dataclass
from typing import List, Optional
import logging

@dataclass
class DownloadConfig:
    """Settings consumed by ``ParallelDownloader``."""
    # Base URL the files are fetched from; each request goes to
    # f"{bucket_url}/{filename}".
    bucket_url: str
    # Local directory downloads are written to (created by download_files()
    # if it does not exist).
    save_dir: str
    # Path of a text file listing the filenames to download, one per line.
    file_list_path: str
    # Size of the thread pool used for parallel downloads.
    max_workers: int = 5
    # chunk_size passed to response.iter_content() when streaming a file.
    chunk_size: int = 8192

class ParallelDownloader:
    """Download the files named in a list file from a base URL, in parallel.

    Files are fetched with ``requests`` on a thread pool, written to
    ``config.save_dir`` under their base name, and retried a configurable
    number of times. A file that already exists locally with non-zero size is
    skipped, so ``download_files`` can be re-run to resume.

    Attributes:
        config: Object providing ``bucket_url``, ``save_dir``,
            ``file_list_path``, ``max_workers`` and ``chunk_size``.
        interrupted (bool): Set by ``handle_interrupt``; workers stop when it
            is true. Reset at the start of every ``download_files`` call.
        failed_downloads (list): ``(filename, error message)`` pairs for the
            files that failed in the most recent attempt of the most recent
            ``download_files`` call.
        logger: Logger used for all progress and error messages.
    """

    # (connect, read) timeouts in seconds handed to requests, so that a
    # stalled connection cannot block a worker thread forever.
    REQUEST_TIMEOUT = (10, 60)
    # Data is streamed into "<name>.part" and renamed only once complete, so
    # a truncated download is never mistaken for a finished file.
    PARTIAL_SUFFIX = ".part"

    def __init__(self, config: "DownloadConfig"):
        self.config = config
        self.interrupted = False
        self.failed_downloads = []
        # Signal handlers replaced by setup_signal_handlers(), kept so they
        # can be put back by restore_signal_handlers().
        self._previous_handlers = {}
        self.setup_logging()
        # Signal handlers are deliberately NOT installed here: they are
        # active only while download_files() runs, so creating a downloader
        # does not disable the normal Ctrl-C / "interrupt kernel" behaviour.

    def setup_logging(self):
        """Create ``self.logger``; configure root logging only if nothing has."""
        # basicConfig() does nothing when the root logger already has
        # handlers, but its ``handlers=[...]`` argument would still be built,
        # opening 'download_log.txt' for no reason. Skip it in that case.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s',
                handlers=[
                    logging.FileHandler('download_log.txt'),
                    logging.StreamHandler(sys.stdout)
                ]
            )
        self.logger = logging.getLogger(__name__)

    def setup_signal_handlers(self):
        """Route SIGINT and SIGTERM to ``handle_interrupt``.

        The handlers that were active before are remembered so that
        ``restore_signal_handlers`` can reinstate them.
        """
        for signum in (signal.SIGINT, signal.SIGTERM):
            try:
                previous = signal.signal(signum, self.handle_interrupt)
            except ValueError:
                # signal.signal() may only be called from the main thread.
                self.logger.warning(
                    "Could not install signal handlers outside the main thread."
                )
                return
            # Keep the first handler seen, in case this method runs twice.
            self._previous_handlers.setdefault(signum, previous)

    def restore_signal_handlers(self):
        """Reinstate the signal handlers replaced by ``setup_signal_handlers``."""
        for signum, previous in self._previous_handlers.items():
            # ``previous`` is None when the old handler was not installed
            # from Python; such a handler cannot be restored via signal.signal().
            if previous is not None:
                signal.signal(signum, previous)
        self._previous_handlers = {}

    def handle_interrupt(self, signum, frame):
        """Signal handler: ask the workers to stop instead of raising."""
        self.logger.warning("Received interrupt signal. Finishing current downloads...")
        self.interrupted = True

    def get_file_list(self, start: Optional[int] = None, end: Optional[int] = None) -> List[str]:
        """Read the file list and return the ``[start:end]`` slice of it.

        Lines are stripped of surrounding whitespace and blank lines are
        dropped. ``start`` and ``end`` follow normal slice semantics and may
        be given independently of each other.

        Raises:
            FileNotFoundError: If ``config.file_list_path`` does not exist.
        """
        try:
            with open(self.config.file_list_path, "r") as f:
                stripped = [line.strip() for line in f]
        except FileNotFoundError:
            self.logger.error(f"File list not found: {self.config.file_list_path}")
            raise
        names = [name for name in stripped if name]
        return names[start:end]

    @staticmethod
    def _remove_if_exists(path):
        """Delete ``path``, ignoring the case where it is already gone."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def download_file(self, filename: str, overall_pbar) -> bool:
        """Download a single file and advance the shared progress bar.

        Args:
            filename: Name from the file list; appended to ``bucket_url``.
            overall_pbar: Progress bar whose ``update(1)`` is called once the
                file has been handled (skipped, downloaded or failed).

        Returns:
            bool: True if the file is present locally afterwards (already
            there or freshly downloaded); False if the download failed or
            was interrupted.
        """
        if self.interrupted:
            return False

        file_url = f"{self.config.bucket_url}/{filename}"
        local_path = os.path.join(self.config.save_dir, os.path.basename(filename))
        partial_path = local_path + self.PARTIAL_SUFFIX

        # Skip if file exists and has content
        if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
            self.logger.info(f"Skipping existing file: {filename}")
            overall_pbar.update(1)
            return True

        try:
            completed = False
            with requests.get(file_url, stream=True,
                              timeout=self.REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=self.config.chunk_size):
                        if self.interrupted:
                            break
                        if chunk:
                            file.write(chunk)
                    else:
                        # Loop finished without a break: the body is complete.
                        completed = True

            if not completed:
                # Interrupted mid-stream: discard the truncated data so the
                # next run downloads this file again instead of skipping it.
                self._remove_if_exists(partial_path)
                return False

            os.replace(partial_path, local_path)
            overall_pbar.update(1)
            return True

        except Exception as e:
            self.logger.error(f"Failed to download {filename}: {str(e)}")
            self.failed_downloads.append((filename, str(e)))
            self._remove_if_exists(partial_path)
            overall_pbar.update(1)
            return False

    def _run_attempt(self, file_list):
        """Run one parallel pass over ``file_list``; return the names that succeeded."""
        succeeded = set()
        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            with tqdm_auto(total=len(file_list),
                           desc="Downloading files",
                           unit="file") as pbar:
                future_to_file = {
                    executor.submit(self.download_file, filename, pbar): filename
                    for filename in file_list
                }

                for future in as_completed(future_to_file):
                    filename = future_to_file[future]
                    try:
                        if future.result():
                            succeeded.add(filename)
                    except Exception as e:
                        self.logger.error(f"Error downloading {filename}: {str(e)}")

                    if self.interrupted:
                        # Drop work that has not started yet; downloads in
                        # flight notice the flag on their own.
                        for pending in future_to_file:
                            pending.cancel()
                        break
        return succeeded

    def download_files(self, start: Optional[int] = None,
                      end: Optional[int] = None,
                      max_retries: int = 3) -> None:
        """
        Download files in parallel with retry mechanism.

        SIGINT/SIGTERM are routed to ``handle_interrupt`` only while this
        method runs; the previous handlers are restored before it returns.

        Args:
            start: Optional starting index for file range
            end: Optional ending index for file range
            max_retries: Maximum number of retry attempts for failed downloads
        """
        os.makedirs(self.config.save_dir, exist_ok=True)

        file_list = self.get_file_list(start, end)
        total_files = len(file_list)

        # Start from a clean state so the instance can be reused after an
        # interrupt or an earlier run with failures.
        self.interrupted = False
        self.failed_downloads = []

        self.logger.info(f"Starting download of {total_files} files...")

        self.setup_signal_handlers()
        try:
            for attempt in range(max_retries + 1):
                if not file_list:
                    break

                # Keep only the failures of the current attempt, so a file
                # that succeeds on retry is not reported as failed.
                self.failed_downloads = []
                succeeded = self._run_attempt(file_list)

                if self.interrupted:
                    self.logger.warning("Download interrupted by user.")
                    return

                # Remove successful downloads from the list
                file_list = [f for f in file_list if f not in succeeded]

                if file_list and attempt < max_retries:
                    self.logger.info(f"Retrying {len(file_list)} failed downloads... "
                                     f"(Attempt {attempt + 2}/{max_retries + 1})")
        finally:
            self.restore_signal_handlers()

        if self.failed_downloads:
            self.logger.error("Failed downloads:")
            for filename, error in self.failed_downloads:
                self.logger.error(f"  {filename}: {error}")

        # ``file_list`` now holds exactly the files that never succeeded.
        self.logger.info(f"Download completed. "
                         f"Successfully downloaded: {total_files - len(file_list)}/{total_files}")

In [None]:
# Create config in one cell.
# bucket_url is the public base URL; save_dir and file_list_path both live
# under data_directory (read from PROJECT_DATA_DIR earlier in the notebook).
config = DownloadConfig(
    bucket_url="https://opensnpdata.s3.us-east-2.amazonaws.com",
    save_dir=f"{data_directory}/class_data/raw_dna_profiles",
    file_list_path=f"{data_directory}/class_data/opensnp_file_list.txt",
    max_workers=5
)

# Create downloader instance (reused by the download cells below)
downloader = ParallelDownloader(config)

In [None]:
# You can run this cell multiple times if needed: files that already exist
# locally with non-zero size are skipped.
# Trial run on entries 5 through 14 of the file list (start inclusive, end exclusive).
downloader.download_files(start=5, end=15)  # Start with just 10 files as a test

In [None]:
# Download all files named in the file list (no start/end slice).
# Files that already exist locally with non-zero size are skipped.
downloader.download_files()