In [2]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3

from dotenv import load_dotenv

In [5]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Configure the root logger to write to both a log file and the console.

    The function is safe to call repeatedly (for example when a notebook cell
    is re-run): handlers installed by an earlier call are closed and replaced
    instead of being stacked, so each message is emitted once per destination.
    Handlers attached to the root logger by other code are left untouched.

    Args:
        log_filename (str or os.PathLike): Path to the log file where logs will
            be written. Missing parent directories are created.
        log_file_debug_level (str or int): Minimum level for the file handler,
            given as a level name (case-insensitive) or a numeric level.
            Unrecognised names fall back to INFO.
        console_debug_level (str or int): Minimum level for the console
            (stdout) handler; same accepted values as above.

    Raises:
        OSError: If the log directory or log file cannot be created/opened.
    """
    def _resolve_level(level):
        """Translate a level name or number into a numeric logging level."""
        if isinstance(level, int):
            return level
        resolved = getattr(logging, str(level).upper(), logging.INFO)
        # getattr may match a non-level attribute of the logging module
        # (e.g. "BASIC_FORMAT"); only genuine numeric levels are accepted.
        return resolved if isinstance(resolved, int) else logging.INFO

    # The root logger lets everything through; the handlers do the filtering.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    file_level = _resolve_level(log_file_debug_level)
    console_level = _resolve_level(console_debug_level)

    # FileHandler does not create directories, so make sure the target exists.
    Path(log_filename).parent.mkdir(parents=True, exist_ok=True)

    # File handler: logs messages at file_level and above to the file.
    # Created before the old handlers are dropped so that a failure here
    # leaves the previous configuration intact.
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(file_level)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )

    # Console handler: logs messages at console_level and above to stdout.
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(console_level)
    console_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )

    # Tag our handlers so a later call can recognise and replace them.
    for handler in (file_handler, console_handler):
        handler._installed_by_configure_logging = True

    # Drop (and close) handlers left over from a previous call. Iterate over a
    # copy because removeHandler mutates logger.handlers.
    for stale in logger.handlers[:]:
        if getattr(stale, "_installed_by_configure_logging", False):
            logger.removeHandler(stale)
            stale.close()

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    
def clear_logger():
    """
    Remove and close every handler attached to the root logger.

    Closing matters for file handlers: detaching a handler alone leaves its
    log file open, which leaks the file handle for the life of the kernel.
    """
    root_logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates root_logger.handlers.
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)
        # Flushes pending output and releases any resources the handler holds.
        handler.close()

### Get The Data

# 📥 How to Download OpenSNP Data

There are **two ways** to download OpenSNP data. You only need to select **one** method. **The preferred method is using `boto3`.**

---

## **✅ Option 1: Download Using `boto3` (Preferred)**
With `boto3` installed, you can use it to efficiently download files from the OpenSNP public S3 bucket. This method is recommended for bulk downloads and offers better reliability. However, setting up `boto3` will not be covered until later in the semester. For now, use Option 2.

---

## **✅ Option 2: Download Using `requests` (No AWS Setup Required)**
You can use the `requests` library to download the files directly from public S3 URLs. This method is easier to use but may be slower for large downloads.

---

## **📌 Manually Downloading Files**
If you prefer to manually download individual files:
1. Open your web browser and go to:
   ```
   https://opensnpdata.s3.us-east-2.amazonaws.com/[FILENAME]
   ```
   Replace `[FILENAME]` with the exact filename from `opensnp_file_list.txt`.

2. Example:

   https://opensnpdata.s3.us-east-2.amazonaws.com/user1001_file496_yearofbirth_unknown_sex_unknown.ancestry.txt

3. **Right-click → Save As...** to download the file.

4. Move the file to:
   ```
   data_directory/class_data/raw_dna_profiles/
   ```

---

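📌 **Choose the one method that works best for you.** If unsure, use **Option 2 (`requests`)** for now, since it needs no AWS setup; switch to **boto3** for better performance once it has been set up.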

🚀 Happy downloading! 🚀

**Option 1: boto3 version**

In [8]:
def download_files(limit=None):
    """
    Download a specified number of files (or all files) from the OpenSNP bucket.

    Relies on names defined elsewhere in the notebook, which must exist before
    this function is called: ``s3`` (used here as a boto3 S3 client),
    ``BUCKET_NAME`` and ``SAVE_DIR``.

    The bucket is listed through a paginator because a single
    ``list_objects_v2`` response holds at most 1000 keys; without pagination
    "download all" would silently stop after the first page.

    Parameters:
        limit (int or None): Number of files to download. If None, downloads all files.

    Returns:
        None. Files are written to ``SAVE_DIR`` under their key's base name and
        progress is printed.
    """
    paginator = s3.get_paginator("list_objects_v2")

    files = []
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        # Skip "folder" placeholder keys (ending in "/"): their base name is
        # empty, so the download target would be SAVE_DIR itself.
        files.extend(
            obj for obj in page.get("Contents", [])
            if os.path.basename(obj["Key"])
        )
        # Stop listing as soon as enough keys are collected for the limit.
        if limit is not None and 0 <= limit <= len(files):
            break

    if not files:
        print("No files found in the OpenSNP bucket.")
        return

    # Apply limit if specified
    if limit is not None:
        files = files[:limit]

    print(f"Downloading {len(files)} files...")

    # download_file does not create missing directories.
    os.makedirs(SAVE_DIR, exist_ok=True)

    for obj in files:
        file_key = obj["Key"]
        local_path = os.path.join(SAVE_DIR, os.path.basename(file_key))

        print(f"Downloading: {file_key} -> {local_path}")
        s3.download_file(BUCKET_NAME, file_key, local_path)

    print("Download completed.")


In [None]:
# Download only a limited number of files: here the first 300 entries of the
# bucket listing (lower `limit`, e.g. to 5, for a quick smoke test)
download_files(limit=300)

In [None]:
# Download with no limit (limit=None).
# Note: download_files does not check for files that already exist locally, so
# anything fetched by an earlier limited run is downloaded again.
download_files(limit=None)

**Option 2: non-boto3 version (`requests`)**

In [None]:
# Build the download settings for the requests-based downloader.
# NOTE(review): DownloadConfig, ParallelDownloader and data_directory are not
# defined in the cells shown here — make sure the cells that define them run
# before this one on a fresh kernel.
config = DownloadConfig(
    bucket_url="https://opensnpdata.s3.us-east-2.amazonaws.com",  # public OpenSNP bucket URL
    save_dir=f"{data_directory}/class_data/raw_dna_profiles",  # presumably where downloaded files are written — confirm
    file_list_path=f"{data_directory}/class_data/opensnp_file_list.txt",  # presumably the list of filenames to fetch — confirm
    max_workers=5  # presumably the number of parallel download workers — confirm
)

# Create the downloader instance used by the download cells that follow
downloader = ParallelDownloader(config)

In [None]:
# You can run this cell multiple times if needed
# NOTE(review): presumably start/end index into the file list with `end`
# exclusive (entries 5-14, i.e. 10 files) — confirm against
# ParallelDownloader.download_files.
downloader.download_files(start=5, end=15)  # Start with just 10 files as a test

In [None]:
# Download all files: no start/end is passed, so the downloader's defaults apply
# (presumably the entire file list — confirm against ParallelDownloader.download_files)
downloader.download_files()