# Lab 00: Instructor Guide for AWS S3 and Data Management

This notebook contains commands and instructions for managing course data in AWS S3 and setting up notebooks for student use through Google Colab or JupyterLite.

## Overview

- AWS S3 provides scalable cloud storage for course data files
- Google Colab offers a free, accessible Jupyter environment; notebooks running there must download the course data from S3
- JupyterLite provides an in-browser option with bundled data

## 1. AWS S3 Commands

### Setting up AWS CLI

Make sure the AWS CLI is installed and configured with appropriate credentials:

In [None]:
# 🧬 Google Colab Setup - Run this cell first!
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from IPython.display import display, HTML, Markdown

def is_colab():
    '''Report whether this notebook is executing inside Google Colab.

    Returns True when the ``google.colab`` package can be imported and
    False when that import raises ImportError.
    '''
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

if is_colab():
    print("🔬 Setting up Google Colab environment...")
    
    # Install dependencies
    print("📦 Installing packages...")
    !pip install -q pysam biopython scikit-allel networkx pygraphviz seaborn plotly
    !apt-get update -qq && apt-get install -qq samtools bcftools tabix graphviz-dev
    
    # Create directories
    !mkdir -p /content/class_data /content/results
    
    # Download essential class data
    print("📥 Downloading class data...")
    S3_BASE = "https://computational-genetic-genealogy.s3.us-east-2.amazonaws.com/class_data/"
    data_files = [
        "pedigree.fam", "pedigree.def", 
        "merged_opensnps_autosomes_ped_sim.seg",
        "merged_opensnps_autosomes_ped_sim-everyone.fam",
        "ped_sim_run2.seg", "ped_sim_run2-everyone.fam"
    ]
    
    for file in data_files:
        !wget -q -O /content/class_data/{file} {S3_BASE}{file}
        print(f"  ✅ {file}")
    
    # Define utility functions
    def setup_environment():
        return "/content/class_data", "/content/results"
    
    def save_results(dataframe, filename, description="results"):
        os.makedirs("/content/results", exist_ok=True)
        full_path = f"/content/results/{filename}"
        dataframe.to_csv(full_path, index=False)
        display(HTML(f'''
        <div style="padding: 10px; background-color: #e3f2fd; border-left: 4px solid #2196f3; margin: 10px 0;">
            <p><strong>💾 Results saved!</strong> To download: 
            <code>from google.colab import files; files.download('{full_path}')</code></p>
        </div>
        '''))
        return full_path
    
    def save_plot(plt, filename, description="plot"):
        os.makedirs("/content/results", exist_ok=True)
        full_path = f"/content/results/{filename}"
        plt.savefig(full_path, dpi=300, bbox_inches='tight')
        plt.show()
        display(HTML(f'''
        <div style="padding: 10px; background-color: #e8f5e8; border-left: 4px solid #4caf50; margin: 10px 0;">
            <p><strong>📊 Plot saved!</strong> To download: 
            <code>from google.colab import files; files.download('{full_path}')</code></p>
        </div>
        '''))
        return full_path
    
    print("✅ Colab setup complete! Ready to explore genetic genealogy.")
    
else:
    print("🏠 Local environment detected")
    def setup_environment():
        return "class_data", "results"
    def save_results(df, filename, description=""):
        os.makedirs("results", exist_ok=True)
        path = f"results/{filename}"
        df.to_csv(path, index=False)
        return path
    def save_plot(plt, filename, description=""):
        os.makedirs("results", exist_ok=True)
        path = f"results/{filename}"
        plt.savefig(path, dpi=300, bbox_inches='tight')
        plt.show()
        return path

# Set up paths and configure visualization
# setup_environment() returns the (data directory, results directory) pair
# for whichever environment branch defined it.
DATA_DIR, RESULTS_DIR = setup_environment()
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook")

# 1. Upload all files from data/class_data to S3 without any exclusions
# NOTE(review): this upload runs in the same cell as the environment setup and
# presumably needs AWS credentials with write access to the bucket — confirm
# it is meant to run here.
!aws s3 sync data/class_data/ s3://computational-genetic-genealogy/class_data/

# Note: This command uploads all files including large directories
# The upload may take some time depending on the size of the data
# If needed for a specific file, you can use:
# NOTE(review): specific_file.vcf.gz looks like a placeholder name — replace it
# with a real file before running this line.
!aws s3 cp data/class_data/specific_file.vcf.gz s3://computational-genetic-genealogy/class_data/

In [None]:
# Create S3 bucket (if it doesn't exist)
!aws s3 mb s3://computational-genetic-genealogy --region us-east-2

# Configure bucket for public access (only if needed for course materials)
!aws s3api put-public-access-block --bucket computational-genetic-genealogy \
  --public-access-block-configuration "BlockPublicAcls=false,IgnorePublicAcls=false,BlockPublicPolicy=false,RestrictPublicBuckets=false"

# Set bucket policy for public read access
!aws s3api put-bucket-policy --bucket computational-genetic-genealogy --policy '{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "PublicReadGetObject",
      "Effect": "Allow",
      "Principal": "*",
      "Action": "s3:GetObject",
      "Resource": "arn:aws:s3:::computational-genetic-genealogy/*"
    }
  ]
}'

### Uploading Data to S3

Upload class data to the S3 bucket:

In [None]:
# 1. Upload all files from data/class_data to S3 (excluding large subdirectories)
# The command is continued across lines with trailing backslashes; each
# --exclude pattern names a set of paths that the sync should skip.
!aws s3 sync data/class_data/ s3://computational-genetic-genealogy/class_data/ \
  --exclude "*/phased_samples/*" \
  --exclude "*/unphased_samples/*" \
  --exclude "*/segments/*" \
  --exclude "*opensnps_data_autosomes/segments/*" \
  --exclude "*opensnps_data_autosomes/unphased_samples/*"

# 2. Upload specific large files if needed
# NOTE(review): specific_large_file.vcf.gz looks like a placeholder name —
# substitute a real file before running this line.
!aws s3 cp data/class_data/specific_large_file.vcf.gz s3://computational-genetic-genealogy/class_data/

# 3. Create empty directory placeholders
# The four keys below end in "/" and are written without a body.
!aws s3api put-object --bucket computational-genetic-genealogy --key class_data/merged_opensnps_data_autosomes/
!aws s3api put-object --bucket computational-genetic-genealogy --key class_data/merged_opensnps_data_autosomes/phased_samples/
!aws s3api put-object --bucket computational-genetic-genealogy --key class_data/merged_opensnps_data_autosomes/segments/
!aws s3api put-object --bucket computational-genetic-genealogy --key class_data/merged_opensnps_data_autosomes/unphased_samples/

### Managing S3 Data

Commands for managing data in the S3 bucket:

In [None]:
# List all objects in the bucket
# (the listing is piped through `head -20`, so only its first 20 lines are shown)
!aws s3 ls s3://computational-genetic-genealogy/ --recursive | head -20

# Count total objects
# (`wc -l` counts the lines of the recursive listing)
!aws s3 ls s3://computational-genetic-genealogy/ --recursive | wc -l

# Check if specific file exists
!aws s3api head-object --bucket computational-genetic-genealogy --key class_data/pedigree.fam

# Delete a specific file
# NOTE(review): unwanted_file.txt is presumably a placeholder key — confirm the
# real key before running.
!aws s3 rm s3://computational-genetic-genealogy/class_data/unwanted_file.txt

# Delete a directory
# (--recursive applies the removal to everything under the old_data/ prefix)
!aws s3 rm s3://computational-genetic-genealogy/old_data/ --recursive

### Testing S3 Access

Verify that files are accessible via HTTP:

In [None]:
# Test access to a file via curl
# (-I requests only the response headers, not the file body)
!curl -I https://computational-genetic-genealogy.s3.us-east-2.amazonaws.com/class_data/pedigree.fam

# View content of a small file
# (-s silences curl's progress output; `head -5` keeps the first five lines)
!curl -s https://computational-genetic-genealogy.s3.us-east-2.amazonaws.com/class_data/pedigree.fam | head -5

## 2. Google Colab Integration

To prepare notebooks for Google Colab, add the following code to detect the Colab environment and set up data access.

In [None]:
# Code to add at the beginning of notebooks for Colab compatibility

def is_colab():
    """Return True when running inside Google Colab, otherwise False.

    Detection works by attempting to import ``google.colab``; an ImportError
    is taken to mean the notebook is running somewhere else.
    """
    detected = True
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        detected = False
    return detected

# Example usage:
if is_colab():
    print("Running in Google Colab environment")

    # Option 1: Mount Google Drive (if students want to save data)
    from google.colab import drive
    drive.mount('/content/drive')

    # Option 2: Just create directories for this session.
    # Each path is defined once and used both to create the directory and to
    # set the environment variable, so the two cannot drift apart if the
    # notebook's working directory is not /content.
    import os
    colab_data_dir = '/content/class_data'
    colab_results_dir = '/content/results'
    os.makedirs(colab_data_dir, exist_ok=True)
    os.makedirs(colab_results_dir, exist_ok=True)

    # Set environment variables
    os.environ['DATA_DIR'] = colab_data_dir
    os.environ['RESULTS_DIR'] = colab_results_dir
else:
    print("Running in local environment")

### Code for Downloading S3 Data in Colab

Add this code to notebooks to download required data files:

In [None]:
# Code to download required data files from S3

def download_class_data(files, base_dir="class_data", s3_base=None):
    """Download required course data files into a local directory.

    Each file is fetched with ``urllib.request.urlretrieve`` so a failed
    transfer is detected: "Downloaded ..." is printed only for files that
    were actually retrieved, and a "Failed to download ..." line is printed
    for the others. A failure does not stop the remaining downloads.

    Args:
        files: Iterable whose entries are either a filename (used as both the
            remote path and the local name) or a tuple
            ``(s3_path, local_name)``.
        base_dir: Local directory to store files.
        s3_base: Base URL that each ``s3_path`` is appended to. Defaults to
            the public ``class_data/`` prefix of the course bucket.
    """
    import os
    import urllib.parse
    import urllib.request

    if s3_base is None:
        s3_base = "https://computational-genetic-genealogy.s3.us-east-2.amazonaws.com/class_data/"

    # os.makedirs("") raises, so only create the base directory when one is named
    if base_dir:
        os.makedirs(base_dir, exist_ok=True)

    for file_info in files:
        if isinstance(file_info, tuple):
            s3_path, local_name = file_info
        else:
            s3_path = local_name = file_info

        # Ensure subdirectories exist (local_name may contain "/")
        local_path = os.path.join(base_dir, local_name)
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Percent-encode the remote path (keeping "/" separators) so names
        # containing spaces or other reserved characters form a valid URL.
        url = s3_base + urllib.parse.quote(s3_path, safe="/")
        try:
            urllib.request.urlretrieve(url, local_path)
        except OSError as err:  # URLError and HTTPError are OSError subclasses
            print(f"Failed to download {s3_path}: {err}")
            continue
        print(f"Downloaded {s3_path} to {local_path}")

# Example usage: fetch this lab's files only when running in Colab
if is_colab():
    download_class_data(
        files=[
            "pedigree.fam",
            "pedigree.def",
            "merged_opensnps_autosomes_ped_sim.seg",
            # Subdirectory example: a (remote path, local name) pair
            (
                "raw_dna_profiles/user12214_file10061_yearofbirth_2001_sex_XY.ancestry.txt",
                "raw_dna_profiles/user12214.ancestry.txt",
            ),
        ],
    )

### Google Colab Embedding

Instructions for embedding Colab notebooks in HTML pages:

```html
<!-- Example of embedding a Colab notebook in an HTML page -->
<iframe
  src="https://colab.research.google.com/github/lakishadavid/computational_genetic_genealogy/blob/main/labs_v2/Lab01_IBD_and_Genealogy_Intro.ipynb"
  width="100%" height="800px">
</iframe>
```

Key parameters for the iframe URL:
- GitHub username: `lakishadavid` 
- Repository: `computational_genetic_genealogy`
- Branch: `main`
- Path to notebook: `labs_v2/Lab01_IBD_and_Genealogy_Intro.ipynb`

## 3. JupyterLite Integration

JupyterLite provides a fully client-side, in-browser Jupyter experience. It doesn't require server setup and can include data files directly in the build.

In [None]:
# View the JupyterLite build script
# (the path is relative, so this must be run from the directory that contains scripts_support/)
!cat scripts_support/build_jupyter.sh

### JupyterLite Embedding

Instructions for embedding JupyterLite notebooks in HTML pages:

```html
<!-- Example of embedding a JupyterLite notebook in an HTML page -->
<iframe
  src="jupyterlite_app/lab/index.html?path=Lab01_IBD_and_Genealogy_Intro.ipynb"
  width="100%" height="800px">
</iframe>
```

Key parameters:
- Base path to JupyterLite app: `jupyterlite_app/lab/index.html`
- Notebook path: `Lab01_IBD_and_Genealogy_Intro.ipynb`
- Additional options (appended to the URL query string): `kernel=python&toolbar=1`

## 4. Template Code for Cross-Platform Compatibility

Use this template to make notebooks work across environments (local, Colab, JupyterLite):

In [None]:
# Cross-environment compatibility code

import os
import sys

def setup_environment():
    """Set up environment-specific paths and dependencies.

    The environment is detected from ``sys.modules``: Google Colab when
    ``google.colab`` is loaded, JupyterLite when ``pyodide`` is loaded,
    otherwise a local installation.

    * Colab: creates ``class_data``, downloads the files listed in
      ``files_to_download`` from the course bucket, and reports each file as
      downloaded or failed.
    * JupyterLite: uses the ``class_data`` and ``results`` paths; no download.
    * Local: reads ``DATA_DIR`` / ``RESULTS_DIR`` from the environment, with
      ``data/class_data`` and ``results`` as defaults.

    The results directory is created in every environment.

    Returns:
        Tuple ``(data_dir, results_dir)``.
    """
    import urllib.request

    # Check for Google Colab
    in_colab = 'google.colab' in sys.modules

    # Check for JupyterLite
    in_jupyterlite = 'pyodide' in sys.modules

    if in_colab:
        print("Setting up Google Colab environment")

        # Set paths
        data_dir = 'class_data'
        results_dir = 'results'

        # Create the data directory (results is created below for all branches)
        os.makedirs(data_dir, exist_ok=True)

        # Download required data (customize for each notebook)
        s3_base = "https://computational-genetic-genealogy.s3.us-east-2.amazonaws.com/class_data/"
        files_to_download = ["pedigree.fam", "pedigree.def"]

        for file in files_to_download:
            try:
                urllib.request.urlretrieve(f"{s3_base}{file}", os.path.join(data_dir, file))
            except OSError as err:  # URLError and HTTPError are OSError subclasses
                # Report the failure instead of claiming success, then keep going
                print(f"Failed to download {file}: {err}")
            else:
                print(f"Downloaded {file}")

    elif in_jupyterlite:
        print("Setting up JupyterLite environment")
        # In JupyterLite, data is pre-loaded in the files/ directory
        data_dir = 'class_data'
        results_dir = 'results'

    else:
        print("Setting up local environment")
        # Use environment variables or default paths
        data_dir = os.getenv('DATA_DIR', 'data/class_data')
        results_dir = os.getenv('RESULTS_DIR', 'results')

    # Ensure results directory exists in every environment, so code that
    # writes into it behaves the same in Colab, JupyterLite and locally.
    os.makedirs(results_dir, exist_ok=True)

    return data_dir, results_dir

# Usage
# setup_environment() returns (data_dir, results_dir); unpack them into the
# two module-level names.
DATA_DIR, RESULTS_DIR = setup_environment()

## 5. Notebook Template for S3 Data Access

Use this template for creating new lab notebooks with S3 data access:

In [None]:
# Template S3 data access code to include in notebooks

def download_s3_data():
    """Download required data files from S3 for this lab.

    Does nothing outside Google Colab (detected by importing
    ``google.colab``) apart from printing a notice. In Colab, each entry of
    ``FILES`` is fetched from ``S3_BASE`` into ``class_data/``. A file is
    reported as downloaded only if its transfer succeeded, and the final
    success message is printed only when every file was retrieved; otherwise
    the failed files are listed.

    Returns:
        None.
    """
    # Check if running in Google Colab
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        print("Not running in Colab, skipping download")
        return

    import os
    import urllib.request

    # Create directories
    data_dir = "class_data"
    os.makedirs(data_dir, exist_ok=True)

    # S3 base URL
    S3_BASE = "https://computational-genetic-genealogy.s3.us-east-2.amazonaws.com/class_data/"

    # Files needed for this specific lab
    FILES = [
        "pedigree.fam",
        "pedigree.def",
        "merged_opensnps_autosomes_ped_sim.seg"
    ]

    # Download each file, remembering the ones that could not be fetched
    failed = []
    for file in FILES:
        try:
            urllib.request.urlretrieve(S3_BASE + file, os.path.join(data_dir, file))
        except OSError as err:  # URLError and HTTPError are OSError subclasses
            print(f"Failed to download {file}: {err}")
            failed.append(file)
        else:
            print(f"Downloaded {file}")

    if failed:
        print(f"{len(failed)} of {len(FILES)} files could not be downloaded: {', '.join(failed)}")
    else:
        print("All required data files downloaded successfully!")
    
# Download data when running in Colab
# (download_s3_data() checks the environment itself and skips the download elsewhere)
download_s3_data()