# Lab 1: Exploring Genetic Data

This is the JupyterLite version of Lab 1, adapted to run entirely in your browser. 

## Introduction

In this lab, we'll explore genetic data from the 1000 Genomes Project. We'll learn how to load, filter, and analyze genetic sample information to understand population structures.

## Environment Setup

First, let's set up our environment by installing and importing the necessary libraries:

In [None]:
# Install the third-party packages into the in-browser Python environment.
# The top-level `await` relies on the notebook kernel supporting it.
# NOTE(review): plotly is installed here but not imported in the cells shown.
import micropip
await micropip.install(['pandas', 'numpy', 'matplotlib', 'plotly'])

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): open_url and Counter are not used in the cells shown here.
from pyodide.http import open_url
from collections import Counter

print("Environment setup complete!")

## Loading Sample Data

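Instead of accessing files from the local filesystem, a hosted version of this lab would load a preprocessed sample dataset through a URL. For this example, we'll use a small mock dataset embedded directly in the notebook: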

In [None]:
# Function to load the sample data (built-in mock data stands in for a hosted file)
def load_sample_data():
    """Load the sample summary table into a pandas DataFrame.

    No download takes place: the lab ships a small built-in mock dataset.
    The rows are stored as tuples and joined with tab characters at runtime,
    so the table does not depend on literal tab characters surviving in the
    notebook source (an editor converting tabs to spaces would otherwise
    silently break the parse).

    Returns:
        A pandas DataFrame with the columns Sample, Family ID, Population,
        Population Description, Gender and Relationship (empty Relationship
        fields are read as missing values), or None if parsing fails, in
        which case the error is printed.
    """
    from io import StringIO

    # Placeholder location for a hosted copy of the data (would be replaced
    # with your actual hosted file):
    # https://raw.githubusercontent.com/internationagenome/sample-summaries/main/sample_data.tsv

    columns = ("Sample", "Family ID", "Population", "Population Description",
               "Gender", "Relationship")

    # Population code -> human-readable description
    descriptions = {
        "GBR": "British in England and Scotland",
        "YRI": "Yoruba in Ibadan, Nigeria",
        "ASW": "African Ancestry in Southwest US",
    }

    # (sample ID, population code, gender, relationship).
    # In this mock data the Family ID of every row is the sample ID itself.
    samples = [
        ("HG00096", "GBR", "male", ""),
        ("HG00097", "GBR", "female", ""),
        ("HG00099", "GBR", "female", ""),
        ("HG00100", "GBR", "female", ""),
        ("HG00101", "GBR", "male", ""),
        ("NA18486", "YRI", "female", "parent"),
        ("NA18487", "YRI", "male", "parent"),
        ("NA18488", "YRI", "female", "child"),
        ("NA18489", "YRI", "male", "parent"),
        ("NA18498", "YRI", "female", "parent"),
        ("NA18499", "YRI", "male", "child"),
        ("NA19625", "ASW", "female", "parent"),
        ("NA19626", "ASW", "male", "parent"),
        ("NA19627", "ASW", "female", "child"),
        ("NA19701", "ASW", "male", "parent"),
        ("NA19702", "ASW", "female", "parent"),
        ("NA19703", "ASW", "male", "child"),
        ("NA19704", "ASW", "female", "child"),
        ("NA19819", "ASW", "female", "parent"),
        ("NA19834", "ASW", "male", "parent"),
    ]

    # Assemble the tab-separated text: header first, then one line per sample
    lines = ["\t".join(columns)]
    for sample_id, population, gender, relationship in samples:
        fields = (sample_id, sample_id, population, descriptions[population],
                  gender, relationship)
        lines.append("\t".join(fields))
    mock_data = "\n".join(lines) + "\n"

    try:
        # Load the data into a pandas DataFrame
        return pd.read_csv(StringIO(mock_data), sep='\t')
    except Exception as e:
        # Best-effort loader: report the problem and let the caller check for None
        print(f"Error loading data: {e}")
        return None

# Build the sample table used by every later cell
sample_df = load_sample_data()

# Report the outcome; preview the first few rows only when loading succeeded
if sample_df is None:
    print("Failed to load sample data")
else:
    print(f"Loaded {len(sample_df)} samples")
    display(sample_df.head())

## Exploring the Sample Data

Let's explore the sample data to better understand the populations and relationships:

In [None]:
# Columns that identify each sample and the population it belongs to
basic_cols = ['Sample', 'Family ID', 'Population', 'Population Description', 'Gender']
basic_info = sample_df.loc[:, basic_cols]

print("First few rows of basic sample information:")
display(basic_info.head())

# Number of samples per population code
print("\nSummary of populations in the dataset:")
population_counts = sample_df['Population'].value_counts()
display(population_counts)

# One row per distinct (code, description) pair
print("\nUnique population descriptions:")
display(sample_df.loc[:, ['Population', 'Population Description']].drop_duplicates())

## Visualizing Population Distribution

Let's create a simple visualization of population distribution in our sample:

In [None]:
# Create a bar chart of population counts
# (population_counts comes from the exploration cell and must already exist).
# A new figure is opened first so that the chart is drawn at the requested size.
plt.figure(figsize=(10, 6))
population_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Samples by Population')
plt.xlabel('Population')
plt.ylabel('Number of Samples')
plt.xticks(rotation=45)  # Tilt the population codes on the x-axis
plt.tight_layout()
plt.show()

# Create a pie chart showing gender distribution
gender_counts = sample_df['Gender'].value_counts()
plt.figure(figsize=(8, 8))
# The two colors are applied to the wedges in the order of gender_counts
# (i.e. by position, not by label).
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90, colors=['lightblue', 'lightpink'])
plt.title('Gender Distribution in Sample')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
plt.tight_layout()
plt.show()

## Filtering Data

Let's practice filtering the data to focus on specific populations or attributes:

In [None]:
# Keep only the rows whose population code is 'YRI'
yri_population = sample_df.loc[sample_df['Population'].eq('YRI')]
print(f"Number of YRI samples: {len(yri_population)}")
display(yri_population)

# Keep only the female samples
females = sample_df.loc[sample_df['Gender'].eq('female')]
print(f"Number of female samples: {len(females)}")
display(females.head())

# Combine two conditions with & (element-wise AND): 'YRI' population and 'female' gender
yri_females = sample_df.loc[sample_df['Population'].eq('YRI') & sample_df['Gender'].eq('female')]
print(f"Number of YRI female samples: {len(yri_females)}")
display(yri_females)

## Analyzing Family Relationships

Let's explore family relationships in our dataset:

In [None]:
# Tally each relationship label, counting missing values as their own category
print("Distribution of relationship types:")
relationship_counts = sample_df['Relationship'].value_counts(dropna=False)
display(relationship_counts)

# A family group is any Family ID that is shared by more than one sample.
# In the built-in mock data every Family ID equals its own Sample ID,
# so this table comes out empty.
print("\nIdentifying family groups:")
family_groups = sample_df.loc[sample_df['Family ID'].duplicated(keep=False)].sort_values(by='Family ID')
display(family_groups)

# Count the distinct Family IDs among samples that have a relationship label
unique_families = sample_df.loc[sample_df['Relationship'].notna(), 'Family ID'].nunique()
print(f"\nNumber of families with relationship information: {unique_families}")

## Saving Your Progress

In this browser environment, we can save our work to browser storage:

In [None]:
# Create a function to save data to browser storage
def save_to_storage(data_dict):
    """Save data to browser storage for later use.

    A DataFrame stored under the 'sample_df' key is serialized with
    DataFrame.to_json(); every other value must already be JSON-serializable.
    The resulting JSON text is written to localStorage under 'lab1_data'.

    Args:
        data_dict: Dictionary of values to store. It is left unmodified.

    Returns:
        True if the data was stored, False if anything failed (the error is
        printed rather than raised).
    """
    try:
        from js import localStorage
        import json

        # Work on a shallow copy so the caller's dict keeps its DataFrame
        # instead of having it replaced by a JSON string.
        payload = dict(data_dict)

        # Convert DataFrame to JSON
        if isinstance(payload.get('sample_df'), pd.DataFrame):
            payload['sample_df'] = payload['sample_df'].to_json()

        # Save to localStorage
        localStorage.setItem('lab1_data', json.dumps(payload))
        print("Data saved successfully!")
        return True
    except Exception as e:
        print(f"Error saving data: {e}")
        return False

# Create a function to load data from browser storage
def load_from_storage():
    """Load previously saved data from browser storage.

    Reads the JSON text stored in localStorage under 'lab1_data' and, when
    it contains a 'sample_df' entry, converts that entry back into a pandas
    DataFrame.

    Returns:
        The restored dictionary, or None when nothing has been saved or when
        loading fails (the error is printed rather than raised).
    """
    try:
        from io import StringIO
        from js import localStorage
        import json

        # Get data from localStorage
        stored_data = localStorage.getItem('lab1_data')
        if not stored_data:
            print("No saved data found.")
            return None

        data_dict = json.loads(stored_data)

        # Convert JSON back to DataFrame. The text is wrapped in StringIO
        # because passing a literal JSON string to read_json is deprecated
        # in recent pandas versions.
        if 'sample_df' in data_dict:
            data_dict['sample_df'] = pd.read_json(StringIO(data_dict['sample_df']))

        print("Data loaded successfully!")
        return data_dict
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Persist the current lab state: the sample table, a completion flag and the save time
save_to_storage(dict(
    sample_df=sample_df,
    lab_complete=True,
    timestamp=pd.Timestamp.now().isoformat(),
))

## Conclusion

In this lab, we explored genetic sample data from the 1000 Genomes Project. We learned how to:

1. Load and inspect sample data
2. Filter data based on population and other attributes
3. Visualize population distributions
4. Identify family relationships
5. Save our progress to browser storage

This provides a foundation for the next labs, where we'll explore more complex genetic analyses.