# Lab 2: Processing Raw DNA Profiles

This is the JupyterLite version of Lab 2, adapted to run entirely in your browser.

## Introduction

In this lab, we'll learn how to process raw DNA profiles from commercial testing companies. We'll practice with a simplified dataset to understand the fundamental concepts of DNA data processing, file format conversion, and genetic variant representation.

## Environment Setup

First, let's set up our environment by importing the necessary libraries:

In [None]:
# Install the third-party packages this lab needs into the in-browser runtime.
# The top-level `await` relies on the notebook kernel permitting await at
# cell scope.
# NOTE(review): 'plotly' is installed here but never imported in the cells
# visible in this notebook -- confirm whether it is still required.
import micropip
await micropip.install(['pandas', 'numpy', 'matplotlib', 'plotly'])

# Data handling and plotting libraries used by the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): open_url and StringIO are imported at notebook level but are
# not referenced at notebook level by any cell visible here -- confirm usage.
from pyodide.http import open_url
from io import StringIO
import json

print("Environment setup complete!")

## Loading Data From Previous Lab

Let's check if we have data saved from Lab 1:

In [None]:
def load_from_storage(lab_id='lab1'):
    """Load a lab's previously saved results from browser localStorage.

    Reads the ``f'{lab_id}_data'`` entry, decodes it as JSON and, when the
    decoded payload contains a ``'sample_df'`` entry, turns that JSON text
    back into a pandas DataFrame. All other entries are returned exactly as
    decoded from JSON.

    Args:
        lab_id: Prefix of the storage key to read.

    Returns:
        The decoded payload, or ``None`` when nothing is stored under the key
        or when loading fails for any reason. Failures are printed rather
        than raised so the notebook can continue with fallback data.
    """
    try:
        # Local imports keep the function self-contained; the ``js`` bridge
        # is only importable inside the browser runtime, so its absence is
        # reported through the except branch below.
        import json
        from io import StringIO

        from js import localStorage

        stored_data = localStorage.getItem(f'{lab_id}_data')
        if not stored_data:
            print(f"No saved data found from {lab_id}.")
            return None

        data_dict = json.loads(stored_data)

        if 'sample_df' in data_dict:
            # read_json treats a bare string as a path/URL in newer pandas
            # (literal JSON strings are deprecated), so hand it a file-like
            # object instead.
            data_dict['sample_df'] = pd.read_json(StringIO(data_dict['sample_df']))

        print(f"Data from {lab_id} loaded successfully!")
        return data_dict
    except Exception as e:
        # Deliberately best-effort: a missing or corrupt entry must not stop
        # the lab, so report the problem and signal "no data".
        print(f"Error loading data: {str(e)}")
        return None

# Load data from Lab 1
lab1_data = load_from_storage('lab1')

# Reuse the sample table saved by Lab 1 when it is available
if lab1_data and 'sample_df' in lab1_data:
    sample_df = lab1_data['sample_df']
    print(f"Using sample data from Lab 1 ({len(sample_df)} samples)")
    display(sample_df.head())
else:
    print("No data found from Lab 1, loading mock data instead...")
    # Nothing is loaded in this cell; the mock DNA profile is generated in the
    # next section. Bind sample_df explicitly so that any later reference
    # sees None instead of failing with a NameError.
    sample_df = None

## Loading Sample Raw DNA Profile

Now let's load a sample raw DNA profile in a format similar to what consumer testing companies provide:

In [None]:
# Function to create a mock DNA profile
def create_mock_dna_profile(sample_size=50):
    """Create a mock DNA profile similar to consumer testing formats.

    Every row is one randomly generated variant with an ``rs``-prefixed
    identifier, a chromosome label ('1'-'22', 'X' or 'Y'), a position and a
    two-letter genotype.

    Args:
        sample_size: Number of variants (rows) to generate.

    Returns:
        A DataFrame with columns ``rsid``, ``chromosome``, ``position`` and
        ``genotype``, sorted by chromosome in karyotype order (1-22, X, Y)
        and then by position, with a fresh 0..n-1 index.
    """
    chromosome_names = [str(n) for n in range(1, 23)] + ['X', 'Y']

    # Relative sampling weights, one per chromosome name above. The raw
    # values add up to 0.94 and np.random.choice rejects probabilities that
    # do not sum to 1, so they are normalised before use.
    weights = np.array(
        [0.06, 0.06, 0.05, 0.05, 0.05, 0.05, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
         0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.06, 0.02],
        dtype=float,
    )
    weights /= weights.sum()

    chromosomes = np.random.choice(chromosome_names, size=sample_size, p=weights)

    # Generate positions: a new random start whenever the drawn chromosome
    # changes, otherwise a random step forward from the previous position.
    # The rows are put into genomic order by the sort at the end.
    positions = []
    current_chrom = ''
    current_pos = 10000

    for chrom in chromosomes:
        if chrom != current_chrom:
            current_chrom = chrom
            current_pos = np.random.randint(100000, 1000000)
        else:
            current_pos += np.random.randint(1000, 100000)
        positions.append(current_pos)

    # Generate random rsIDs
    rsids = [f'rs{np.random.randint(100000, 9999999)}' for _ in range(sample_size)]

    # Generate random genotypes: 70% homozygous, 30% heterozygous
    nucleotides = ['A', 'C', 'G', 'T']
    genotypes = []

    for _ in range(sample_size):
        if np.random.random() < 0.7:
            # Homozygous: the same allele twice
            allele = np.random.choice(nucleotides)
            genotypes.append(f'{allele}{allele}')
        else:
            # Heterozygous: two different alleles (drawn without replacement)
            allele1, allele2 = np.random.choice(nucleotides, size=2, replace=False)
            genotypes.append(f'{allele1}{allele2}')

    df = pd.DataFrame({
        'rsid': rsids,
        'chromosome': chromosomes,
        'position': positions,
        'genotype': genotypes
    })

    # Sort by chromosome and position. Chromosome labels are strings, so a
    # plain sort would put '10' before '2'; rank them in karyotype order via
    # a temporary helper column instead.
    chrom_rank = {name: rank for rank, name in enumerate(chromosome_names)}
    df = (
        df.assign(_chrom_rank=df['chromosome'].map(chrom_rank))
        .sort_values(['_chrom_rank', 'position'])
        .drop(columns='_chrom_rank')
        .reset_index(drop=True)
    )

    return df

# Create a mock DNA profile
dna_profile = create_mock_dna_profile(100)

# Display the first few rows
print("Sample raw DNA profile:")
display(dna_profile.head(10))

# Basic statistics
print("\nBasic profile statistics:")
print(f"Total variants: {len(dna_profile)}")
print("Variants per chromosome:")
# Chromosome labels are strings, so a plain sort_index() would list '10'
# before '2'. Rank the labels in karyotype order (1-22, X, Y) instead; any
# label outside that set is placed last.
chrom_rank = {
    name: rank
    for rank, name in enumerate([str(n) for n in range(1, 23)] + ['X', 'Y'])
}
display(
    dna_profile['chromosome'].value_counts().sort_index(
        key=lambda labels: labels.map(lambda c: chrom_rank.get(c, len(chrom_rank)))
    )
)

## Analyzing the Raw DNA Profile

Let's explore the structure and content of the raw DNA profile:

In [None]:
# Tally how often each genotype string occurs in the profile
genotype_counts = dna_profile['genotype'].value_counts()
print("Genotype frequencies:")
display(genotype_counts)

# A call counts as heterozygous when its two allele letters differ
heterozygous_count = sum(call[0] != call[1] for call in dna_profile['genotype'])
heterozygosity_rate = heterozygous_count / len(dna_profile)
print(f"\nHeterozygosity rate: {heterozygosity_rate:.2%}")

# Bar chart of the genotype tallies
plt.figure(figsize=(10, 6))
genotype_counts.plot.bar(color='skyblue')
plt.xlabel('Genotype')
plt.ylabel('Count')
plt.title('Distribution of Genotypes in DNA Profile')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Converting to Standard Format (VCF-like)

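Now, let's convert the raw DNA profile to a format that resembles the VCF (Variant Call Format) standard used in bioinformatics. Note that a real VCF takes its REF allele from a reference genome; because we have no reference here, this simplified conversion treats the first allele of each genotype as the reference: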

In [None]:
_VCF_COLUMNS = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL',
                'FILTER', 'INFO', 'FORMAT', 'SAMPLE']


def _genotype_to_vcf_fields(genotype):
    """Return the (REF, ALT, GT) triple for one raw genotype call."""
    # Missing values, empty strings and calls containing '-' are no-calls.
    if not isinstance(genotype, str) or not genotype or '-' in genotype:
        return 'N', '.', './.'

    ref_allele = genotype[0]

    # A single letter is a haploid call: one allele, which by this lab's
    # convention is the reference.
    if len(genotype) == 1:
        return ref_allele, '.', '0'

    second_allele = genotype[1]
    if second_allele == ref_allele:
        return ref_allele, '.', '0/0'  # Homozygous for the "reference"
    return ref_allele, second_allele, '0/1'  # Heterozygous


def convert_to_vcf_format(dna_df):
    """Convert a raw DNA profile to a VCF-like table.

    No reference genome is available, so the first allele of each genotype
    is used as REF and the second, when different, as ALT. As a consequence
    a homozygous call is always reported as ``0/0``; ``1/1`` can never be
    produced by this simplified scheme.

    Args:
        dna_df: DataFrame with ``rsid``, ``chromosome``, ``position`` and
            ``genotype`` columns.

    Returns:
        A DataFrame with the columns CHROM, POS, ID, REF, ALT, QUAL, FILTER,
        INFO, FORMAT and SAMPLE (the GT value), one row per input row. The
        columns are present even when the input is empty. Haploid calls get
        GT ``0`` and no-calls get REF ``N`` with GT ``./.``.
    """
    vcf_data = []

    # Iterate the four needed columns in lockstep instead of iterrows(),
    # which builds a Series for every row.
    for chrom, pos, rsid, genotype in zip(dna_df['chromosome'], dna_df['position'],
                                          dna_df['rsid'], dna_df['genotype']):
        ref_allele, alt_allele, gt = _genotype_to_vcf_fields(genotype)
        vcf_data.append({
            'CHROM': chrom,
            'POS': pos,
            'ID': rsid,
            'REF': ref_allele,
            'ALT': alt_allele,
            'QUAL': '.',
            'FILTER': 'PASS',
            'INFO': '.',
            'FORMAT': 'GT',
            'SAMPLE': gt
        })

    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(vcf_data, columns=_VCF_COLUMNS)

# Run the conversion on the DNA profile built earlier
vcf_df = convert_to_vcf_format(dna_profile)

# Preview the first ten converted records
print("VCF-like format:")
display(vcf_df.iloc[:10])

## Visualization of Variant Distribution

Let's visualize the distribution of genetic variants across chromosomes:

In [None]:
# Count variants per chromosome. No sort is applied here: sorting the string
# labels would be lexicographic ('10' before '2'), and the counts are put
# into natural chromosome order a few lines below.
chrom_counts = dna_profile['chromosome'].value_counts()

# Reorder chromosomes in natural order
def natural_sort_key(chrom):
    """Return an integer key that sorts chromosome labels in karyotype order.

    Numeric labels map to their integer value, 'X' to 23 and 'Y' to 24.
    Labels are matched case-insensitively and an optional 'chr' prefix is
    ignored. 'XY' maps to 25 and 'MT'/'M' to 26 so that these labels, which
    appear in some raw data exports, sort after Y.

    Args:
        chrom: Chromosome label such as '7', 'X' or 'chrY'. Non-string
            values are passed straight to ``int()``.

    Returns:
        The integer sort key.

    Raises:
        ValueError: If the label is neither numeric nor a known name.
    """
    if not isinstance(chrom, str):
        return int(chrom)

    label = chrom.strip()
    if label[:3].lower() == 'chr':
        label = label[3:]

    named = {'X': 23, 'Y': 24, 'XY': 25, 'MT': 26, 'M': 26}
    code = named.get(label.upper())
    return code if code is not None else int(label)

# Put the per-chromosome counts into natural chromosome order
chrom_counts = chrom_counts.sort_index(key=lambda labels: labels.map(natural_sort_key))

# Bar chart: number of variants on each chromosome
plt.figure(figsize=(12, 6))
chrom_counts.plot.bar(color='skyblue')
plt.xlabel('Chromosome')
plt.ylabel('Number of Variants')
plt.title('Distribution of Variants Across Chromosomes')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Ideogram-style view: one horizontal row per chromosome, with each variant
# drawn at its position along the x-axis
plt.figure(figsize=(14, 8))

# Assign every chromosome present in the profile a row number, in natural order
chrom_to_num = {
    name: row
    for row, name in enumerate(
        sorted(dna_profile['chromosome'].unique(), key=natural_sort_key)
    )
}

# Coordinates of the points
x = dna_profile['position']
y = dna_profile['chromosome'].map(chrom_to_num).tolist()

plt.scatter(x, y, color='blue', alpha=0.7, s=20)

# Label each row with its chromosome name
plt.yticks(list(chrom_to_num.values()), list(chrom_to_num))

plt.xlabel('Position on Chromosome')
plt.ylabel('Chromosome')
plt.title('Genomic Distribution of Variants')

# Faint horizontal guides, one per chromosome row
plt.grid(axis='y', linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()

## Simulating Relationship Data

To prepare for later labs, let's also simulate some IBD (Identity by Descent) segment data that would be used to infer relationships:

In [None]:
def simulate_ibd_segments(relationship_type, num_segments=None):
    """Simulate IBD segments for a given relationship type.

    Each segment gets a random autosome ('1'-'22'), a length in cM drawn
    from a normal distribution with the relationship's mean and standard
    deviation (floored at 1), a random start position, an end position
    derived from the length, and a random SNP count.

    Args:
        relationship_type: One of 'parent-child', 'full-sibling',
            'half-sibling', 'first-cousin', 'second-cousin', 'third-cousin'
            or 'unrelated'.
        num_segments: Number of segments to generate. When ``None`` (the
            default) the count is drawn uniformly from the relationship's
            ``min_segments``..``max_segments`` range, inclusive.

    Returns:
        A tuple ``(ibd_df, total_cm)``: a DataFrame with the columns
        ``chromosome``, ``start_position``, ``end_position``, ``length_cM``
        and ``SNPs`` (present even when no segment is generated), and the
        summed segment length rounded to two decimals (0 when empty).

    Raises:
        ValueError: If ``relationship_type`` is unknown or ``num_segments``
            is negative.
    """
    # Segment length distribution (cM) and segment-count range per relationship
    relationship_params = {
        'parent-child': {'mean_length': 70, 'std_length': 30, 'min_segments': 22, 'max_segments': 25},
        'full-sibling': {'mean_length': 50, 'std_length': 30, 'min_segments': 22, 'max_segments': 25},
        'half-sibling': {'mean_length': 30, 'std_length': 20, 'min_segments': 15, 'max_segments': 20},
        'first-cousin': {'mean_length': 15, 'std_length': 10, 'min_segments': 8, 'max_segments': 15},
        'second-cousin': {'mean_length': 8, 'std_length': 5, 'min_segments': 3, 'max_segments': 10},
        'third-cousin': {'mean_length': 5, 'std_length': 3, 'min_segments': 1, 'max_segments': 5},
        'unrelated': {'mean_length': 3, 'std_length': 1, 'min_segments': 0, 'max_segments': 2}
    }

    if relationship_type not in relationship_params:
        raise ValueError(f"Unknown relationship type: {relationship_type}")

    params = relationship_params[relationship_type]

    # Determine number of segments. The table already encodes the 0-2 range
    # for 'unrelated', so no special case is needed; randint's upper bound
    # is exclusive, hence the + 1.
    if num_segments is None:
        num_segments = np.random.randint(params['min_segments'], params['max_segments'] + 1)
    elif num_segments < 0:
        raise ValueError(f"num_segments must be non-negative, got {num_segments}")

    autosomes = [str(n) for n in range(1, 23)]
    segments = []

    for _ in range(num_segments):
        # Choose a random chromosome
        chrom = np.random.choice(autosomes)

        # Segment length in cM, never shorter than 1
        length = max(1, np.random.normal(params['mean_length'], params['std_length']))

        # Arbitrary genomic start; the end uses a rough 1 cM ~ 1,000,000 bp
        # conversion
        start_pos = np.random.randint(1000000, 100000000)
        end_pos = start_pos + int(length * 1000000)

        segments.append({
            'chromosome': chrom,
            'start_position': start_pos,
            'end_position': end_pos,
            'length_cM': round(length, 2),
            'SNPs': np.random.randint(100, 5000)
        })

    # Naming the columns keeps the schema intact when there are no segments
    ibd_df = pd.DataFrame(
        segments,
        columns=['chromosome', 'start_position', 'end_position', 'length_cM', 'SNPs'],
    )

    total_cm = round(ibd_df['length_cM'].sum(), 2) if len(ibd_df) > 0 else 0

    return ibd_df, total_cm

# Simulate IBD segments for every relationship type the simulator defines,
# listed from closest to most distant. 'half-sibling' is included so that the
# summary covers the same set of relationships as simulate_ibd_segments.
relationship_types = ['parent-child', 'full-sibling', 'half-sibling', 'first-cousin',
                      'second-cousin', 'third-cousin', 'unrelated']

# Create a summary table: one row per relationship type
summary_data = []

for rel_type in relationship_types:
    ibd_df, total_cm = simulate_ibd_segments(rel_type)
    summary_data.append({
        'Relationship': rel_type,
        'Number of Segments': len(ibd_df),
        'Total cM': total_cm,
        # A simulation may yield no segments (e.g. 'unrelated'); report 0
        # rather than taking the mean of an empty column.
        'Average Segment Length': round(ibd_df['length_cM'].mean(), 2) if len(ibd_df) > 0 else 0
    })

summary_df = pd.DataFrame(summary_data)
display(summary_df)

# Visualize the results
plt.figure(figsize=(10, 6))
plt.bar(summary_df['Relationship'], summary_df['Total cM'], color='skyblue')
plt.title('Total IBD Sharing by Relationship Type')
plt.xlabel('Relationship Type')
plt.ylabel('Total Shared cM')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Saving Progress

Let's save our work for use in future labs:

In [None]:
def save_to_storage(data_dict, lab_id='lab2'):
    """Save a dictionary of results to browser localStorage as JSON.

    The dictionary is serialised with ``json.dumps`` and written under the
    key ``f'{lab_id}_data'``. pandas DataFrames and Series are stored as
    their ``to_json()`` text; NumPy scalars and arrays are converted to the
    equivalent plain Python values so that one such value does not make the
    whole save fail.

    Args:
        data_dict: Mapping of names to values to persist.
        lab_id: Prefix of the storage key to write.

    Returns:
        ``True`` on success, ``False`` if anything goes wrong (the error is
        printed rather than raised).
    """
    try:
        # Local imports keep the function self-contained; the ``js`` bridge
        # is only importable inside the browser runtime, so its absence is
        # reported through the except branch below.
        import json

        import numpy as np
        from js import localStorage

        def to_jsonable(value):
            """Convert objects json.dumps cannot encode on its own."""
            if isinstance(value, (pd.DataFrame, pd.Series)):
                return value.to_json()
            if isinstance(value, np.generic):
                return value.item()
            if isinstance(value, np.ndarray):
                return value.tolist()
            raise TypeError(
                f"Object of type {type(value).__name__} is not JSON serializable"
            )

        # json.dumps calls to_jsonable only for values it cannot encode, so
        # plain strings, numbers, booleans, lists and dicts pass through
        # unchanged.
        payload = json.dumps(data_dict, default=to_jsonable)

        localStorage.setItem(f'{lab_id}_data', payload)
        print(f"Data saved successfully to {lab_id}_data!")
        return True
    except Exception as e:
        # Deliberately best-effort: a failed save must not stop the lab.
        print(f"Error saving data: {str(e)}")
        return False

# Persist this lab's outputs under the default 'lab2' storage key
save_to_storage(dict(
    dna_profile=dna_profile,
    vcf_data=vcf_df,
    relationship_summary=summary_df,
    lab_complete=True,
    timestamp=pd.Timestamp.now().isoformat(),
))

## Conclusion

In this lab, we've learned how to:

1. Work with raw DNA profile data similar to what consumer testing companies provide
2. Analyze the structure and content of DNA profiles
3. Convert raw DNA data to a standardized VCF-like format
4. Visualize the distribution of genetic variants across the genome
5. Simulate IBD segment data to see how the amount of shared DNA reflects how closely two people are related

These skills form the foundation for more advanced genetic genealogy analyses in the upcoming labs.