# Environment Setup

In [2]:
import os
import requests
import pandas as pd
import urllib3
from collections import Counter
from pathlib import Path
from dotenv import load_dotenv
# Suppress SSL verification warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [3]:
# Cross-environment bootstrap: the helper module picks the paths that fit
# wherever this notebook is running.
from scripts_support.lab_cross_compatibility import (
    is_jupyterlite,
    setup_environment,
)

# setup_environment() returns two values, kept here as the notebook-wide
# DATA_DIR and RESULTS_DIR so later cells can use them in any environment.
DATA_DIR, RESULTS_DIR = setup_environment()


Loaded environment variables from: /home/lakishadavid/computational_genetic_genealogy/.env


In [4]:
# Read the project's directory layout from environment variables.
# os.getenv returns None for any variable that is not set.
working_directory = os.getenv('PROJECT_WORKING_DIR')
data_directory = os.getenv('PROJECT_DATA_DIR')
references_directory = os.getenv('PROJECT_REFERENCES_DIR')
results_directory = os.getenv('PROJECT_RESULTS_DIR')
utils_directory = os.getenv('PROJECT_UTILS_DIR')

# Echo every resolved location; an unset variable shows up as "None".
for _label, _location in (
    ("Working", working_directory),
    ("Data", data_directory),
    ("References", references_directory),
    ("Results", results_directory),
    ("Utils", utils_directory),
):
    print(f"{_label} Directory: {_location}")

Working Directory: /home/lakishadavid/computational_genetic_genealogy
Data Directory: /home/lakishadavid/computational_genetic_genealogy/data
References Directory: /home/lakishadavid/computational_genetic_genealogy/references
Results Directory: /home/lakishadavid/computational_genetic_genealogy/results
Utils Directory: /home/lakishadavid/computational_genetic_genealogy/utils


## Introduction to the IGSR 30x GRCh38 Data Collection
The International Genome Sample Resource (IGSR) hosts the 30x GRCh38 data collection: 1000 Genomes Project samples sequenced to roughly 30x coverage and aligned to the GRCh38 human reference assembly. This resource is invaluable for researchers working in genomics because it offers high-quality, publicly available data sets. The 30x coverage supports a high level of accuracy and reliability in genomic studies.

## How to Download Populations File and Sample File

- https://www.internationalgenome.org/data-portal/data-collection/30x-grch38
- https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/
- Sample Info = https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/
- Sample Coordinates = https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20161101_phase3_pop_coords/

In [5]:
# Download the 1000 Genomes sample summary table into the references directory.
# Direct URL to the sample summary file
sample_summary_url = "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20140502_sample_summary_info/20140502_complete_sample_summary.txt"

# Define output file path
output_file = os.path.join(references_directory, "20140502_complete_sample_summary.txt")

print(f"Attempting to download sample summary data from: {sample_summary_url}")
print(f"Will save to: {output_file}")

try:
    # Make sure the destination folder exists before writing into it
    os.makedirs(references_directory, exist_ok=True)

    # TLS certificate verification is left on (the requests default) so the
    # download cannot be silently tampered with. The (connect, read) timeout
    # keeps the cell from hanging forever on a stalled connection.
    response = requests.get(sample_summary_url, timeout=(10, 120))
    print(f"\nResponse status code: {response.status_code}")

    # Check if request was successful
    if response.status_code == 200:
        # Save the data
        with open(output_file, "w", encoding='utf-8') as f:
            f.write(response.text)

        # Report where the file went and how large it is
        file_size = os.path.getsize(output_file)
        print("\nFile saved successfully!")
        print(f"Location: {output_file}")
        print(f"File size: {file_size:,} bytes")

        # Print first few lines to verify content (same encoding as the write)
        print("\nFirst few lines of downloaded file:")
        with open(output_file, 'r', encoding='utf-8') as f:
            print(f.readline().strip())  # Header line
            print(f.readline().strip())  # First data line
    else:
        print(f"Error: Got status code {response.status_code}")
        print(f"Error message: {response.text}")

# Network problems (including TLS failures and timeouts) and file-system
# problems are reported; anything else is a programming error and should surface.
except (requests.RequestException, OSError) as e:
    print(f"An error occurred: {e}")

print("\nDownload attempt completed.")

Attempting to download sample summary data from: https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20140502_sample_summary_info/20140502_complete_sample_summary.txt
Will save to: /home/lakishadavid/computational_genetic_genealogy/references/20140502_complete_sample_summary.txt

Response status code: 200

File saved successfully!
Location: /home/lakishadavid/computational_genetic_genealogy/references/20140502_complete_sample_summary.txt
File size: 787,798 bytes

First few lines of downloaded file:
Sample	Family ID	Population	Population Description	Gender	Relationship	Unexpected Parent/Child 	Non Paternity	Siblings	Grandparents	Avuncular	Half Siblings	Unknown Second Order	Third Order	Other Comments	In Low Coverage Pilot	LC Pilot Platforms	LC Pilot Centers	In High Coverage Pilot	HC Pilot Platforms	HC Pilot Centers	In Exon Targetted Pilot	ET Pilot Platforms	ET Pilot Centers	Has Sequence in Phase1	Phase1 LC Platform	Phase1 LC Centers	Phase1 E Platform	Phase1 E Centers	In Phase1 I

## Exploring the Sample and Population Files

Before diving into the analysis, it's a good idea to explore the sample and population files to get a sense of what the data looks like. We'll use Pandas to open these files and display the first few rows.


In [None]:
# Load the tab-separated sample summary file into a Pandas DataFrame.
# (pandas is already imported as `pd` in the imports cell at the top.)
sample_file_name = os.path.join(references_directory, "20140502_complete_sample_summary.txt")
try:
    sample_df = pd.read_csv(sample_file_name, sep='\t')
except FileNotFoundError as e:
    # Stop here with a clear message: carrying on would either raise a
    # NameError on `sample_df` or silently reuse a stale frame from an earlier run.
    raise FileNotFoundError(
        f"File not found: {sample_file_name}. Run the download cell above first."
    ) from e

print(sample_df.columns)

# Understanding the 1000 Genomes Project Sample Summary File in Genetic Genealogy

The 1000 Genomes Project sample summary file provides comprehensive information about individuals whose genomes were sequenced as part of this landmark project. For genetic genealogists, this data is particularly valuable for understanding population references and sample quality.

## Key Components for Genetic Genealogy

### Sample and Population Information
- **Sample**: Unique identifier for each individual
- **Family ID**: Family grouping information
- **Population**: Population code (e.g., GBR for British)
- **Population Description**: Detailed description of the population group
- **Gender**: Biological sex of the sample

### Family Relationships
The file includes detailed relationship information that's crucial for genetic genealogy:
- Unexpected Parent/Child relationships
- Non-Paternity cases
- Siblings
- Grandparents
- Avuncular (aunt/uncle) relationships
- Half Siblings
- Second and Third Order relationships

### Sequencing Quality Metrics
Important metrics that help assess the reliability of genetic matches:
- Coverage information (how well the genome was sequenced)
- Quality control indicators
- Verification scores for different genotyping platforms

### Technical Details for Advanced Analysis
- Chromosome-specific data (Y chromosome and mitochondrial DNA)
- Sequencing platform information
- Coverage statistics
- Quality control results

## Relevance to Genetic Genealogy Research

1. **Population References**: 
   - Helps understand genetic ancestry composition
   - Provides reference populations for ethnicity estimates

2. **Sample Quality Assessment**:
   - Indicates reliability of genetic matches
   - Helps evaluate the confidence level of genetic relationships

3. **Family Structure Verification**:
   - Useful for understanding how relationships are detected
   - Provides examples of verified family relationships

4. **Research Quality**:
   - Coverage metrics help assess the completeness of genetic information
   - Quality control indicators support confidence in genetic matches

This dataset serves as a fundamental resource for understanding how genetic relationships are validated and categorized in professional genetic genealogy research.

In [None]:
# Basic sample information: identification and population columns only
basic_cols = ['Sample', 'Family ID', 'Population', 'Population Description', 'Gender']
basic_info = sample_df.loc[:, basic_cols]

# Number of samples recorded under each population code
population_counts = sample_df['Population'].value_counts()

print("First few rows of basic sample information:")
display(basic_info.head())

print("\nSummary of populations in the dataset:")
display(population_counts)

# One row per distinct (code, description) pair
print("\nUnique population descriptions:")
display(sample_df.loc[:, ['Population', 'Population Description']].drop_duplicates())

In [None]:
# Block 1: Basic Population Exploration
print("Exploring population distributions in the 1000 Genomes data")
print("-" * 50)

# Row count of the summary table = total number of samples
total_samples = sample_df.shape[0]
print(f"Total number of samples: {total_samples:,}")

# Cross-tabulate population code (rows) against gender (columns)
pop_gender_summary = pd.crosstab(
    index=sample_df['Population'],
    columns=sample_df['Gender'],
)
print("\nGender distribution by population:")
display(pop_gender_summary)

In [None]:
# Explore family relationships
# The first three columns identify the sample; the rest hold relationship annotations
relationship_cols = [
    'Sample', 'Family ID', 'Population',
    'Relationship', 'Siblings', 'Grandparents', 'Avuncular',
    'Half Siblings', 'Unknown Second Order', 'Third Order',
]
family_info = sample_df.loc[:, relationship_cols]

# True wherever a relationship column (position 3 onward) holds a value
_relationship_present = family_info.iloc[:, 3:].notna()

print("Samples with recorded family relationships:")
# Per-column count of samples that have that relationship recorded
relationship_counts = _relationship_present.sum()
display(relationship_counts)

# Show the first rows that have at least one relationship recorded
print("\nExamples of families with recorded relationships:")
display(family_info.loc[_relationship_present.any(axis=1)].head())

In [None]:
# Explore sequencing quality metrics
quality_cols = [
    'Sample',
    'Population',
    'Total LC Sequence',
    'LC Non Duplicated Aligned Coverage',
    'Total Exome Sequence',
    '% Targets Covered to 20x or greater',
    'LC Passed QC',
    'E Passed QC',
]
quality_info = sample_df.loc[:, quality_cols]

print("Summary of sequencing quality metrics:")
display(quality_info.describe())

# Mean of each QC flag within every population (the last two entries of quality_cols)
print("\nQC pass rates by population:")
qc_by_pop = sample_df.groupby('Population')[quality_cols[-2:]].mean()
display(qc_by_pop)

In [None]:
# Explore special genetic markers (Y chromosome and mitochondrial DNA)
marker_cols = [
    'Sample', 'Population', 'Gender',
    'Has Phase1 chrY SNPS',
    'Has phase1 chrY Deletions',
    'Has phase1 chrMT SNPs',
]
marker_info = sample_df.loc[:, marker_cols]

# Everything after the three identifying columns is a marker flag
_marker_flags = marker_cols[3:]

print("Summary of genetic marker availability:")
# Column-wise sum of each marker flag
marker_counts = marker_info[_marker_flags].sum()
display(marker_counts)

# Mean of each marker flag within every gender group
print("\nMarker availability by gender:")
marker_by_gender = marker_info.groupby('Gender')[_marker_flags].mean()
display(marker_by_gender)

In [None]:
# Create a summary of key metrics by population
summary_cols = ['Population', 'Gender', 'LC Passed QC', 'E Passed QC',
                'Has Phase1 chrY SNPS', 'Has phase1 chrMT SNPs']

# Named aggregation: each output column is declared as (source column, statistic),
# so the readable column names are assigned in the same step as the aggregation.
population_summary = (
    sample_df[summary_cols]
    .groupby('Population')
    .agg(**{
        'Sample Count': ('Gender', 'count'),                    # non-null Gender entries
        'LC QC Pass Rate': ('LC Passed QC', 'mean'),            # mean of the LC QC flag
        'E QC Pass Rate': ('E Passed QC', 'mean'),              # mean of the exome QC flag
        'Samples with Y SNPs': ('Has Phase1 chrY SNPS', 'sum'),
        'Samples with MT SNPs': ('Has phase1 chrMT SNPs', 'sum'),
    })
    .round(3)
)

print("Summary statistics by population:")
display(population_summary)

In [None]:
# Filtering for Specific Populations
# Example: Looking at British (GBR) samples
target_population = 'GBR'

print("Exploring British (GBR) samples")
print("-" * 50)

# Keep only the rows whose population code equals the target
gbr_samples = sample_df.loc[sample_df['Population'].eq(target_population)]

print(f"Number of GBR samples: {gbr_samples.shape[0]}")
print("\nBasic information about GBR samples:")
display(gbr_samples.loc[:, ['Sample', 'Gender', 'Family ID']].head())

# A Family ID that occurs more than once among the GBR rows marks a family group
_shares_family_id = gbr_samples['Family ID'].duplicated(keep=False)
gbr_families = gbr_samples.loc[_shares_family_id]
if not gbr_families.empty:
    print("\nFound family groups in GBR samples:")
    display(gbr_families.loc[:, ['Sample', 'Family ID', 'Relationship']])

In [None]:
# Advanced Filtering with Multiple Criteria
# Counts, per target population, how many samples exist and how many pass every quality filter.
print("Advanced sample filtering example")
print("-" * 50)

# Example: Find high-quality samples from specific populations
target_populations = ['CEU', 'YRI', 'CHB']
# NOTE(review): 0.9 assumes the coverage column stores a fraction (0-1);
# if it stores a 0-100 percentage this should be 90 — confirm against the data.
quality_threshold = 0.9  # 90% coverage

# All samples in the target populations. A dedicated name is used so the integer
# `total_samples` computed by the earlier exploration cell is not overwritten.
target_pop_samples = sample_df[sample_df['Population'].isin(target_populations)]
print(f"Total samples in target populations: {len(target_pop_samples)}")

# Filter for high-quality samples (both QC flags set and enough targets covered)
high_quality_samples = target_pop_samples[
    (target_pop_samples['LC Passed QC'] == 1) &
    (target_pop_samples['E Passed QC'] == 1) &
    (target_pop_samples['% Targets Covered to 20x or greater'] >= quality_threshold)
]

# Total number of samples in each population
population_totals = target_pop_samples.groupby('Population').size().reset_index(name='Total Samples')

# Number of high-quality samples in each population
high_quality_counts = high_quality_samples.groupby('Population').size().reset_index(name='Passed Filters')

# Left merge keeps populations where nothing passed; those get NaN, which is
# replaced by 0 and cast back to int so the counts are not shown as floats.
summary_df = pd.merge(population_totals, high_quality_counts, on='Population', how='left')
summary_df['Passed Filters'] = summary_df['Passed Filters'].fillna(0).astype(int)

# Display the summary
print("\nSummary of high-quality samples:")
display(summary_df)

### Filtering Data
You can filter rows based on certain conditions. For example, let's filter the sample DataFrame to only include individuals from a specific population.

In [None]:
# Keep only the rows of family_info whose population code is 'YRI'
yri_population = family_info.loc[lambda frame: frame['Population'] == 'YRI']
yri_population.head()

In [None]:
# Print the column names, non-null counts and dtypes of the YRI subset
yri_population.info()

In [None]:
# Summary statistics for the relationship columns of the YRI subset
yri_population.loc[
    :,
    [
        "Siblings",
        "Grandparents",
        "Avuncular",
        "Half Siblings",
        "Unknown Second Order",
        "Third Order",
    ],
].describe()

In [None]:
# Search for individuals with specific attributes
specific_entries = basic_info[(basic_info['Population'] == 'YRI') & (basic_info['Gender'] == 'female')]
specific_entries.info()

In [None]:
# Preview the first rows of the filtered entries
specific_entries.head()

In [None]:
# Summary statistics for the filtered entries
specific_entries.describe()

### Multiple Conditions
You can include multiple conditions in your query. For example, let's find all females in either the 'YRI' or 'CEU' populations.

In [None]:
# Search for females in either 'YRI' or 'CEU' populations
multiple_conditions = basic_info[(basic_info['Population'].isin(['YRI', 'CEU'])) & (basic_info['Gender'] == 'female')]
multiple_conditions.describe()

In [None]:
target_population = 'YRI'
target_sex = 'female'

# Filter driven by the two variables above rather than by literals
variable_filter = basic_info.loc[
    lambda frame: frame['Population'].eq(target_population) & frame['Gender'].eq(target_sex)
]
variable_filter.describe()

### Searching for Entries in a List
If you have a list of values to search for, you can use the `isin()` method to build a boolean mask and select the matching rows.

In [None]:
# List of target populations
target_populations = ['YRI', 'CEU']

# Search for individuals in target populations
list_filter = basic_info[basic_info['Population'].isin(target_populations)]
list_filter.describe()

### Using String Methods
You can also use string methods to search for specific patterns in string columns.

In [None]:
# Search for individuals whose sample IDs start with 'NA'
# na=False treats a missing sample ID as "no match", so the mask stays strictly
# boolean and the row selection cannot fail on NaN values.
string_filter = basic_info[basic_info['Sample'].str.startswith('NA', na=False)]
print(f'There are {len(string_filter)} sample IDs that start with "NA"')

print("Here are a few rows...")
string_filter.head()

## Downloading 1000 Genomes Data

Up to this point, we have explored the sample and population metadata files to understand the structure of our data. However, we have not yet downloaded the actual genetic data from the 1000 Genomes Project.

(Optional: Check out the page where we will download the files: https://www.internationalgenome.org/data-portal/data-collection/30x-grch38. Look in the description text for "Phased VCFs". This will take you to the ftp page where we get our links for our code. You can also manually download the files from here.)

In the next step, we will download the VCF (Variant Call Format) files for chromosome 21.

Note: The script will also check if the directory for storing the 1000 Genomes reference panel exists. If not, it will create one for you.

Let's proceed to download the data for chromosome 21.

#### Download 1000 Genomes Chromosome 21, if you haven't done so already.
File size: about 407 MB; the download typically takes less than 10 minutes.

In [None]:
%%bash -s "$references_directory"

# Download the chromosome 21 phased panel VCF and build its tabix index.
# Stop at the first failing command so a failed download is never indexed.
set -euo pipefail

# Receive directory variables from Python
references_directory=${1:?references directory argument is required}

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Check if the onekg_reference_panel directory exists; if not, create it
if [ ! -d "${onekg_reference_panel_dir}" ]; then
    echo "Creating onekg_reference_panel directory..."
    mkdir -p "${onekg_reference_panel_dir}"
fi

echo

# Name the file once so the URL and the local path cannot drift apart
vcf_name="1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
vcf_url="https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/${vcf_name}"
vcf_path="${onekg_reference_panel_dir}/${vcf_name}"

# Download the VCF file for chromosome 21 from the 1000 Genomes FTP site
echo "Downloading chromosome 21..."

# Paths are quoted so a directory containing spaces does not split the arguments
wget --continue --retry-connrefused --timeout=60 --waitretry=60 --tries=20 --progress=bar:force \
    "${vcf_url}" \
    -O "${vcf_path}"

# -f overwrites an index left by a previous run; without it re-running this cell fails
bcftools index -f -t "${vcf_path}"

### Let's explore the data!

In [None]:
%%bash -s "$references_directory"

# Print the VCF header of the chromosome 21 panel.
# Receive directory variables from Python
references_directory=$1

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Quoted so a path containing spaces is passed to bcftools as a single argument
vcf_path="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

echo "View the Header Information"
bcftools view -h "${vcf_path}"


In [None]:
%%bash -s "$references_directory"

# Report the sample count, variant count and bcftools stats for the chromosome 21 panel.
# Receive directory variables from Python
references_directory=$1

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Single quoted definition of the VCF path, reused by every command below
vcf_path="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

echo "Get the number of samples"
# query -l lists one sample name per line
bcftools query -l "${vcf_path}" | wc -l

echo "Count the Number of Variants"
# view -H prints records without the header, so each line is one variant
bcftools view -H "${vcf_path}" | wc -l

echo "Get the stats"
bcftools stats "${vcf_path}"

In [None]:
%%bash -s "$references_directory"

# Show a few example records: raw variants, SNPs only, allele frequencies, and one region.
# Receive directory variables from Python
references_directory=$1

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Single quoted definition of the VCF path, reused by every command below
vcf_path="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

echo "View the First Few Variants"
bcftools view -H "${vcf_path}" | head -n 10

echo "Get the SNPs (10 of them)"
# -H drops the header; without it `head` only ever shows header lines, never SNP records
bcftools view -H -v snps "${vcf_path}" | head -n 10

echo "Get Allele Frequencies"
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%AF\n' "${vcf_path}" | head -n 10

echo "Get the variants for a specific region"
# Read the contig name from the index rather than hard-coding it: a region whose
# contig name does not match the file (e.g. "21" vs "chr21") selects no records.
# NOTE(review): falls back to "chr21" if the index reports nothing — confirm with
# `bcftools view -h | grep contig`.
contig=$(bcftools index -s "${vcf_path}" | cut -f 1 | head -n 1)
bcftools view -H -r "${contig:-chr21}:20000000-21000000" "${vcf_path}" | head -n 10

## Subsetting our data

### Why Is It Necessary?

As we dive deeper into genetic data analysis, it's important to manage computational resources effectively. Whole-genome VCF files from projects like the 1000 Genomes can be extremely large, often containing data for thousands of individuals across millions of genetic variants. Loading such extensive data into memory can be computationally intensive and may even cause the kernel to crash, as we've experienced.

### Advantages of Subsetting

1. **Reduced Computational Load**: By focusing on a subset of 500 individuals, we significantly reduce the computational resources needed for the analysis.

2. **Faster Execution**: Smaller datasets mean that code will execute more quickly, allowing us to focus on the analysis rather than waiting for code to run.

3. **Feasibility**: Not all personal computers will have the resources to handle large genomic datasets. Subsetting makes the tutorial more accessible.

4. **Focused Analysis**: With fewer individuals, it's easier to explore the data in depth, which is particularly useful for educational purposes.

## Subsetting based on population

In [None]:
%%bash -s "$references_directory" "$results_directory"

# Subset the chromosome VCF to the samples that belong to the listed populations.
# Receive directory variables from Python
references_directory=$1
results_directory=$2   # received from the cell magic; not used in this cell

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Define the path to the metadata file
onekg_metadata_file="${references_directory}/20140502_complete_sample_summary.txt"

# Define the populations to subset
populations=("ASW" "ACB")

chromosome=21

# Define the input VCF file path
input_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

# NOTE(review): "${populations[*]}" joins the codes with a space, so both file
# names below contain a space ("...ASW ACB..."). Kept unchanged so existing
# outputs and anything that reads them keep working; consider joining with "_".
# Define the output VCF file path
output_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.${populations[*]}.vcf.gz"

# File that stores the selected sample names
sample_file="${onekg_reference_panel_dir}/samples_${populations[*]}.txt"

# Extract sample names for the specified populations from the metadata file.
# The match is anchored to the Population column (field 3 of the tab-separated
# file) so a code that merely appears elsewhere on a line is not picked up.
population_alternatives=$(IFS='|'; echo "${populations[*]}")
awk -F '\t' -v pattern="^(${population_alternatives})\$" '$3 ~ pattern { print $1 }' \
    "${onekg_metadata_file}" > "${sample_file}"

# An empty sample list would make the subset meaningless; stop with a message instead
if [ ! -s "${sample_file}" ]; then
    echo "No samples found for populations: ${populations[*]}" >&2
    exit 1
fi

# Subset the VCF file for the specified populations.
# --force-samples skips listed samples that are absent from the VCF; stderr is
# still discarded to hide those warnings, but a failing exit status is reported.
if ! bcftools view -S "${sample_file}" --force-samples -Oz -o "${output_vcf}" "${input_vcf}" 2>/dev/null; then
    echo "bcftools view failed while subsetting ${input_vcf}" >&2
    exit 1
fi

# Index the subsetted VCF file (-f overwrites an index left by a previous run)
tabix -f -p vcf "${output_vcf}"

echo "Get the number of samples"
bcftools query -l "${output_vcf}" | wc -l

echo "Count the Number of Variants"
bcftools view -H "${output_vcf}" | wc -l

echo "Get the stats"
bcftools stats "${output_vcf}"


## Subsetting to 500 Individuals

### Random Selection

In the next cell, we randomly select 500 sample IDs from the full chromosome 21 panel and then use `bcftools` to subset our VCF file so that it includes only those individuals. This will make subsequent analyses more manageable and less resource-intensive.

In [None]:
%%bash -s "$references_directory"
# Pick a reproducible random set of samples from the full chromosome VCF and subset to them.
# Receive directory variables from Python
references_directory=$1

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

chromosome=21

# Number of individuals to keep, and the seed that makes the draw repeatable
sample_count=500
random_seed=42

# Define the input VCF file path (the full panel, not a population subset)
input_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

# Files produced by this cell
all_sample_ids="${onekg_reference_panel_dir}/sample_ids_chr${chromosome}.txt"
random_sample_ids="${onekg_reference_panel_dir}/random_500_sample_ids_chr${chromosome}.txt"
subset_vcf="${onekg_reference_panel_dir}/subset_500_chr${chromosome}_random500.vcf.gz"

# Extract sample names from the input VCF file
bcftools query -l "${input_vcf}" > "${all_sample_ids}"

# Randomly select the sample names with a fixed seed: tag every ID with a seeded
# pseudo-random key, sort by that key, and keep the first ${sample_count} IDs.
# Re-running the cell on the same machine therefore selects the same individuals.
awk -v seed="${random_seed}" 'BEGIN { srand(seed) } { printf "%.17f\t%s\n", rand(), $0 }' "${all_sample_ids}" \
    | sort -k1,1n \
    | cut -f 2- \
    | head -n "${sample_count}" > "${random_sample_ids}"

# Subset the VCF file based on the selected sample IDs.
# stderr is still discarded, but a failing exit status is reported.
if ! bcftools view -S "${random_sample_ids}" -Oz -o "${subset_vcf}" "${input_vcf}" 2>/dev/null; then
    echo "bcftools view failed while subsetting ${input_vcf}" >&2
    exit 1
fi

# Index the subsetted VCF file (-f overwrites an index left by a previous run)
tabix -f -p vcf "${subset_vcf}"

echo "Get the number of samples"
bcftools query -l "${subset_vcf}" | wc -l

echo "Count the Number of Variants"
bcftools view -H "${subset_vcf}" | wc -l

echo "Get the stats"
bcftools stats "${subset_vcf}"
