# Environment Setup

In [None]:
import os
from decouple import config

In [None]:
# Read the project locations from the environment / .env file through
# python-decouple's config(). Each value is None when its variable is unset.
working_directory = config('PROJECT_WORKING_DIR', default=None)
data_directory = config('PROJECT_DATA_DIR', default=None)
references_directory = config('PROJECT_REFERENCES_DIR', default=None)
results_directory = config('PROJECT_RESULTS_DIR', default=None)
utils_directory = config('PROJECT_UTILS_DIR', default=None)

# The following cells refer to the project root as `primary_directory`, which
# was never assigned. Alias it to the configured working directory, falling
# back to the current directory so that `.startswith(...)` has a string to
# work on when PROJECT_WORKING_DIR is not set.
primary_directory = working_directory or os.getcwd()

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

In [None]:
# Mount Google Drive only when the project root lives under /content.
# NOTE(review): /content is presumably the Colab workspace - confirm.
if primary_directory.startswith('/content'):
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Define directories
# NOTE: results_directory, references_directory and data_directory are
# re-derived from primary_directory here, replacing the values read from the
# configuration earlier in the notebook.
if primary_directory.startswith('/content'):
    use_directory = "/content/use"
else:
    use_directory = os.path.join(primary_directory, "use")

results_directory = os.path.join(primary_directory, "results")
references_directory = os.path.join(primary_directory, "references")
data_directory = os.path.join(primary_directory, "data")

# Directories to check
directories = [use_directory, results_directory, references_directory, data_directory]

# Report each directory, creating the ones that are missing.
for directory in directories:
    # isdir (not exists) so that a regular file with the same name is not
    # reported as an existing directory.
    if os.path.isdir(directory):
        print(f"Directory exists: {directory}")
    else:
        # exist_ok=True tolerates the directory appearing between the check
        # above and this call.
        os.makedirs(directory, exist_ok=True)
        print(f"Directory created: {directory}")

In [None]:
# Function to check if a package is installed
def check_install_package(package_name, pip_name=None):
    """Install a package with pip when it cannot be found for import.

    Args:
        package_name: Importable module name to look up.
        pip_name: Name passed to ``pip install`` when it differs from the
            module name. Falls back to ``package_name`` when empty or None.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Imported here so the function works even if no earlier cell imported
    # these modules (the original relied on an `importlib` that was never
    # imported).
    import importlib.util
    import subprocess
    import sys

    try:
        package_spec = importlib.util.find_spec(package_name)
    except ModuleNotFoundError:
        # find_spec raises for dotted names whose parent package is missing.
        package_spec = None

    if package_spec is None:
        print(f"{package_name} not found. Installing...")
        # Run pip through the interpreter executing this code so the package
        # is installed into the same environment.
        completed = subprocess.run(
            [sys.executable, "-m", "pip", "install", pip_name or package_name],
            check=False,
        )
        if completed.returncode != 0:
            print(f"Installation of {pip_name or package_name} failed (pip exit code {completed.returncode}).")
        # Let the import system notice freshly installed files.
        importlib.invalidate_caches()
    else:
        print(f"{package_name} is already installed.")

# Ensure pandas is available; the module name and the pip name are both 'pandas'.
check_install_package('pandas', 'pandas')

## Install bcftools and htslib

In [None]:
%%bash -s "$primary_directory" "$use_directory"

# Positional arguments carry the Python variables named on the %%bash line.
primary_directory=$1
use_directory=$2

# Install dependencies
sudo apt-get update
sudo apt-get install -y tabix
sudo apt-get install -y bcftools

# Fetch the htslib and bcftools sources into the tools directory.
# Quote the paths so directories containing spaces work, and stop if the
# directory cannot be entered instead of cloning into the wrong place.
cd "$use_directory" || exit 1

# git clone refuses to write into an existing non-empty directory, so skip
# the clone when a previous run already fetched the sources.
[ -d htslib ] || git clone --recurse-submodules https://github.com/samtools/htslib.git
[ -d bcftools ] || git clone https://github.com/samtools/bcftools.git

# Build bcftools from source (this also provides the plugins directory).
cd "${use_directory}/bcftools" || exit 1
make

# NOTE: this export only affects the shell started for this %%bash cell;
# later cells that need the plugins must set BCFTOOLS_PLUGINS themselves.
export BCFTOOLS_PLUGINS="${use_directory}/bcftools/plugins"

cd "$primary_directory" || exit 1

## Introduction to the IGSR 30x GRCh38 Data Collection
The International Genome Sample Resource (IGSR) provides a data collection for the 30x GRCh38 human genome assembly. This resource is invaluable for researchers and scientists who are working on genomics, as it offers high-quality, publicly available data sets. The 30x coverage ensures a high level of accuracy and reliability for genomic studies.

## Upload the metadata

Upload your onethousandgenomes_metadata.zip file and place it in your references directory.

In [None]:
import os
import zipfile

# Specify the path to the zip file
zip_file_path = os.path.join(references_directory, "onethousandgenomes_metadata.zip")

if os.path.exists(zip_file_path):
    # Extract the contents of the zip file
    with zipfile.ZipFile(zip_file_path, 'r') as zipf:
        zipf.extractall(references_directory)

    # Delete the zip file
    os.remove(zip_file_path)

    print("Zip file extracted and deleted successfully.")
else:
    # The archive is deleted after a successful extraction, so re-running this
    # cell would otherwise fail with FileNotFoundError.
    print(f"Zip file not found: {zip_file_path}. "
          "Skipping extraction (it may already have been extracted and deleted).")

**Although you already have the metadata, the instructions for finding the metadata at the source are provided below for your reference. You do not have to use those instructions. If you ran the code above, you can skip to File Verification.**

## How to Download Populations File and Sample File

Normally, you would find options to download various data files, including the populations file and the sample file, on the IGSR data portal. Here's a general outline of how you could download these files and save them to your references_directory:

Navigate to the Data Download Section: Go to the IGSR data portal at https://www.internationalgenome.org/data-portal/data-collection/30x-grch38. Locate the buttons that say "Download the list" in the sample section and the population section.

Download Files: Click on the download links for the populations file and the sample file. Note the directory where these files are saved and the filenames.

Move Files to references_directory: Move these downloaded files to your reference directory.

## File Verification

Before proceeding with the data subsetting, let's ensure that the sample and population files you intend to use are available in the `references_directory`.

In [None]:
import os

# Paths of the two metadata files expected inside references_directory.
sample_file_name = os.path.join(references_directory, "samples_igsr_1000genomes_grch38.tsv")
population_file_name = os.path.join(references_directory, "populations_igsr_1000genomes_grch38.tsv")

# Report on each file in turn: a labelled confirmation when it is present,
# otherwise a generic "does not exist" message.
for file_label, file_path in (("Sample", sample_file_name), ("Population", population_file_name)):
    if os.path.exists(file_path):
        print(f"{file_label} file exists: {file_path}")
    else:
        print(f"File does not exist: {file_path}")

## Exploring the Sample and Population Files

Before diving into the analysis, it's a good idea to explore the sample and population files to get a sense of what the data looks like. We'll use Pandas to open these files and display the first few rows.


In [None]:
# Pandas provides the DataFrame type used for both metadata tables.
import pandas as pd

# Both files are tab-separated, so they share the same reader options.
tsv_options = {"sep": "\t"}

# Read the sample table first, then the population table. A missing file is
# reported rather than raised, so a DataFrame whose file was not found is
# left undefined.
try:
    sample_df = pd.read_csv(sample_file_name, **tsv_options)
    population_df = pd.read_csv(population_file_name, **tsv_options)
except FileNotFoundError as missing_file:
    print(f"File not found: {missing_file}")

## Data Files Overview

Before diving into the analysis, it's crucial to understand the data files we'll be working with.

---

### Populations File

#### Description
The populations file contains information about the various populations that are part of the genomic study. This file is essential for understanding the diversity of the samples and for performing population-specific analyses.

#### Typical Columns
- **Population ID**: Unique identifier for each population.
- **Population Name**: Name of the population.
- **Region**: Geographical region where the population is located.
- **Number of Samples**: Number of samples collected from this population.
- **Other Metadata**: Additional information such as ethnicity, age range, etc.

#### Use Cases
- Filtering genomic data based on specific populations.
- Performing population-specific genetic variation analyses.
- Understanding the distribution of samples across different populations.

In [None]:
# Show the first rows of the population table; display() is the rich-output
# helper available inside IPython/Jupyter sessions.
print("First few rows of the population file:")
display(population_df.head())

### Sample File

#### Description
The sample file contains detailed information about each individual sample that is part of the study. This file is essential for tracking the source of each genomic sequence and for associating it with specific traits or conditions.

#### Typical Columns
- **Sample ID**: Unique identifier for each sample.
- **Population ID**: The population to which the sample belongs.
- **Gender**: Gender of the individual from whom the sample was taken.
- **Age**: Age of the individual.
- **Health Status**: Information about the health condition of the individual, if applicable.
- **Other Metadata**: Additional information such as the date of sample collection, sequencing technology used, etc.

#### Use Cases
- Filtering genomic data based on specific samples or traits.
- Performing individual-level analyses.
- Associating genomic variations with specific traits or conditions.

In [None]:
# Show the first rows of the sample table; display() is the rich-output
# helper available inside IPython/Jupyter sessions.
print("First few rows of the sample file:")
display(sample_df.head())

You can preview the DataFrame by viewing its first rows with `.head()` or its last rows with `.tail()`; both show 5 rows by default.

# Exploratory Data Analysis with Pandas

## Introduction
Before diving into more complex analyses, it's essential to understand the structure and characteristics of your data. Pandas is a powerful Python library that provides fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. Let's explore some basic Pandas functionalities to better understand our sample and population files.

Remember that we loaded the sample and population files earlier and created Pandas Dataframes called sample_df and population_df.

### Basic Information
You can get a quick overview of the DataFrame using .info().

In [None]:
# Get basic information about the sample DataFrame
# (info() prints its summary directly instead of returning a table).
sample_df.info()

You can preview the DataFrame by viewing its first rows with `.head()` or its last rows with `.tail()`; both show 5 rows by default, and you can pass a different number of rows as an argument.

In [None]:
# First rows of the sample table (default row count).
sample_df.head()

In [None]:
# First 10 rows of the sample table.
sample_df.head(10)

In [None]:
# Last rows of the sample table (default row count).
sample_df.tail()

In [None]:
# Last 15 rows of the sample table.
sample_df.tail(15)

### Summary Statistics
The .describe() method provides summary statistics of the DataFrame, useful for getting a sense of the distribution of each attribute.

In [None]:
# Get summary statistics for the sample DataFrame
# (the resulting table is shown as the cell's output).
sample_df.describe()

### Count Values
To count the number of occurrences of each unique value in a column, you can use .value_counts().

In [None]:
# Count the number of individuals per population:
# one count for each distinct value in the 'Population name' column.
sample_df['Population name'].value_counts()

### Filtering Data
You can filter rows based on certain conditions. For example, let's filter the sample DataFrame to only include individuals from a specific population.

In [None]:
# Keep only the rows whose population code is 'YRI', then preview them.
is_yri = sample_df['Population code'] == 'YRI'
yri_population = sample_df[is_yri]
yri_population.head()

In [None]:
# Check what kind of object the filtering step produced.
type(yri_population)

In [None]:
# Summary of the YRI-only subset.
yri_population.info()

In [None]:
# Summary statistics for the YRI-only subset.
yri_population.describe()

In [None]:
# Combine two conditions: population code 'YRI' and sex 'female'.
is_yri = sample_df['Population code'] == 'YRI'
is_female = sample_df['Sex'] == 'female'
specific_entries = sample_df[is_yri & is_female]
specific_entries.info()

In [None]:
# Preview the first rows of the YRI/female subset.
specific_entries.head()

In [None]:
# Summary statistics for the YRI/female subset.
specific_entries.describe()

### Multiple Conditions
You can include multiple conditions in your query. For example, let's find all females in either the 'YRI' or 'CEU' populations.

In [None]:
# Females whose population code is either 'YRI' or 'CEU'.
in_yri_or_ceu = sample_df['Population code'].isin(['YRI', 'CEU'])
is_female = sample_df['Sex'] == 'female'
multiple_conditions = sample_df[in_yri_or_ceu & is_female]
multiple_conditions.describe()

In [None]:
# Same kind of filter, with the wanted values held in variables.
target_population = 'YRI'
target_sex = 'female'

matches_population = sample_df['Population code'] == target_population
matches_sex = sample_df['Sex'] == target_sex
variable_filter = sample_df[matches_population & matches_sex]
variable_filter.describe()

### Searching for Entries in a List
If you have a list of values to search for, you can use the `isin()` method to build a boolean mask and index the DataFrame with it.

In [None]:
# Population codes we want to keep.
target_populations = ['YRI', 'CEU']

# Rows whose population code appears in the list above.
in_target_populations = sample_df['Population code'].isin(target_populations)
list_filter = sample_df[in_target_populations]
list_filter.describe()

### Using String Methods
You can also use string methods to search for specific patterns in string columns.

In [None]:
# Rows whose sample name begins with 'NA'; print how many there are,
# then preview them.
has_na_prefix = sample_df['Sample name'].str.startswith('NA')
string_filter = sample_df[has_na_prefix]
print(len(string_filter))
string_filter.head()

## Exploratory Data Analysis

Before we move on, let's specifically look at:

1. The total number of samples in the dataset.
2. The distribution of samples by sex.
3. The distribution of samples by population.
4. The distribution of samples by superpopulation.

In [None]:
# Check if the sample DataFrame is loaded
if 'sample_df' in locals():

    # 1. Total number of samples
    total_samples = len(sample_df)
    print(f"Total number of samples: {total_samples}")

    # 2. Distribution by Sex
    print("\nDistribution of samples by Sex:")
    display(sample_df['Sex'].value_counts())

    # 3. Distribution by Population (announced in the list above this cell
    #    but previously missing from the code)
    print("\nDistribution of samples by Population:")
    display(sample_df['Population name'].value_counts())

    # 4. Distribution by Superpopulation
    print("\nDistribution of samples by Superpopulation:")
    display(sample_df['Superpopulation name'].value_counts())

else:
    print("The sample DataFrame is not loaded. Please make sure to load the sample file.")

## Downloading 1000 Genomes Data

Up to this point, we have explored the sample and population metadata files to understand the structure of our data. However, we have not yet downloaded the actual genetic data from the 1000 Genomes Project.

(Optional: Check out the page where we will download the files: https://www.internationalgenome.org/data-portal/data-collection/30x-grch38. Look in the description text for "Phased VCFs". This will take you to the ftp page where we get our links for our code. You can also manually download the files from here.)

In the next step, we will download the VCF (Variant Call Format) files for chromosome 21.

Note: The script will also check if the directory for storing the 1000 Genomes reference panel exists. If not, it will create one for you.

Let's proceed to download the data for chromosome 21.

#### Download 1000 Genomes Chromosome 21, if you haven't done so already.
file size: 407 M

In [None]:
%%bash -s "$primary_directory" "$references_directory" "$results_directory"

# Receive directory variables from Python
primary_dir=$1
ref_dir=$2
results_dir=$3

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${ref_dir}/onekg_reference_panel"

# Check if the onekg_reference_panel directory exists; if not, create it
if [ ! -d "${onekg_reference_panel_dir}" ]; then
    echo "Creating onekg_reference_panel directory..."
    mkdir -p "${onekg_reference_panel_dir}"
fi

echo

# Location and name of the phased chromosome 21 VCF on the 1000 Genomes FTP site
base_url="https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV"
vcf_name="1kGP_high_coverage_Illumina.chr21.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

# Download the VCF file for chromosome 21 from the 1000 Genomes FTP site.
# -c resumes a partial download and leaves a complete file untouched, so
# re-running the cell does not create a duplicate "<name>.1" copy.
echo "Downloading chromosome 21..."
wget -c "${base_url}/${vcf_name}" -P "${onekg_reference_panel_dir}"

#### **Optional**: Run the following cell to download 1000 Genomes for all chromosomes.

In [None]:
%%bash -s "$primary_directory" "$references_directory" "$results_directory"

# Receive directory variables from Python
primary_dir=$1
ref_dir=$2
results_dir=$3

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${ref_dir}/onekg_reference_panel"

# Check if the onekg_reference_panel directory exists; if not, create it
if [ ! -d "${onekg_reference_panel_dir}" ]; then
    echo "Creating onekg_reference_panel directory..."
    mkdir -p "${onekg_reference_panel_dir}"
fi

echo

# Directory on the 1000 Genomes FTP site that holds the phased VCFs
base_url="https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV"

# Loop through each chromosome to download data
for chromosome in {1..22}
do
    vcf_name="1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

    # Download the VCF file for each chromosome from the 1000 Genomes FTP site.
    # -c resumes a partial download and leaves a complete file untouched.
    echo "Downloading chromosome ${chromosome}..."
    if ! wget -c "${base_url}/${vcf_name}" -P "${onekg_reference_panel_dir}"; then
        # Do not try to index a file that failed to download.
        echo "Download failed for chromosome ${chromosome}; skipping indexing." >&2
        continue
    fi

    # Index the downloaded VCF file.
    # -f overwrites an index left by a previous run; without it tabix stops
    # when the index file already exists.
    echo "Indexing chromosome ${chromosome}..."
    tabix -f -p vcf "${onekg_reference_panel_dir}/${vcf_name}"

done

## Subsetting our data

### Why Is It Necessary?

As we dive deeper into genetic data analysis, it's important to manage computational resources effectively. Whole-genome VCF files from projects like the 1000 Genomes can be extremely large, often containing data for thousands of individuals across millions of genetic variants. Loading such extensive data into memory can be computationally intensive and may even cause the kernel to crash, as we've experienced.

### Advantages of Subsetting

1. **Reduced Computational Load**: By focusing on a subset of 500 individuals, we significantly reduce the computational resources needed for the analysis.

2. **Faster Execution**: Smaller datasets mean that code will execute more quickly, allowing us to focus on the analysis rather than waiting for code to run.

3. **Feasibility**: Not all personal computers will have the resources to handle large genomic datasets. Subsetting makes the tutorial more accessible.

4. **Focused Analysis**: With fewer individuals, it's easier to explore the data in depth, which is particularly useful for educational purposes.

In [None]:
# Show the population table as this cell's output.
population_df

## Subsetting based on population

In [None]:
%%bash -s "$primary_directory" "$references_directory" "$results_directory"

# Receive directory variables from Python
primary_directory=$1
references_directory=$2
results_directory=$3

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Define the path to the metadata file
onekg_metadata_file="${references_directory}/samples_igsr_1000genomes_grch38.tsv"

# Define the populations to subset
populations=("ASW" "ACB")

# Population codes joined with a space ("ASW ACB"); this exact text is part
# of the output file names, so every path that uses it must stay quoted.
population_tag="${populations[*]}"

# The sample list depends only on the populations, not on the chromosome,
# so it is built once, before the loop.
sample_file="${onekg_reference_panel_dir}/samples_${population_tag}.txt"

# Extract sample names for the specified populations from the metadata file:
# keep every line matching one of the codes, then take the first column.
grep -E "$(IFS='|'; echo "${populations[*]}")" "${onekg_metadata_file}" | cut -f 1 > "${sample_file}"

# Stop early when nothing matched (wrong path or population code) instead of
# handing bcftools an empty sample list.
if [ ! -s "${sample_file}" ]; then
    echo "No samples found for populations: ${population_tag}" >&2
    exit 1
fi

# Loop for specified chromosome(s)
for chromosome in 21; do
    # Define the input VCF file path
    input_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"

    # Define the output VCF file path
    output_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.${population_tag}.vcf.gz"

    # Subset the VCF file for the specified populations
    bcftools view -S "${sample_file}" -Oz -o "${output_vcf}" "${input_vcf}"

    # Index the subsetted VCF file.
    # -f replaces an index left by a previous run; without it tabix stops
    # when the index file already exists.
    tabix -f -p vcf "${output_vcf}"

done

### **Optional:** Subset on selected populations for autosomal chromosomes.

In [None]:
%%bash -s "$primary_directory" "$references_directory" "$results_directory"

# Receive directory variables from Python
primary_directory=$1
references_directory=$2
results_directory=$3

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Define the path to the metadata file
onekg_metadata_file="${references_directory}/samples_igsr_1000genomes_grch38.tsv"

# Define the populations to subset
populations=("ASW" "ACB")

# Population codes joined with a space ("ASW ACB"); this exact text is part
# of the output file names, so every path that uses it must stay quoted.
population_tag="${populations[*]}"

# Build ONE temporary sample list up front; it is the same for every
# chromosome, so there is no need to recreate it inside the loop.
sample_file="${onekg_reference_panel_dir}/samples_${population_tag}.txt"
echo "${sample_file}"

# Extract sample names for the specified populations from the metadata file:
# keep every line matching one of the codes, then take the first column.
grep -E "$(IFS='|'; echo "${populations[*]}")" "${onekg_metadata_file}" | cut -f 1 > "${sample_file}"

# Stop early when nothing matched instead of handing bcftools an empty list.
if [ ! -s "${sample_file}" ]; then
    echo "No samples found for populations: ${population_tag}" >&2
    rm -f "${sample_file}"
    exit 1
fi

# Loop through each chromosome
for chromosome in {1..22}; do
    # Define the input VCF file path
    input_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz"
    echo "${input_vcf}"

    # Chromosomes that were not downloaded are skipped rather than failing.
    if [ ! -f "${input_vcf}" ]; then
        echo "Input VCF not found for chromosome ${chromosome}; skipping." >&2
        continue
    fi

    # Define the output VCF file path
    output_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.${population_tag}.vcf.gz"
    echo "${output_vcf}"

    # Subset the VCF file for the specified populations
    bcftools view -S "${sample_file}" -Oz -o "${output_vcf}" "${input_vcf}"

    # Index the subsetted VCF file.
    # -f replaces an index left by a previous run; without it tabix stops
    # when the index file already exists.
    tabix -f -p vcf "${output_vcf}"
done

# Remove the temporary sample file once every chromosome has been processed
rm "${sample_file}"

# Note for instructor: fix so that it creates one single sample file, then subsets autosomes on that single sample file

## Subsetting to 500 Individuals

### Random Selection

In the next cell, we will randomly select up to 500 individuals from the population subset and use `bcftools` to create a VCF file that contains only those individuals. This will make subsequent analyses more manageable and less resource-intensive.

In [None]:
# Instructor: add a cell to count the samples within the populations

In [None]:
%%bash -s "$references_directory"
# Receive directory variables from Python
references_directory=$1

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Define the populations
populations=("ASW" "ACB")

# Population codes joined with a space ("ASW ACB"); this exact text is part
# of the file names below, so every path that uses it must stay quoted.
population_tag="${populations[*]}"

# Loop for specified chromosome(s)
for chromosome in 21; do
    # Define the input VCF file path (the population subset created earlier)
    input_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.${population_tag}.vcf.gz"

    # Files written for this chromosome
    sample_ids="${onekg_reference_panel_dir}/sample_ids_chr${chromosome}_${population_tag}.txt"
    random_ids="${onekg_reference_panel_dir}/random_500_sample_ids_chr${chromosome}_${population_tag}.txt"
    subset_vcf="${onekg_reference_panel_dir}/subset_500_chr${chromosome}_${population_tag}.vcf.gz"

    # Extract sample names from the subsetted VCF file
    bcftools query -l "${input_vcf}" > "${sample_ids}"

    # Randomly select 500 sample names (all of them when fewer are available)
    shuf -n 500 "${sample_ids}" > "${random_ids}"

    # Subset the VCF file based on the random 500 sample IDs
    bcftools view -S "${random_ids}" -Oz -o "${subset_vcf}" "${input_vcf}"

    # Index the subsetted VCF file.
    # -f replaces an index left by a previous run; without it tabix stops
    # when the index file already exists.
    tabix -f -p vcf "${subset_vcf}"
done

### **Optional:** Subset on samples for autosomal chromosomes.

In [None]:
%%bash -s "$references_directory"
# Receive directory variables from Python
references_directory=$1

# Define the directory for the 1000 Genomes reference panel
onekg_reference_panel_dir="${references_directory}/onekg_reference_panel"

# Define the populations
populations=("ASW" "ACB")

# Population codes joined with a space ("ASW ACB"); this exact text is part
# of the file names below, so every path that uses it must stay quoted.
population_tag="${populations[*]}"

# One random draw shared by all chromosomes. Drawing separately for each
# chromosome would select a different set of individuals on each one, so the
# per-chromosome subsets would not describe the same people.
shared_random_ids="${onekg_reference_panel_dir}/random_500_sample_ids_${population_tag}.txt"
random_ids_drawn=0

# Loop through each chromosome
for chromosome in {1..22}; do
    # Define the input VCF file path (the population subset created earlier)
    input_vcf="${onekg_reference_panel_dir}/1kGP_high_coverage_Illumina.chr${chromosome}.filtered.SNV_INDEL_SV_phased_panel.${population_tag}.vcf.gz"

    # Chromosomes without a population subset are skipped rather than failing.
    if [ ! -f "${input_vcf}" ]; then
        echo "Input VCF not found for chromosome ${chromosome}; skipping." >&2
        continue
    fi

    # Files written for this chromosome
    sample_ids="${onekg_reference_panel_dir}/sample_ids_chr${chromosome}_${population_tag}.txt"
    random_ids="${onekg_reference_panel_dir}/random_500_sample_ids_chr${chromosome}_${population_tag}.txt"
    subset_vcf="${onekg_reference_panel_dir}/subset_500_chr${chromosome}_${population_tag}.vcf.gz"

    # Extract sample names from the subsetted VCF file
    bcftools query -l "${input_vcf}" > "${sample_ids}"

    # Randomly select 500 sample names (all of them when fewer are available),
    # once per run, from the first chromosome that is processed.
    if [ "${random_ids_drawn}" -eq 0 ]; then
        shuf -n 500 "${sample_ids}" > "${shared_random_ids}"
        random_ids_drawn=1
    fi

    # Keep the per-chromosome list file that earlier versions produced.
    cp "${shared_random_ids}" "${random_ids}"

    # Subset the VCF file based on the random 500 sample IDs
    bcftools view -S "${random_ids}" -Oz -o "${subset_vcf}" "${input_vcf}"

    # Index the subsetted VCF file.
    # -f replaces an index left by a previous run; without it tabix stops
    # when the index file already exists.
    tabix -f -p vcf "${subset_vcf}"
done

## Exploring the VCF Files

Now that we have downloaded the VCF files and selected a subset of 500 individuals, it's time to explore the VCF files. VCF (Variant Call Format) files contain information about genetic variants found in the samples.

In this section, we will:
1. Load a VCF file for a specific chromosome.
2. Explore the structure of the VCF file.
3. Examine some basic statistics.

Let's get started!

### Installing Required Packages

In data analysis and scientific computing, we often rely on specialized packages to perform specific tasks. These packages are collections of functions and methods that allow us to perform operations without having to write code from scratch.

Before we can use these packages, we need to install them. This is usually a one-time operation. Below, we will install the `scikit-allel` package, which provides tools for bioinformatics and genomics, particularly in the domain of high-throughput sequencing.

Let's proceed to install the package.

In [None]:
# https://scikit-allel.readthedocs.io/en/stable/
!sudo apt-get update
!sudo apt-get install build-essential
!pip install scikit-allel

### Loading a VCF File

We will start by loading a VCF file for a specific chromosome. For demonstration purposes, let's focus on chromosome 21, the chromosome we downloaded and subsetted above.

In [None]:
import allel  # Importing the scikit-allel package
# https://scikit-allel.readthedocs.io/en/stable/index.html

print("Ignore the 'UserWarning: invalid INFO header' warning.\n")

# Location of the 500-sample chromosome 21 subset inside the reference panel
# directory.
# NOTE for instructor: change to use one populations variable across Jupyter Notebook.
vcf_path_chromosome_21_subset = os.path.join(
    references_directory,
    "onekg_reference_panel",
    "subset_500_chr21_ASW ACB.vcf.gz",
)

# Handle the missing-file case first; otherwise read the VCF and list the
# keys of the resulting callset.
if not os.path.exists(vcf_path_chromosome_21_subset):
    print(f"The subsetted VCF file does not exist at {vcf_path_chromosome_21_subset}. Please make sure the file is in the correct location.")
else:
    callset = allel.read_vcf(vcf_path_chromosome_21_subset)
    print("\n\nKeys in the VCF file:", list(callset.keys()))

# NOTE: add instructions for if it crash here.

print("Notice the output are the keys in the VCF file. We will use the second key, 'calldata/GT', 'variants/ALT', and 'variants/REF'")


## Understanding Allele Frequencies and SNPs

Before we proceed to subset our data, it's essential to understand some key genetic concepts: Allele Frequencies and Single Nucleotide Polymorphisms (SNPs).

### Allele Frequencies

Allele frequencies describe how often a particular variant (allele) of a gene appears within a given population. It is usually expressed as a proportion or percentage. Understanding allele frequencies is crucial for studying genetic diversity and for identifying alleles that may be associated with specific traits or diseases.

### Single Nucleotide Polymorphisms (SNPs)

SNPs are variations at a single position in a DNA sequence among individuals. They are the most common type of genetic variation and serve as markers for locating genes associated with diseases or specific traits.

### What Makes a SNP a SNP?
A Single Nucleotide Polymorphism occurs when a single nucleotide (A, T, C, or G) in the genome sequence is altered. For a variation to be considered a SNP, it must occur in at least 1% of the population. This distinguishes SNPs from random mutations, which are rare and may occur in any individual. SNPs can be synonymous (do not change the protein sequence) or non-synonymous (change the protein sequence), and they can occur in coding or non-coding regions of the genome.

### Assumptions in SNP Analysis
In SNP analysis, it is generally assumed that the nucleotide sequences between the SNPs are conserved, or identical, across the individuals being studied. This assumption allows us to focus on the SNPs as markers of genetic variation without having to analyze the entire genome.

This is a reasonable assumption because:

- Most of the human genome is highly conserved.
- SNPs are the most common form of genetic variation, making them effective markers for genetic diversity.

By focusing on SNPs, we can efficiently study genetic variation and its implications for traits, diseases, and population history.

### Relatedness and Shared SNPs

#### What Does It Mean If Two Individuals Share the Same SNP?

Sharing a single SNP between two individuals doesn't necessarily imply a close genetic relationship, as SNPs can be quite common in populations. However, sharing the same SNPs over a specific length of DNA sequence can be a strong indicator of relatedness. **The length of the DNA sequence where the SNPs are shared is crucial.** A longer stretch of shared SNPs increases the likelihood that the two individuals are closely related. This is often measured in centimorgans (cM), a unit that describes the genetic distance between positions on a chromosome.

#### Implications for Relatedness

1. **Close Relatives**: Close relatives like siblings or parent-child pairs will share long stretches of SNPs.
  
2. **Distant Relatives**: More distant relatives like cousins may share shorter stretches but still longer than what would be expected by random chance.

3. **Very Distantly Related Individuals**: In individuals who share a distant common ancestry, any shared SNPs are likely to be short, often less than 1 centimorgan (cM). These short stretches are scattered randomly across the genome. While they are not indicative of a close or recent familial relationship, they do reflect shared ancestors from a distant past.

By analyzing the length and distribution of shared SNPs, researchers can infer the degree of relatedness between individuals, which is valuable in studies including Migration Patterns, Adaptation Studies, Archaeogenetics, Forensic Anthropology, and Evolutionary Anthropology.

### Identifying SNPs

1. **Total SNPs**: The total number of SNPs identified in the dataset gives us an overview of the genetic variation present.
  
2. **SNPs by Superpopulation**: Breaking down the SNPs by superpopulation allows us to understand how genetic variations are distributed among different groups. This is crucial for studies that aim to understand the genetic basis of population-specific traits or susceptibilities.

In [None]:
import numpy as np

# Build the filtered genotype array: keep only biallelic single-nucleotide
# variants (exactly one alternate allele, and both REF and ALT are one of A/C/G/T).

# callset variable was created in the previous code cell.
gt = allel.GenotypeArray(callset['calldata/GT'])

# Get the reference and alternate alleles from the VCF file.
ref_alleles = callset['variants/REF']
# Full ALT table: one row per variant, one column per alternate-allele slot.
# Unused slots are expected to hold '' (or '.') -- the same placeholders the
# original mask discarded.
all_alt_alleles = np.asarray(callset['variants/ALT'])
# Take all rows (:) and the first column (0): the first alternate allele of each variant.
alt_alleles = all_alt_alleles[:, 0]

# Create a boolean mask for biallelic variants.
# A variant is biallelic when exactly ONE alternate-allele slot is populated.
# (Inspecting only the first ALT string, as was done before, cannot detect
# multi-allelic sites because it never looks at the remaining ALT columns.)
alt_is_present = (all_alt_alleles != '') & (all_alt_alleles != '.')
is_biallelic = np.count_nonzero(alt_is_present, axis=1) == 1

# Create a boolean mask for valid SNPs: REF and first ALT must each be a single base.
valid_bases = {'A', 'T', 'C', 'G'}
base_list = sorted(valid_bases)
is_valid_snp = np.isin(ref_alleles, base_list) & np.isin(alt_alleles, base_list)

# Combine the two masks
final_mask = is_biallelic & is_valid_snp

# Apply the mask to the genotype array (axis=0 selects variants/rows)
gt_filtered = gt.compress(final_mask, axis=0)

print(f"Total number of samples: {gt_filtered.n_samples}")
print(f"Total number of variants: {gt.shape[0]:,}")
print(f"Total number of biallelic variants: {gt_filtered.n_variants:,}")

print()

### Calculate allele counts

We use the `gt_filtered` genotype array created above to calculate the allele counts using the `count_alleles()` method.
This gives us the number of occurrences of each allele for each variant.


In [None]:
# Per-variant allele counts: one row per variant, one column per allele.
ac = gt_filtered.count_alleles()

# Row-wise totals and the count of the most common allele for each variant.
total_allele_count = ac.sum(axis=1)
max_allele_count = ac.max(axis=1)

# A variant varies when the most common allele does not account for
# every allele observed at that site.
does_vary = total_allele_count > max_allele_count

# Explanatory header, emitted one entry per line.
print(
    "\nVariability Analysis:",
    "If a genetic variant has the same allele in all individuals, it's non-variable.",
    "The following shows whether there's variability among the first few variants:",
    "\n",
    "Allele Count:",
    "count_for_first_allele\tcount_for_second_allele",
    "reference_allele_count\talt_allele_count",
    sep="\n",
)
print(ac[:5])  # the first 5 variants
print(
    f"\nTotal number of genetic variants that vary among the {gt_filtered.n_samples} individuals "
    f"({gt_filtered.n_samples * 2} haplotypes): {np.count_nonzero(does_vary):,}"
)

985: This is the count of the first allele for the first genetic variant. It means that this particular allele appears 985 times across all the haplotypes you've sampled (each diploid individual contributes two haplotypes).

15: This is the count of the second allele for the same variant. It means that this second allele appears 15 times across your sampled haplotypes.

So, for this first genetic variant, you have two alleles with the counts of 985 and 15, respectively. This indicates that in your sample, the first allele is much more common than the second allele for this specific variant.

**What is ac?**

`ac` is an Allele Counts Array, where each row represents a variant (like a SNP or an indel), and each column represents an allele.

For bi-allelic SNPs, you'll typically have two columns: one for each allele (e.g., A and T).

Here's a simplified example of what ac might look like for 3 variants:

```
20  30  # Variant 1: 20 counts of allele 1, 30 counts of allele 2
40  10  # Variant 2: 40 counts of allele 1, 10 counts of allele 2
25  25  # Variant 3: 25 counts of allele 1, 25 counts of allele 2
```
**What does axis=1 mean?**

In NumPy (and many other Python libraries), an array can have multiple dimensions. The axis parameter specifies which dimension you want to perform the operation along.

`axis=0` means that the operation will be performed vertically (down the rows for each column).
`axis=1` means that the operation will be performed horizontally (across the columns for each row).

For our example ac array:

The sum for Variant 1 would be `20 + 30 = 50`

The sum for Variant 2 would be `40 + 10 = 50`

The sum for Variant 3 would be `25 + 25 = 50`

So, ac.sum(axis=1) would return `[50, 50, 50]` in this example.

**Why is this useful?**

Summing the allele counts for each variant gives you the total number of alleles observed for that variant. This is useful for various types of genetic analyses, including identifying SNPs, calculating allele frequencies, and more.

`ac.max(axis=1)`

This finds the maximum allele count for each variant. Essentially, it identifies the most common allele for each variant.

`(ac.sum(axis=1) - ac.max(axis=1)) > 0`

This expression calculates the difference between the total number of alleles and the count of the most common allele for each variant. The idea is to find out how many of the "other" alleles are present.

For example, if a variant has allele counts `[20, 30]`, the sum would be `50`, and the max would be `30`. The difference `50 - 30` would be `20`, representing the count of the less common allele.

If the value of `ac.max(axis=1)` is the same as `ac.sum(axis=1)`, it means that all observed alleles for that particular variant are the same. In other words, there is only one type of allele present for that variant.

For example, let's say a variant has allele counts `[50, 0]`. The sum would be `50`, and the max would also be `50`. The difference `50 - 50` would be `0`, and `(ac.sum(axis=1) - ac.max(axis=1)) > 0` would evaluate to `False` for this variant, indicating that it is not a SNP.

So, when the sum and the max are the same, the variant is not considered a Single Nucleotide Polymorphism (SNP) because there is no variation; all observed alleles are the same.

If the count of the less common allele is greater than zero, that means there is more than one type of allele present for that variant, making it a Single Nucleotide Polymorphism (SNP).

`np.count_nonzero(does_vary)`
Finally, this counts the number of `True` values in the `does_vary` array, giving you the total number of variants that show variation (candidate SNPs).

Putting it all together
The line `does_vary = (ac.sum(axis=1) - ac.max(axis=1)) > 0` is a way to identify candidate SNPs by checking if there is more than one type of allele present for each variant. Then, `np.count_nonzero(does_vary)` counts how many such variants have been identified.

## Minor Allele Frequency
What is Minor Allele Frequency?

In a given population, for a particular genetic variant (or SNP), the allele that occurs less frequently is termed the "minor allele." The frequency of this minor allele in the population is known as the Minor Allele Frequency (MAF). It is calculated as the count of the minor allele divided by the total number of alleles examined.

**Why is MAF Important?**

Statistical Power: Variants with extremely low MAF are often excluded because they may lack the statistical power to detect association with a trait or disease.

Quality Control: Filtering by MAF is a common quality control step to remove potential errors in variant calling.

Biological Relevance: Variants with higher MAF are more likely to be biologically relevant and less likely to be random mutations.

Calculating MAF with scikit-allel
In scikit-allel, you can calculate the MAF as follows:

In [None]:
# Summarise how many of the filtered variants show any variation at all.
print(
    f"Of the {gt_filtered.shape[0]:,} variants in chromosome 21, "
    f"we determine that {np.count_nonzero(does_vary):,} vary, but are not necessarly SNPs."
)

In [None]:
# Calculate Minor Allele Frequency (MAF)

# Reuses the per-variant counts computed in the allele-count cell:
#   max_allele_count   = ac.max(axis=1)
#   total_allele_count = ac.sum(axis=1)

# Every allele copy that is not the most common allele counts as "minor".
minor_allele_count = total_allele_count - max_allele_count

# A variant with no called alleles has a total of 0, so the division is 0/0.
# The result for such a variant is still NaN; only the NumPy RuntimeWarning
# is silenced. NaN compares False against the threshold below, so these
# variants are dropped by the filter.
with np.errstate(divide='ignore', invalid='ignore'):
    maf = minor_allele_count / total_allele_count


# gt_filtered.compress(maf > 0.05, axis=0) keeps only the rows (variants) of the
# filtered genotype array whose MAF is greater than 5%.
# A set of SNPs with a MAF greater than 5%
gt_maf_filtered_5 = gt_filtered.compress(maf > 0.05, axis=0)
print(f"Total number of common SNPs accross the whole dataset: {gt_maf_filtered_5.shape[0]:,}")

### Summary
Filtering by MAF is a crucial step in many genetic analyses. It helps in focusing on variants that are likely to be meaningful, thereby increasing the robustness and reliability of your results.