Make sure your kernel (virtual environment) is selected.

Run the download_fasta_file.ipynb notebook.

# Environment Setup

In [None]:
import os
import importlib.util

In [None]:
# Root working directory for the lab. Later cells treat any value that starts
# with '/content' as a Google Colab session (Drive is mounted and the tools
# directory is pinned to /content/use).
primary_directory = '/content'
primary_directory

In [None]:
# Mount Google Drive, but only when the working directory lives under /content.
if primary_directory.startswith('/content'):
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Resolve the working directories used throughout the notebook.
# Under /content the tools directory is pinned to /content/use; otherwise it
# sits inside primary_directory alongside the other folders.
use_directory = (
    "/content/use"
    if primary_directory.startswith('/content')
    else os.path.join(primary_directory, "use")
)
results_directory = os.path.join(primary_directory, "results")
references_directory = os.path.join(primary_directory, "references")
data_directory = os.path.join(primary_directory, "data")

# Every directory the notebook expects to find
directories = [use_directory, results_directory, references_directory, data_directory]

# Create whichever directories are missing and report the outcome for each one
for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory created: {directory}")
    else:
        print(f"Directory exists: {directory}")

In [None]:
# Function to check if a package is installed
def check_install_package(package_name, pip_name=None):
    """Install a package with pip when it cannot be imported.

    Args:
        package_name: Import name looked up with ``importlib.util.find_spec``.
        pip_name: Name passed to ``pip install``; defaults to ``package_name``
            when omitted or empty.

    Prints whether the package was already available, and reports a failed
    installation instead of raising.
    """
    import subprocess
    import sys

    try:
        package_spec = importlib.util.find_spec(package_name)
    except (ImportError, ValueError):
        # find_spec raises for dotted names whose parent package is missing
        # (and for some malformed module states); treat that as "not installed".
        package_spec = None

    if package_spec is None:
        print(f"{package_name} not found. Installing...")
        # Run pip through the kernel's own interpreter so the package lands in
        # the environment this notebook is actually using.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", pip_name if pip_name else package_name]
        )
        if result.returncode != 0:
            print(f"Installation of {package_name} failed (pip exit code {result.returncode}).")
        else:
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()
    else:
        print(f"{package_name} is already installed.")

check_install_package('pandas', 'pandas')

# Get Fasta file

In [None]:
%%bash -s "$references_directory"

references_directory=$1

# Create the fasta directory if it does not exist
mkdir -p "${references_directory}/fasta/"

# FASTA reference file for GRCh38 (primary assembly),
# from Ensembl at https://www.ensembl.org/ (release 109).
fasta_url="http://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
fasta_file="${references_directory}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

# Download and decompress only when the uncompressed FASTA is not already present
if [ ! -f "$fasta_file" ]; then
    echo "FASTA file does not exist. Downloading..."
    if wget -O "${fasta_file}.gz" "$fasta_url" && gunzip "${fasta_file}.gz"; then
        echo "Download and extraction completed successfully."
    else
        echo "Download or extraction failed."
        # wget -O creates the output file even when the transfer fails, and a
        # partial FASTA would make the next run believe the file is complete.
        # The FASTA did not exist before this branch, so removing it is safe.
        rm -f "${fasta_file}.gz" "$fasta_file"
        exit 1
    fi
else
    echo "FASTA file already exists."
fi

# 25m 3.2s

# Lab

In [None]:
import os # Imports Python's built-in os library, which allows us to interact with the operating system.
import glob
import shutil
import csv
import zipfile

In [None]:
# Directory that receives the per-individual files written by lineage.
output_dir = f"{results_directory}/lineage_output"

# Always start from an empty directory: anything left over from a previous
# run is removed before the directory is created again.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
    print(f"Directory {output_dir} deleted.")

os.makedirs(output_dir)
print(f"Directory {output_dir} created.")

In [None]:
# Folder inside the data directory that holds the OpenSNP genotype files
opensnp_data_directory = os.path.join(data_directory, "opensnp_data")

# Report an existing folder, otherwise create it
if os.path.exists(opensnp_data_directory):
    print(f"Directory already exists: {opensnp_data_directory}")
else:
    os.makedirs(opensnp_data_directory)
    print(f"Created directory: {opensnp_data_directory}")

Manually upload the lab data zip file (`lab3_data.zip`) to your OpenSNP data directory before running the next cell.

In [None]:
# Specify the path to the zip file
lab3_data_zip = os.path.join(opensnp_data_directory, "lab3_data.zip")

if os.path.exists(lab3_data_zip):
    # Extract all the contents of the zip file to the OpenSNP data directory
    with zipfile.ZipFile(lab3_data_zip, "r") as zip_ref:
        zip_ref.extractall(opensnp_data_directory)

    # Delete the zip file once its contents are on disk
    os.remove(lab3_data_zip)

    print("Zip file extracted and deleted successfully.")
elif glob.glob(os.path.join(opensnp_data_directory, "*.ancestry.txt")):
    # The zip is deleted after a successful extraction, so on a re-run it is
    # expected to be missing while the extracted files are still present.
    print("Zip file not found, but extracted data files are already present. Skipping extraction.")
else:
    raise FileNotFoundError(
        f"{lab3_data_zip} not found. Upload lab3_data.zip to {opensnp_data_directory} and run this cell again."
    )

In [None]:
# Glob pattern matching every "*.ancestry.txt" file in the OpenSNP data directory
file_pattern = os.path.join(opensnp_data_directory, "*.ancestry.txt")
file_pattern

In [None]:
# Expand the pattern into a list of the matching file paths
opensnp_files = glob.glob(file_pattern)
opensnp_files

`opensnp_files` is a list. Notice that the output above is enclosed in square brackets. This lets you know that it is a list.

In [None]:
# Index 0 is the first element of the list
opensnp_files[0]

In [None]:
# Index 1 is the second element of the list
opensnp_files[1]

In [None]:
# Negative indexes count from the end: -1 is the last element
opensnp_files[-1]

In [None]:
# -2 is the second-to-last element
opensnp_files[-2]

In [None]:
# Count the files already collected in opensnp_files rather than scanning the
# directory a second time, so the count always matches the list used below.
total_files = len(opensnp_files)
total_files

In [None]:
# Show the full path and the bare file name of every OpenSNP file
for file_path in opensnp_files:
    print("This is the full path:", file_path, sep="\n")
    # basename drops the directory part and keeps only the file name
    filename = os.path.basename(file_path)
    print("This is the file name:", filename, sep="\n")
    print("\n")

Run the cell below and notice the output of `print(count)`.

In [None]:
# Exercise cell (intentionally incomplete): `count` is initialised here, and
# the commented-out line inside the loop is the one to complete.
count = 0
for file_path in opensnp_files:
  # count =
  print(count)
  print("This is the full path:")
  print(file_path)
  # basename keeps only the file name part of the path
  filename = os.path.basename(file_path)
  print("This is the file name:")
  print(filename)
  print("\n")

Correct the cell above so that the counter works. Think about what needs to happen to the `count` value each time the loop runs.

In [None]:
# Number each file as it is visited: the counter starts at 0 and is increased
# by one at the top of every pass through the loop.
count = 0
for file_path in opensnp_files:
    count += 1
    print(count)
    print("This is the full path:", file_path, sep="\n")
    filename = os.path.basename(file_path)
    print("This is the file name:", filename, sep="\n")
    print("\n")

In [None]:
# Same numbered walk through the files, now also splitting each file name on
# "_" and taking the first piece as the username.
count = 0
for file_path in opensnp_files:
    count += 1
    print(count)
    print("This is the full path:", file_path, sep="\n")
    filename = os.path.basename(file_path)
    print("This is the file name:", filename, sep="\n")
    # split("_") returns a list of the pieces between underscores
    file_bits = filename.split("_")
    print("These are the elements in the filename list:", file_bits, sep="\n")
    username = file_bits[0]
    print("We can use this to isolate the username.", username, sep="\n")
    print("\n")

In [None]:
# Compact progress report: one line per file with its position and username.
len_opensnp_files = len(opensnp_files)
count = 0  # stays 0 when there are no files to process
for count, file_path in enumerate(opensnp_files, start=1):
    filename = os.path.basename(file_path)
    username = filename.split("_")[0]

    print(f"Processing file {count} in {len_opensnp_files}: {username}")

In [None]:
!pip install lineage

In [None]:
from lineage import Lineage

# Library documentation:
# https://snps.readthedocs.io/en/stable/
# https://lineage.readthedocs.io/en/stable/

# Lineage object used to create every individual below
l = Lineage(
    output_dir=output_dir,
    resources_dir=f"{references_directory}",
    parallelize=True,
    processes=8,
)

# Per-username results collected while looping over the files
individuals_dict = {}
sex_determination = {}
# Running file counter; stays 0 when no files are found
count = 0

# Collect the Ancestry files to process
directory_path = os.path.join(data_directory, "opensnp_data")
file_pattern = os.path.join(directory_path, "*.ancestry.txt")
opensnp_files = glob.glob(file_pattern)
len_opensnp_files = len(opensnp_files)

# Path for the sex determination TSV file
sex_determination_file = os.path.join(results_directory, "opensnp_sex_determination.tsv")

# Create one lineage individual per Ancestry file, keyed by username
for count, file_path in enumerate(opensnp_files, start=1):
    filename = os.path.basename(file_path)
    username = filename.split("_")[0]

    print(f"Processing file {count} in {len_opensnp_files}: {username}")

    # Options passed to create_individual (descriptions from the library docs):
    #   assign_par_snps (bool) – assign PAR SNPs to the X and Y chromosomes.
    #     With True the library reports: "Chromosome PAR not remapped;
    #     removing chromosome from SNPs for consistency".
    #   deduplicate_MT_chrom (bool) – deduplicate alleles on MT; see SNPs.heterozygous_MT
    #   deduplicate_XY_chrom (bool or str) – deduplicate alleles in the non-PAR
    #     regions of X and Y for males
    individual = l.create_individual(
        username,
        file=file_path,
        assign_par_snps=True,
        deduplicate_MT_chrom=True,
        deduplicate_XY_chrom=True,
    )
    individuals_dict[username] = individual

    # Bring every individual onto build 38 before writing it out
    if individual.build != 38:
        individual.remap(38)

    individual.sort()
    individual.to_tsv(os.path.join(output_dir, f"{username}.tsv"))

    # Determine sex (parameter descriptions from the library docs):
    #   heterozygous_x_snps_threshold (float) – percentage heterozygous X SNPs;
    #     above this threshold, Female is determined
    #   y_snps_not_null_threshold (float) – percentage Y SNPs that are not null;
    #     above this threshold, Male is determined
    #   chrom ({"X", "Y"}) – use X or Y chromosome SNPs to determine sex
    # Returns 'Male' or 'Female' if detected, else empty str
    sex_determination[username] = individual.determine_sex(
        heterozygous_x_snps_threshold=0.03,
        y_snps_not_null_threshold=0.3,
        chrom='X',
    )

# Save sex determinations to TSV: a header row, then one row per username
with open(sex_determination_file, 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerow(['Username', 'Sex'])
    for username, sex in sex_determination.items():
        writer.writerow([username, sex])

print("All files processed.")

You can ignore the "Chromosome PAR not remapped; removing chromosome from SNPs for consistency" note for now.

## Install bcftools and htslib

In [None]:
%%bash -s "$primary_directory" "$use_directory"

primary_directory=$1
use_directory=$2

# Install the packaged tabix and bcftools binaries
sudo apt-get update
sudo apt-get install -y tabix
sudo apt-get install -y bcftools

# Fetch the htslib and bcftools sources into the tools directory.
# Clone only when the checkout is missing: git clone refuses to write into an
# existing non-empty directory, which would break a re-run of this cell.
cd "$use_directory"
[ -d htslib ] || git clone --recurse-submodules https://github.com/samtools/htslib.git
[ -d bcftools ] || git clone https://github.com/samtools/bcftools.git

# Build bcftools from source
cd "${use_directory}/bcftools"
make
# NOTE: this export only lasts for the shell that runs this cell. A later
# %%bash cell that needs the plugins has to export BCFTOOLS_PLUGINS again.
export BCFTOOLS_PLUGINS="${use_directory}/bcftools/plugins"

cd "$primary_directory"

## Install gawk

In [None]:
!sudo apt-get update
!sudo apt-get install gawk

In [None]:
%%bash -s "$results_directory" "$references_directory"

results_directory=$1
references_directory=$2

# Create one indexed, compressed VCF file per sample

# First, convert the files from AncestryDNA tsv format to the 23andMe tsv format
# (gawk joins fields 4 and 5 into a single genotype column)
# Then, bcftools converts the data file from 23andMe tsv to vcf
# Then, each vcf file is indexed
# The individual vcf files are merged into a single vcf file in a later cell

for file in "${results_directory}"/lineage_output/*.tsv
do
    # Skip the *_modified.tsv intermediates written by an earlier run of this
    # cell; they also match *.tsv and would otherwise be converted again as
    # extra samples.
    case "$file" in
        *_modified.tsv) continue ;;
    esac
    # When nothing matches, the loop receives the unexpanded pattern itself
    [ -e "$file" ] || continue

    sample=$(basename "$file" .tsv)
    vcf_file="$(dirname "$file")/${sample}.vcf.gz"

    echo "converting to vcf.gz: " "$file"
    # Create a new file with the modified format
    new_file="${file%.tsv}_modified.tsv"
    gawk -F'\t' '{ print $1"\t"$2"\t"$3"\t"$4$5; }' "$file" > "$new_file"
    bcftools convert -c ID,CHROM,POS,AA -s "$sample" \
            --haploid2diploid \
            -f "${references_directory}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa" \
            --tsv2vcf "$new_file" \
            -Oz -o "$vcf_file"

    echo "indexing vcf file" "$vcf_file"
    # -f overwrites an index left by a previous run instead of failing
    bcftools index -f "$vcf_file"
done

### Understanding the output

```
Rows total: 	668742
Rows skipped: 	7514
Sites written: 	661228
Missing GTs: 	12176
Hom RR: 	349401
Het RA: 	177931
Hom AA: 	121589
Het AA: 	131
```



`Rows total: 668742`

Total number of variant records processed in the TSV file.

`Rows skipped: 7514`

The number of records that were skipped, potentially due to formatting issues or not meeting certain criteria for conversion.

`Sites written: 661228`

Number of variant sites successfully converted and written to the VCF file.

`Missing GTs: 12176`

GTs stands for Genotypes. This count indicates that there were 12,176 instances where the genotype information was missing.

<br>
In the following, `R` stands for reference and `A` stands for alternative.

`Hom RR: 349401`

Number of homozygous reference genotypes. In these cases, both alleles at a particular site match the reference genome.

`Het RA: 177931`

Number of heterozygous genotypes where one allele is the reference allele and the other is an alternative allele.

`Hom AA: 121589`

Number of homozygous alternative genotypes. Here, both alleles are the alternative variant, differing from the reference genome.

`Het AA: 131`

The count of heterozygous genotypes where both alleles are different alternative alleles (neither matches the reference).

## Create a VCF file

In [None]:
%%bash -s "$results_directory" "$references_directory"

results_directory=$1
references_directory=$2

# List the per-sample VCFs; sort so the sample order in the merged file is the
# same on every run (find returns entries in no particular order).
find "${results_directory}/lineage_output" -type f -name "*.vcf.gz" | sort > "${results_directory}/file_list.txt"
# Merge all samples into one VCF, then collect per-sample statistics for it
bcftools merge -o "${results_directory}/MergedSamples.vcf" --file-list "${results_directory}/file_list.txt"
bcftools stats -s - "${results_directory}/MergedSamples.vcf" > "${results_directory}/MergedSamples_step0_stats.vchk"
# The file list is only needed for the merge
rm "${results_directory}/file_list.txt"

In [None]:
# very basic parser for the MergedSamples stats file

# Path to the stats file
file_path = os.path.join(results_directory, "MergedSamples_step0_stats.vchk")

# Function to parse the file
def parse_summary_numbers(file_path):
    """Print and collect the summary-number (SN) lines of a bcftools stats file.

    Every line that starts with ``SN`` is printed. Lines with at least four
    tab-separated fields are also stored, using the third field (without its
    trailing colon) as the key and the fourth field as the value.

    Args:
        file_path: Path to the stats (.vchk) file.

    Returns:
        dict mapping each summary label to its value; values are converted to
        int when possible and kept as strings otherwise. A label that occurs
        more than once keeps its last value.
    """
    sn_data = {}
    with open(file_path, 'r') as file:
        for line in file:
            if not line.startswith('SN'):
                continue

            # Drop the line's own newline so print does not add a blank line
            text = line.rstrip('\n')
            print(text)

            fields = text.split('\t')
            if len(fields) < 4:
                # Not in the expected "SN <id> <key>: <value>" layout
                continue

            key = fields[2].strip().rstrip(':')
            value = fields[3].strip()
            try:
                sn_data[key] = int(value)
            except ValueError:
                sn_data[key] = value

    return sn_data

# Call the function and process the results
stats_data = parse_summary_numbers(file_path)

## Quality Controls

In [None]:
%%bash -s "$use_directory"

# Define the directory and file URL
use_directory=$1
echo "$use_directory"

plink2_file="https://s3.amazonaws.com/plink2-assets/alpha5/plink2_linux_x86_64_20240105.zip"
plink2_zip=$(basename "$plink2_file")

# Download plink2 only when the binary is not already in the tools directory
if [ ! -f "${use_directory}/plink2" ]; then
    echo "Downloading plink2..."
    # Download the archive into the tools directory
    wget "${plink2_file}" -P "${use_directory}"
    # Unzip the downloaded file, then remove the archive.
    # -o overwrites leftovers from an interrupted run instead of prompting.
    (cd "${use_directory}" && unzip -o "${plink2_zip}" && rm "${plink2_zip}")
fi


In [None]:
%%bash -s "$results_directory" "$references_directory" "$use_directory"

results_directory=$1
references_directory=$2
use_directory=$3
sample_file=MergedSamples

# Stop at the first failing command so a failed step is not followed by steps
# that would try to read its missing output.
set -e

# Quality Control

# Perform quality control on the MergedSamples.vcf file using plink2

# Quality control options:
# --autosome keep only autosomal SNPs
# --rm-dup remove duplicate SNPs, keeping the first occurrence
# --vcf-half-call treat half-calls as missing (listed for reference; not passed below)
# --snps-only outside of {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', <missing code>} are excluded
# --min-alleles 2 and --max-alleles 2 keep SNPs where there are only 2 alleles
# The filtered data set is split by chromosome in a later cell

# plink2 is run from the tools directory for all three steps
cd "${use_directory}"

# Step 1: import the VCF, keeping autosomal A/C/G/T SNPs only
./plink2 \
  --vcf "${results_directory}/${sample_file}.vcf" \
  --autosome \
  --snps-only 'just-acgt' \
  --make-pgen \
  --out "${results_directory}/${sample_file}_step1"

# These comments are from a different run. Shown here for illustrative purposes only.
# started with 816509 variants
# --vcf: 783106 variants scanned (33403 skipped).
# --vcf: 720k variants converted.
# 783102 out of 783106 variants loaded
# 783102 variants remaining after main filters.
###################################################

# Step 2: remove duplicates and keep variants with exactly two alleles
./plink2 \
  --pfile "${results_directory}/${sample_file}_step1" \
  --rm-dup force-first \
  --min-alleles 2 \
  --max-alleles 2 \
  --make-pgen \
  --out "${results_directory}/${sample_file}_step2"

# These comments are from a different run. Shown here for illustrative purposes only.
# 616190 out of 783102 variants loaded
# --rm-dup: 5 duplicated IDs, 5 variants removed.
# 616185 variants remaining after main filters.
# #####################################################

# Step 3: apply the --geno and --maf filters and sort the variants
./plink2 \
  --pfile "${results_directory}/${sample_file}_step2" \
  --geno 0.01 \
  --maf 0.01 \
  --sort-vars \
  --make-pgen \
  --out "${results_directory}/${sample_file}_step3"

# --mind 0.01 for sample missingness (not applied here)
# #############################################################
# #############################################################

In [None]:
%%bash -s "$results_directory" "$references_directory" "$use_directory"

# split by chromosome

results_directory=$1
references_directory=$2
use_directory=$3
sample_file=MergedSamples

# plink2 is run from the tools directory
cd "${use_directory}"

# Export one compressed, indexed VCF per autosome from the step 3 data set
for chromosome in {1..22}
do
  chromosome_prefix="${results_directory}/${sample_file}_qcfinished_chr${chromosome}"

  ./plink2 \
    --pfile "${results_directory}/${sample_file}_step3" \
    --chr ${chromosome} \
    --export vcf \
    --out "${chromosome_prefix}"

  # -f overwrites the .vcf.gz left by a previous run; without it bgzip will
  # not replace an existing output file.
  bgzip -f "${chromosome_prefix}.vcf"
  bcftools index -f "${chromosome_prefix}.vcf.gz"
done

## Zip and download your results directory

If you're using Google Colab, you'll need to download your results directory so that you can use it in subsequent labs. Run the following cell to create a zip file of your results directory. You can then download the zip file.

In [None]:
import os
import zipfile

def zip_directory(directory_path, zip_path):
    """Write every file found under ``directory_path`` into a new zip archive.

    Archive member names are relative to the parent of ``directory_path``, so
    each entry starts with the directory's own name. Only files are added;
    a folder without files produces no entry.

    Args:
        directory_path: Directory whose files are archived (recursively).
        zip_path: Location of the zip file to create (overwritten if present).
    """
    # Names inside the archive are computed relative to the parent directory
    archive_root = os.path.join(directory_path, '..')

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, file_names in os.walk(directory_path):
            for file_name in file_names:
                full_path = os.path.join(current_dir, file_name)
                member_name = os.path.relpath(full_path, archive_root)
                archive.write(full_path, member_name)

# Zip the directory this session actually wrote its results to. With
# primary_directory set to '/content' this is "/content/results".
directory_to_zip = results_directory

# Specify the path where the zip file will be saved
zip_file_path = "/content/results.zip"

# Only build the archive when the tools directory is the Colab one
if use_directory == '/content/use':
    # Create the zip file
    zip_directory(directory_to_zip, zip_file_path)