## ACKR1 gene: macaque

Data from the Macaque Genotype and Phenotype Resource (mGAP) at https://mgap.ohsu.edu/.

Variant catalog 1.3.


In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from selectiontest import selectiontest
import pathlib
import pysam
from vcf import Reader        # https://pypi.org/project/PyVCF/
from Bio import SeqIO
import gzip, pickle

Use ACKR1 coordinates from mGAP

In [2]:
def cleanvcf(vcf_file, coord, start, end, select_chr=True):
    """
    Identify samples that have a missing genotype at any SNP in a region.

    Parameters
    ----------
    vcf_file: pyvcf class: Reader (https://pyvcf.readthedocs.io/en/latest/)
        Variant details. Only its ``fetch`` method is used.

    coord: str
        Coordinate (e.g. chromosome). Converted with ``str`` before fetching.

    start: int
        Start position of sequence.

    end: int
        End position of sequence.

    select_chr: bool
        Unused; retained so the signature parallels vcf2sfs3.

    Returns
    -------
    list
        Sorted, de-duplicated names of samples whose ``gt_bases`` is None
        for at least one SNP record in the region.

    """
    bad_samples = set()
    for record in vcf_file.fetch(str(coord), start, end):
        # Non-SNP records are ignored, matching what vcf2sfs3 counts.
        if not record.is_snp:
            continue
        bad_samples.update(
            call.sample for call in record.samples if call.gt_bases is None
        )
    # Sort so the result does not depend on set iteration order.
    return sorted(bad_samples)
    
    
def vcf2sfs3(vcf_file, n, coord, start, end, bad_samples, select_chr=True):
    """
    Get the site frequency spectrum (SFS) of non-reference alleles from vcf
    data for a given sequence, ignoring samples listed in bad_samples.

    Parameters
    ----------
    vcf_file: pyvcf class: Reader (https://pyvcf.readthedocs.io/en/latest/)
        Variant details. Only its ``fetch`` method is used.

    n: int
        Total number of samples in the vcf file (before exclusions).

    coord: str
        Coordinate (e.g. chromosome). Converted with ``str`` before fetching.

    start: int
        Start position of sequence.

    end: int
        End position of sequence.

    bad_samples: list
        List of samples with missing data. These samples are skipped and
        subtracted from the sample size.

    select_chr: bool
        If True, sample first chromosome. If False, use both.

    Returns
    -------
    numpy.ndarray
        Site frequency spectrum: entry ``i - 1`` is the number of SNPs at
        which exactly ``i`` sampled chromosomes carry a non-reference allele.

    int
        Sample size (number of chromosomes) after removing bad samples.

    list
        Positions (``record.POS``) of SNPs that do not segregate in the
        sample, i.e. whose non-reference count is 0 or the full sample size.

    Notes
    -----
    Also prints the number of SNP records processed.

    """
    # A set gives constant-time membership tests and ignores duplicate names.
    excluded = set(bad_samples)
    n = n - len(excluded)
    if not select_chr:
        n = 2 * n
    count = 0
    allele_counts = list()
    non_seg_snps = list()
    for record in vcf_file.fetch(str(coord), start, end):
        if not record.is_snp:
            continue
        count += 1
        allele_count = 0
        for sample in record.samples:
            if sample.sample in excluded:
                continue
            gt = sample.gt_alleles
            alleles = (gt[0],) if select_chr else (gt[0], gt[1])
            # Count each non-reference allele once. Summing the raw allele
            # indices would count index 2 (a second ALT allele at a
            # multi-allelic site) as two copies.
            allele_count += sum(int(allele) > 0 for allele in alleles)
        if 0 < allele_count < n:    # Some SNPs may not segregate in some subpopulations.
            allele_counts.append(allele_count)
        else:
            non_seg_snps.append(record.POS)
    sfs = np.zeros(n - 1, int)
    for copies, num_sites in Counter(allele_counts).items():
        sfs[copies - 1] = num_sites
    print('Total SNPs processed           = ', count)
    return sfs, n, non_seg_snps

In [3]:
# Driver cell: build the SFS for the ACKR1 region and run the neutrality statistics.
chrom = 'chr01'
ACKR1_start = 133909860  # Use ACKR1 coordinates from mGAP
ACKR1_end =   133912994
# NOTE(review): absolute, machine-specific path; point this at the local copy of the mGAP VCF.
vcf_filename = '/Users/helmutsimon/Data sets/mGAP_macaque/mGap.v1.3.vcf.gz'
fname = pathlib.Path(vcf_filename)
assert fname.exists(), f'No such file: {fname}'  # check that the file exists
vcf_file = Reader(filename=vcf_filename, compressed=True, encoding='utf-8')
nsam = len(vcf_file.samples)
print('Total samples                  = ', nsam)
# Samples with a missing genotype (gt_bases is None) at any SNP in the window are dropped.
bad_samples = cleanvcf(vcf_file, chrom, ACKR1_start, ACKR1_end)
print('No. bad samples                = ', len(bad_samples))
# select_chr is left at its default (True), so only the first allele of each genotype is used
# and the reported sample size n is nsam minus the number of bad samples.
sfs, n, non_seg_snps = vcf2sfs3(vcf_file, nsam, chrom, ACKR1_start, ACKR1_end, bad_samples)
print('SFS: ', sfs)
print('Number of segregating sites    = ', sum(sfs))
print('Sample size                    = ', n)
print('SNPs not segregating in sample = ', len(non_seg_snps))
# NOTE(review): reps is presumably the number of sampling replicates used by selectiontest,
# which would make rho vary slightly between runs; confirm against the selectiontest docs.
rho = selectiontest.test_neutrality(sfs, reps=1000000)
print('Rho                            = ', rho)
print("Tajima's D                     = ", selectiontest.calculate_D(sfs))

Total samples                  =  213
No. bad samples                =  120
Total SNPs processed           =  44
SFS:  [2 7 3 3 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Number of segregating sites    =  20
Sample size                    =  93
SNPs not segregating in sample =  24
Rho                            =  -0.26223129593716443
Tajima's D                     =  -1.0479684354789265
