## ACKR1_gene_chimpanzee

Data obtained from http://biologiaevolutiva.org/greatape

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
from collections import Counter
from selectiontest import selectiontest
import pysam
from vcf import Reader        # https://pypi.org/project/PyVCF/
from Bio import SeqIO
from selectiontest import selectiontest
import gzip, pickle

Set hg18 (NCBI36 Ensembl release 54) coordinates for gene ACKR1 from http://may2009.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000213088

In [2]:
# Region of interest: gene ACKR1 in hg18 (NCBI36) coordinates.
chrom = 'chr1'
ACKR1_start_hg18 = 157_439_721
ACKR1_end_hg18 = 157_442_914

Calculate the site frequency spectrum (SFS), ignoring variants with missing data

In [3]:
def vcf2sfs2(vcf_file, panel, coord, start, end, select_chr=True):
    """Compute the site frequency spectrum (SFS) of a region for the panel samples.

    Only records flagged as SNPs are used. A SNP is discarded when any panel
    sample has no genotype call at that site.

    Parameters
    ----------
    vcf_file : object
        Must provide ``fetch(chrom, start, end)`` yielding variant records
        (in this notebook, a PyVCF ``Reader``).
    panel : pandas.DataFrame
        Its index holds the names of the samples to include; its row count
        sets the sample size. Every panel sample is assumed to be present in
        the VCF, otherwise ``n`` overstates the number of alleles examined.
    coord : str
        Chromosome / contig name; converted with ``str`` before fetching.
    start, end : int
        Region bounds, passed unchanged to ``vcf_file.fetch``.
        NOTE(review): PyVCF documents ``fetch`` coordinates as zero-based,
        half-open -- confirm whether 1-based gene coordinates need
        ``start - 1`` at the call site.
    select_chr : bool, default True
        If True, only the first allele of each genotype is used and the
        sample size is the number of panel rows. If False, the first two
        alleles are used (diploid calls assumed) and the sample size is twice
        the number of panel rows.

    Returns
    -------
    sfs : numpy.ndarray of int, length ``n - 1``
        ``sfs[i - 1]`` is the number of SNPs carrying ``i`` non-reference alleles.
    n : int
        Sample size (number of alleles considered per site).
    non_seg_snps : list
        ``POS`` of SNPs without missing data whose non-reference count is 0 or ``n``.
    count : int
        Number of SNP records seen in the region.
    valid_snp_count : int
        Number of those SNPs with no missing data in the panel.
    """
    n = panel.shape[0]
    if not select_chr:
        n = 2 * n
    # Hoisted out of the loops: membership is tested for every sample of every record.
    panel_samples = set(panel.index)
    alleles_used = 1 if select_chr else 2

    count, valid_snp_count = 0, 0
    allele_counts = []
    non_seg_snps = []
    for record in vcf_file.fetch(str(coord), start, end):
        if not record.is_snp:
            continue
        count += 1
        allele_count = 0
        missing_data = False
        for sample in record.samples:
            if sample.sample not in panel_samples:
                continue
            if sample.gt_bases is None:
                # One uncalled genotype disqualifies the whole record,
                # so there is no point in scanning the remaining samples.
                missing_data = True
                break
            gt = sample.gt_alleles
            for k in range(alleles_used):
                # gt holds allele *indices* ('0' = reference, '1', '2', ... =
                # alternates). Count each non-reference allele once rather
                # than adding the index, which would over-count genotypes
                # at multi-allelic sites (e.g. '2' would add two alleles).
                if int(gt[k]) > 0:
                    allele_count += 1
        if missing_data:
            continue
        valid_snp_count += 1
        if 0 < allele_count < n:    #Some SNPs may not segregate in some subpopulations.
            allele_counts.append(allele_count)
        else:
            non_seg_snps.append(record.POS)

    sfs = np.zeros(n - 1, int)
    for i, num_sites in Counter(allele_counts).items():
        sfs[i - 1] = num_sites
    print('Total SNPs processed        = ', count)
    print('SNPs without missing data   = ', valid_snp_count)
    return sfs, n, non_seg_snps, count, valid_snp_count


# Both input files live in one directory; change this single path to relocate the data.
data_dir = pathlib.Path('/Users/helmutsimon/OneDrive - Australian National University/Data sets/Pan_troglodytes')
fname = data_dir / 'Pan_troglodytes.vcf.gz'
panel_path = data_dir / 'Pan_trog_panel.csv'
vcf_filename = str(fname)
panel_name = str(panel_path)

# Fail early, naming the offending path, if either input file is missing.
for required_file in (fname, panel_path):
    assert required_file.exists(), f'No such file: {required_file}'

vcf_file = Reader(filename=vcf_filename, compressed=True, encoding='utf-8')
# The first CSV column becomes the index, which vcf2sfs2 matches against VCF sample names.
panel = pd.read_csv(panel_name, sep=',', index_col=0)
sfs, n, non_seg_snps, count, valid_snp_count = vcf2sfs2(vcf_file, panel, chrom, ACKR1_start_hg18, ACKR1_end_hg18)

print('SFS ', sfs)
print('Number of segregating sites = ', sum(sfs))
print('Sample size                 = ', n)
print('SNPs not segregating in sample = ', len(non_seg_snps))

Total SNPs processed        =  61
SNPs without missing data   =  46
SFS  [0 3 0 4 0 4 2 0 1 0 0 0 0 0 1 1 0 0 0 3 0 0 0 0]
Number of segregating sites =  19
Sample size                 =  25
SNPs not segregating in sample =  27


Calculate rho and Tajima's D

In [4]:
# Neutrality statistic from selectiontest.test_neutrality, requested with one million replicates.
rho = selectiontest.test_neutrality(sfs, reps=1_000_000)
print('Rho = ', rho)
# Tajima's D computed from the same SFS.
tajimas_d = selectiontest.calculate_D(sfs)
print('Tajimas D = ', tajimas_d)

Rho =  -1.7842814054136191
Tajimas D =  0.9777780110383649
