## ACKR1 Gene (FY, DARC, Duffy)

For gene details, including GRCh37 coordinates, see 
https://uswest.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000213088;ph=31051;r=1:159203307-159206500

For specific information on the 'Duffy-null allele' rs2814778, see Hodgson et al. at https://royalsocietypublishing.org/doi/full/10.1098/rspb.2014.0930

In [20]:
import os
import numpy as np
import pysam
from vcf import Reader        # https://pypi.org/project/PyVCF/
from Bio import SeqIO
from pyliftover import LiftOver
from cogent3 import make_table
from selectiontest import selectiontest
import gzip, pickle
import pandas as pd


# All data paths used in this notebook ('Data sets/...', 'Google Drive/...',
# 'Downloads/...') are relative, so anchor the session in the home directory.
path = "/Users/helmutsimon/"
if os.getcwd() != path:
    os.chdir(path)

# Record the library version next to the results for reproducibility.
print('selectiontest version: ', selectiontest.__version__)

selectiontest version:  0.3.22


We examine the hypothesis that there is a signal of selection in populations for which the 'Duffy-null allele' rs2814778 segregates. In this cell we compute Tajima's D and $\rho $ for the ACKR1 gene for all populations. Note that the 1KG data uses GRCh37 coordinates.

In [None]:
# Per-population scan of the ACKR1 gene: for every 1KG population, build the
# site frequency spectrum over the gene and compute Tajima's D and rho.
chrom = 1
fname = 'Data sets/1KG variants full/integrated_call_samples_v3.20130502.ALL.panel'
panel_all = pd.read_csv(fname, sep=None, engine='python', skipinitialspace=True, index_col=0)
vcf_filename = 'Data sets/1KG variants full/ALL.chr' + str(chrom) \
                + '.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'
vcf_file = Reader(filename=vcf_filename, compressed=True, encoding='utf-8')
# Sort the population codes so the column order of the results table (and of
# the CSV written below) is reproducible; iterating a bare set of strings
# gives an order that can differ from run to run.
pops = sorted(set(panel_all['pop']))
# GRCh37 coordinates of the ACKR1 gene (the 1KG calls use GRCh37).
gene_start, gene_end = 159173097, 159176290
result_dict = dict()
for pop in pops:
    panel = panel_all[panel_all['pop'] == pop]
    # Query the chromosome the VCF was opened for instead of a literal 1.
    sfs, n, non_seg_snps = selectiontest.vcf2sfs(vcf_file, panel, chrom, gene_start, gene_end)
    tajd = selectiontest.calculate_D(sfs)
    rho = selectiontest.test_neutrality(sfs, reps=200000)
    result_dict[pop] = [tajd, rho]
# The 'rlnt' row holds rho; the label is kept because the selection cell
# further down indexes the table by it.
results = pd.DataFrame(result_dict, index=['tajd', 'rlnt'])
results.to_csv('Google Drive/Genetics/Bayes SFS/Neutrality test/gene_ACKR1_1chrom.csv')
results

Find populations with selection according to $\rho $. All are east Asian, African or African ancestry (Barbados).

In [None]:
# Keep only the populations (columns) whose rho, stored in row 'rlnt',
# exceeds the cut-off.
threshold = 0.5
exceeds = results.loc['rlnt'] > threshold
selection_tab = results.loc[:, exceeds]
select_pops = selection_tab.columns
# Total number of populations versus the number passing the cut-off.
print(len(pops), len(select_pops))
select_pops

Find populations in which the 'Duffy-null allele' rs2814778 occurs.

In [None]:
# Locate rs2814778 in the 1KG calls and list the populations in which at
# least one sample carries a non-reference allele.

# The 1KG VCF uses GRCh37/hg19, so lift the hg38 position over first.
lo = LiftOver('hg38', 'hg19')
rs2814778_hg19loc = lo.convert_coordinate('chr1', 159204893)
print(rs2814778_hg19loc)
if not rs2814778_hg19loc:
    raise LookupError('rs2814778 could not be lifted over from hg38 to hg19')
hg19_pos = rs2814778_hg19loc[0][1]

chrom = 1
vcf_filename = 'Data sets/1KG variants full/ALL.chr' + str(chrom) \
                + '.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'
vcf_file = Reader(filename=vcf_filename, compressed=True, encoding='utf-8')
# Fetch a small window around the lifted position and match on the rsID, so
# the result does not hinge on the exact coordinate convention of the liftover.
snps = vcf_file.fetch(str(chrom), hg19_pos - 5, hg19_pos + 5)
for snp in snps:
    if snp.ID == 'rs2814778':
        break
else:
    # Without this guard `snp` would silently be whichever record came last
    # in the window (or be undefined if the window is empty).
    raise LookupError('rs2814778 not found in ' + vcf_filename)
print(snp.ID, snp)
probands = list()
for proband in snp.samples:
    gt = proband.gt_alleles
    # Allele number 0 is the reference, so a positive sum means this
    # genotype carries at least one non-reference allele.
    if int(gt[0]) + int(gt[1]) > 0:
        probands.append(proband.sample)
fname = 'Data sets/1KG variants full/integrated_call_samples_v3.20130502.ALL.panel'
panel_all = pd.read_csv(fname, sep=None, engine='python', skipinitialspace=True, index_col=0)
# Map the carrier sample IDs to their distinct population codes.
seg_pops = list(set(panel_all.loc[probands]['pop']))
print(len(seg_pops))
seg_pops

What is the relation between the populations undergoing selection and those segregating for the 'Duffy-null allele' rs2814778? We find that rs2814778 occurs in all African populations undergoing selection, but in none of the east Asian populations.

In [None]:
# First line: populations carrying rs2814778 that were not flagged by rho.
# Second line: populations flagged by rho in which rs2814778 does not occur.
segregating, selected = set(seg_pops), set(select_pops)
print(segregating - selected)
print(selected - segregating)

Pool Asian and African populations and attempt to narrow down where the selection signal occurs, looking at 800 bp segments.

In [None]:
def analyse_region(chrom, start_hg19, interval, pops, segments=4, reps=200000):
    """Compute rho and Tajima's D over consecutive windows for pooled populations.

    The samples of all populations in ``pops`` are pooled. Starting at
    ``start_hg19``, the region is cut into ``segments`` adjacent windows of
    ``interval`` bp, and both statistics are computed from the site frequency
    spectrum of each window. One line (window start, rho, Tajima's D) is
    printed per window.

    Parameters
    ----------
    chrom
        Chromosome identifier; selects the 1KG VCF file and the contig queried.
    start_hg19
        GRCh37/hg19 coordinate at which the first window starts.
    interval
        Window length in bp.
    pops
        1KG population codes whose samples are pooled.
    segments
        Number of consecutive windows (default 4).
    reps
        Passed to ``selectiontest.test_neutrality`` as ``reps`` (default
        200000, the value that was previously hard-coded).

    Returns
    -------
    tuple of two lists
        ``(rho_results, tajd_results)``, one entry per window, in window order.
    """
    fname = 'Data sets/1KG variants full/integrated_call_samples_v3.20130502.ALL.panel'
    panel_all = pd.read_csv(fname, sep=None, engine='python', skipinitialspace=True, index_col=0)
    vcf_filename = 'Data sets/1KG variants full/ALL.chr' + str(chrom) \
                    + '.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'
    vcf_file = Reader(filename=vcf_filename, compressed=True, encoding='utf-8')
    panel = panel_all[panel_all['pop'].isin(pops)]
    tajd_results = list()
    rho_results = list()
    for segment in range(segments):
        seg_start = start_hg19 + segment * interval
        seg_end = seg_start + interval
        # Query the chromosome the caller asked for, not a literal 1.
        sfs, n, non_seg_snps = selectiontest.vcf2sfs(vcf_file, panel, chrom, seg_start, seg_end)
        tajd = selectiontest.calculate_D(sfs)
        tajd_results.append(tajd)
        rho = selectiontest.test_neutrality(sfs, reps=reps)
        rho_results.append(rho)
        # Label the progress line with the window start; the earlier version
        # printed a global `pop` left over from another cell.
        print(seg_start, rho, tajd)
    return rho_results, tajd_results
            
# Windowed scan of the ACKR1 gene for pooled African-ancestry and pooled Asian
# populations; one row per window, one rho/Tajima's D column pair per pool.
chrom = 1
start_hg19 = 159173097     # GRCh37 start of the ACKR1 gene region (end is 159176290)
interval = 800             # window length in bp; four windows span the gene region
results = pd.DataFrame()
# Each population code is listed once ('ASW' used to appear twice).
pops = ['YRI', 'LWK', 'GWD', 'MSL', 'ESN', 'ASW', 'ACB']
# Alternative, smaller pool (disabled):
#pops = ['LWK', 'GWD', 'ESN', 'ACB']
rho_results, tajd_results = analyse_region(chrom, start_hg19, interval, pops)
results['afr_rho'] = rho_results
results['afr_tajd'] = tajd_results
print('\n')
pops = ['JPT', 'BEB', 'CHS', 'KHV', 'CDX', 'CHB']
# Alternative, smaller pool (disabled):
#pops = ['JPT', 'CHS',  'CDX']
rho_results, tajd_results = analyse_region(chrom, start_hg19, interval, pops)
results['asia_rho'] = rho_results
results['asia_tajd'] = tajd_results
results

Format the result as a LaTeX table.

In [None]:
# Work on a copy so the raw results table keeps its flat column names.
result = results.copy()
# Two-level header: pooled region on top, statistic underneath.
result.columns = pd.MultiIndex.from_arrays([
    ['Africa', 'Africa', 'Asia', 'Asia'],
    ['rho', 'tajd', 'rho', 'tajd'],
])
# Label each row with the start coordinate of its window.
newix = [str(start_hg19 + interval * i) for i in range(4)]
result.index = newix
print(result)
# Repeat the window starts as a leading data column of the table.
result.insert(loc=0, column='Start of 800-bps segment', value=newix)
t = make_table(
    data_frame=result,
    title="caption",
    header=['a\\b', 'a\\b', 'a\\b', 'a\\b'],
    digits=2,
)
t.write("Downloads/duffy_sep20.tex", label="tab:duffy", justify="lcccc")

Calculate appropriate thresholds for $\rho$.

In [None]:
# Thresholds for the two settings compared here, evaluated in the same order
# as before (second argument 10, then 5).
# NOTE(review): the arguments look like (sample size, number of segregating
# sites) -- confirm against selectiontest.compute_threshold.
thresholds = [selectiontest.compute_threshold(600, second_arg) for second_arg in (10, 5)]
t1, t2 = thresholds
print(t1, t2)

Identify SNPs in east Asian populations.

In [None]:
# Collect the IDs of variants in the second 800 bp window of the gene
# (159173897-159174697, GRCh37) that are carried by at least one sample from
# the selected east Asian populations.
fname = 'Data sets/1KG variants full/integrated_call_samples_v3.20130502.ALL.panel'
panel_all = pd.read_csv(fname, sep=None, engine='python', skipinitialspace=True, index_col=0)
vcf_filename = 'Data sets/1KG variants full/ALL.chr' + str(chrom) \
                + '.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'
vcf_file = Reader(filename=vcf_filename, compressed=True, encoding='utf-8')
pops = ['CDX', 'CHS', 'JPT']
panel = panel_all[panel_all['pop'].isin(pops)]
panel_samples = set(panel.index)
# Query the contig matching the file opened above rather than a literal '1'.
snps = vcf_file.fetch(str(chrom), 159173897, 159174697)
count = 0
seg_snps = list()
for record in snps:
    # `count` tallies every SNP record in the window, whether or not it is
    # carried by the selected samples.
    if record.is_snp:
        count += 1
    # NOTE(review): carriers are looked up for every record, not only those
    # with is_snp set -- confirm that non-SNP variants are meant to be listed.
    # any() stops at the first carrier, so each record contributes its ID at
    # most once instead of once per carrier.
    carried = any(
        int(call.gt_alleles[0]) + int(call.gt_alleles[1]) > 0
        for call in record.samples
        if call.sample in panel_samples
    )
    if carried:
        seg_snps.append(record.ID)

seg_snps_asia = list(set(seg_snps))
print(count)
seg_snps_asia

Identify SNPs in African populations.

In [None]:
# Same scan as for the east Asian sample, now for African-ancestry populations.
# `seg_snps_asia` is deliberately left untouched here: rebinding it to
# `seg_snps` meant that re-running this cell replaced the east Asian IDs with
# the African ones and emptied the comparison that follows.
pops = ['GWD', 'LWK', 'ESN', 'ACB']
panel = panel_all[panel_all['pop'].isin(pops)]
panel_samples = set(panel.index)
# Reuses the VCF reader opened for `chrom` by the east Asian cell.
snps = vcf_file.fetch(str(chrom), 159173897, 159174697)
count = 0
seg_snps = list()
for record in snps:
    # `count` tallies every SNP record in the window, whether or not it is
    # carried by the selected samples.
    if record.is_snp:
        count += 1
    # any() stops at the first carrier, so each record contributes its ID at
    # most once instead of once per carrier.
    carried = any(
        int(call.gt_alleles[0]) + int(call.gt_alleles[1]) > 0
        for call in record.samples
        if call.sample in panel_samples
    )
    if carried:
        seg_snps.append(record.ID)
seg_snps_afr = list(set(seg_snps))
print(count)
print(seg_snps_afr)

In [None]:
# IDs found only in the east Asian sample, then IDs found only in the African sample.
asia_ids, afr_ids = set(seg_snps_asia), set(seg_snps_afr)
print(asia_ids - afr_ids)
print(afr_ids - asia_ids)

We check that the ancestral allele T for rs2814778 is a high-confidence call.

In [None]:
# GRCh37 position of the site, shifted by one because Python indexing is 0-based.
pos37 = 159174683 - 1
anc_filename = 'Data sets/human_ancestor_GRCh37_e59/human_ancestor_1.fa'
flank = 5
for seq_record in SeqIO.parse(anc_filename, "fasta"):
    print(seq_record.id)
    # Ten bases starting five before the site, so the site itself is the
    # sixth base printed.
    # NOTE(review): letter case in this FASTA is understood to encode call
    # confidence -- confirm against the README shipped with the data set.
    window = seq_record.seq[pos37 - flank:pos37 + flank]
    print(window)

Look for a signal of selection in African populations for the gene region chr1:159173097-159176290. We compute the likelihood ratio for two models, M_0 being a selective model as in Hamblin (2000) and M_1 a neutral model. We use an approximation for the sample from M_0, from SFS samples generated by roc_simulation.py.

In [25]:
# Compare a selective model (pre-simulated SFS variates loaded from disk) with
# the neutral model (variates sampled here) on the observed gene SFS.
chrom = 1
fname = 'Data sets/1KG variants full/integrated_call_samples_v3.20130502.ALL.panel'
panel_all = pd.read_csv(fname, sep=None, engine='python', skipinitialspace=True, index_col=0)
vcf_filename = 'Data sets/1KG variants full/ALL.chr' + str(chrom) \
                    + '.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'
vcf_file = Reader(filename=vcf_filename, compressed=True, encoding='utf-8')

# GRCh37 coordinates of the FY (ACKR1) gene.
start = 159173097
end = 159176290
reps = 100000
# `pop` selects the panel rows; `popx` is the tag used in the simulation file name.
for pop, popx in zip(['CHS'], ['CHS']):
    panel = panel_all[panel_all['pop'] == pop]
    n = panel.shape[0]
    fname = '/Users/helmutsimon/Google Drive/Genetics/Software/msms/lib/data/sfs_non_neutral_duffy' + \
                popx + '11.pklz'
    with gzip.open(fname, 'rb') as pickled:
        q0 = pickle.load(pickled)
    # Scale every simulated spectrum so that its entries sum to one.
    row_sums = q0.sum(axis=1)
    variates0 = q0 / row_sums[:, np.newaxis]

    # One neutral variate per row, each of length n - 1.
    q1 = np.empty((reps, n - 1), dtype=float)
    for row_ix, variate in enumerate(selectiontest.sample_wf_distribution(n, reps)):
        q1[row_ix] = variate

    # `chrom` is 1 here, the same value that was previously passed as a literal.
    sfs, n, non_seg_snps = selectiontest.vcf2sfs(vcf_file, panel, chrom, start, end)
    odds_ratio = selectiontest.test_neutrality(sfs, variates0=variates0, variates1=q1)
    # The statistic is printed as returned and as a power of ten.
    # NOTE(review): this suggests it is on a log10 scale -- confirm.
    print(pop, start, n, sum(sfs), "%.4f" % odds_ratio, "%.4f" % (10 ** odds_ratio))

CHS 159173097 105 10 -0.9137 0.1220


In [24]:
# Number of samples the panel file lists for the CDX population.
pop = 'CDX'
fname = 'Data sets/1KG variants full/integrated_call_samples_v3.20130502.ALL.panel'
panel_all = pd.read_csv(fname, sep=None, engine='python', skipinitialspace=True, index_col=0)
panel = panel_all.loc[panel_all['pop'] == pop]
len(panel)

93