## Validate genes on chromosome 2q11.1 with demography

Note that the cited HapMap project uses human reference genome assembly version 34.3 (see the Supplementary Information referenced above), which is equivalent to UCSC version hg16 (https://en.wikipedia.org/wiki/Reference_genome#Human_reference_genome).

In [1]:
import os
import numpy as np
import pandas as pd
from selectiontest import selectiontest
import pysam
from vcf import Reader        # https://pypi.org/project/PyVCF/
from pyliftover import LiftOver

In [2]:
def test_segment(vcf_file, panel, chrom, start, end, reps=10000):
    """Print Tajima's D and rho for one genomic segment.

    The segment endpoints are given in hg38 coordinates and are lifted over
    to hg19 before `vcf_file` is queried. The neutrality test is run with the
    default settings of `selectiontest.test_neutrality` (no custom variates).

    Args:
        vcf_file: VCF reader, passed unchanged to `selectiontest.vcf2sfs`.
        panel: sample panel, passed unchanged to `selectiontest.vcf2sfs`.
        chrom: chromosome identifier without the 'chr' prefix (e.g. 2); it is
            used both for the liftover and for the VCF query.
        start: segment start position (hg38).
        end: segment end position (hg38).
        reps: forwarded to `selectiontest.test_neutrality`.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if either endpoint has no hg19 equivalent.
    """
    lo = LiftOver('hg38', 'hg19')
    # Derive the UCSC-style name from `chrom` instead of hard-coding 'chr2',
    # so the liftover and the VCF query always refer to the same chromosome.
    ucsc_chrom = 'chr' + str(chrom)
    hg19_positions = []
    for label, position in (('start', start), ('end', end)):
        hits = lo.convert_coordinate(ucsc_chrom, position)
        # pyliftover is documented to return None (unknown chromosome) or an
        # empty list (locus absent in the target assembly) when it cannot map
        # a position; report that clearly rather than failing on indexing.
        if not hits:
            raise ValueError('Cannot lift over %s position %s:%s from hg38 to hg19'
                             % (label, ucsc_chrom, position))
        # First hit, second field: the converted position.
        hg19_positions.append(hits[0][1])
    start_hg19, end_hg19 = hg19_positions
    print('hg19 (1KG) coordinates: ', start_hg19, end_hg19, "(%d)" % (end_hg19 - start_hg19))
    # The third return value is not needed here.
    sfs, n, _ = selectiontest.vcf2sfs(vcf_file, panel, str(chrom), start_hg19, end_hg19, select_chr=True)
    print('Sample size       =', n)
    print('Segregating sites =', np.sum(sfs))
    tajd = selectiontest.calculate_D(sfs)
    print('Tajimas D         =', tajd)
    rho = selectiontest.test_neutrality(sfs, reps=reps)
    print('\u03C1                 =', rho)
    
def test_segment_demog(vcf_file, panel, chrom, start, end, reps=10000):
    """Print Tajima's D and rho for one segment under a fixed demographic history.

    Works like the plain segment test, except that the neutrality test is
    supplied with variates drawn from a piecewise-constant population-size
    history (hard-coded below) instead of the default ones.

    Args:
        vcf_file: VCF reader, passed unchanged to `selectiontest.vcf2sfs`.
        panel: sample panel, passed unchanged to `selectiontest.vcf2sfs`.
        chrom: chromosome identifier without the 'chr' prefix (e.g. 2); it is
            used both for the liftover and for the VCF query.
        start: segment start position (hg38).
        end: segment end position (hg38).
        reps: number of variates generated and forwarded to
            `selectiontest.test_neutrality`.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if either endpoint has no hg19 equivalent.
    """
    print('Out of Africa demographic history.')
    lo = LiftOver('hg38', 'hg19')
    # Derive the UCSC-style name from `chrom` instead of hard-coding 'chr2',
    # so the liftover and the VCF query always refer to the same chromosome.
    ucsc_chrom = 'chr' + str(chrom)
    hg19_positions = []
    for label, position in (('start', start), ('end', end)):
        hits = lo.convert_coordinate(ucsc_chrom, position)
        # pyliftover is documented to return None (unknown chromosome) or an
        # empty list (locus absent in the target assembly) when it cannot map
        # a position; report that clearly rather than failing on indexing.
        if not hits:
            raise ValueError('Cannot lift over %s position %s:%s from hg38 to hg19'
                             % (label, ucsc_chrom, position))
        # First hit, second field: the converted position.
        hg19_positions.append(hits[0][1])
    start_hg19, end_hg19 = hg19_positions
    print('hg19 (1KG) coordinates: ', start_hg19, end_hg19, "(%d)" % (end_hg19 - start_hg19))
    # The third return value is not needed here.
    sfs, n, _ = selectiontest.vcf2sfs(vcf_file, panel, str(chrom), start_hg19, end_hg19, select_chr=True)
    print('Sample size       =', n)
    print('Segregating sites =', np.sum(sfs))
    tajd = selectiontest.calculate_D(sfs)
    print('Tajimas D         =', tajd)

    # Piecewise-constant history: one population size per timepoint.
    # NOTE(review): units of `timepoints` (presumably generations) are not
    # visible here -- confirm against the selectiontest documentation.
    pop_sizes = [6.6e3, 3.3e3, 1e4]
    timepoints = [0, 500, 1500]
    # One row of n - 1 values per replicate.
    variates0 = np.empty((reps, n - 1), dtype=float)
    for i, y in enumerate(selectiontest.piecewise_constant_variates(n, timepoints, pop_sizes, reps)):
        # Only the first element of each yielded item is used.
        variates0[i] = y[0]

    rho = selectiontest.test_neutrality(sfs, variates0=variates0, reps=reps)
    print('\u03C1                 =', rho)

In [3]:
# Work from a fixed base directory so the relative data paths below resolve.
path = "/Users/helmutsimon/"
if os.getcwd() != path:
    os.chdir(path)

chrom = 2

# Sample panel; sep=None with the python engine lets pandas detect the
# delimiter, and the first column becomes the index.
fname = 'Data sets/1KG variants full/integrated_call_samples_v3.20130502.ALL.panel'
panel_all = pd.read_csv(
    fname,
    sep=None,
    engine='python',
    skipinitialspace=True,
    index_col=0,
)

# Compressed genotype VCF for the chosen chromosome.
vcf_filename = (
    'Data sets/1KG variants full/ALL.chr'
    + str(chrom)
    + '.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'
)
vcf_file = Reader(filename=vcf_filename, compressed=True, encoding='utf-8')

# Population code lists; all_pops is the first list followed by the second.
afr_pops = ['YRI', 'LWK', 'GWD', 'MSL', 'ESN', 'ASW', 'ACB']
eur_pops = ['CEU', 'TSI', 'FIN', 'GBR', 'IBS']
all_pops = afr_pops + eur_pops

# Replicate count passed to the segment tests in the cells below.
reps = 100000

STARD7 Sub-region A, Intron 1 for population GWD

In [4]:
# GRCh38 coordinates obtained from Ensembl.
start, end = 96195549, 96208144
# Keep only the panel rows whose 'pop' column is GWD.
panel = panel_all.loc[panel_all['pop'] == 'GWD']
test_segment(vcf_file, panel, chrom, start, end, reps)

hg19 (1KG) coordinates:  96861287 96873882 (12595)
Sample size       = 113
Segregating sites = 29
Tajimas D         = -2.454852459684053
ρ                 = 4.287532650676189


STARD7 Sub-region A, Intron 1 for population CEU

In [5]:
# GRCh38 coordinates obtained from Ensembl.
start, end = 96195549, 96208144
# Keep only the panel rows whose 'pop' column is CEU.
panel = panel_all.loc[panel_all['pop'] == 'CEU']
test_segment_demog(vcf_file, panel, chrom, start, end, reps)

Out of Africa demographic history.
hg19 (1KG) coordinates:  96861287 96873882 (12595)
Sample size       = 99
Segregating sites = 11
Tajimas D         = -1.5004812594200159
ρ                 = 0.5772840075699692


TMEM127 Sub-region B, for population GWD

In [6]:
# Segment endpoints; test_segment lifts them over from hg38 to hg19.
start, end = 96248515, 96265994
# Keep only the panel rows whose 'pop' column is GWD.
panel = panel_all.loc[panel_all['pop'] == 'GWD']
test_segment(vcf_file, panel, chrom, start, end, reps)

hg19 (1KG) coordinates:  96914253 96931732 (17479)
Sample size       = 113
Segregating sites = 40
Tajimas D         = -2.3997298507032325
ρ                 = 3.8998338365862217


STARD7-AS1 Sub-region B, for population GWD

In [7]:
# Segment endpoints; test_segment lifts them over from hg38 to hg19.
start, end = 96208415, 96242621
# Keep only the panel rows whose 'pop' column is GWD.
panel = panel_all.loc[panel_all['pop'] == 'GWD']
test_segment(vcf_file, panel, chrom, start, end, reps)

hg19 (1KG) coordinates:  96874153 96908359 (34206)
Sample size       = 113
Segregating sites = 110
Tajimas D         = -2.5665053056743026
ρ                 = 10.244592320492602


TMEM127 Sub-region B, for population CEU

In [8]:
# Segment endpoints; test_segment_demog lifts them over from hg38 to hg19.
start, end = 96248515, 96265994
# Keep only the panel rows whose 'pop' column is CEU.
panel = panel_all.loc[panel_all['pop'] == 'CEU']
test_segment_demog(vcf_file, panel, chrom, start, end, reps)

Out of Africa demographic history.
hg19 (1KG) coordinates:  96914253 96931732 (17479)
Sample size       = 99
Segregating sites = 21
Tajimas D         = -1.897383518956373
ρ                 = 2.915716762685685


STARD7-AS1 Sub-region B, for population CEU

In [9]:
# Segment endpoints; test_segment_demog lifts them over from hg38 to hg19.
start, end = 96208415, 96242621
# Keep only the panel rows whose 'pop' column is CEU.
panel = panel_all.loc[panel_all['pop'] == 'CEU']
test_segment_demog(vcf_file, panel, chrom, start, end, reps)

Out of Africa demographic history.
hg19 (1KG) coordinates:  96874153 96908359 (34206)
Sample size       = 99
Segregating sites = 40
Tajimas D         = -1.3086581806809876
ρ                 = -3.7133636532775434


ENSR00000120257 promoter Sub-region C, for population GWD

In [10]:
# Segment endpoints; test_segment lifts them over from hg38 to hg19.
start, end = 96320000, 96322601
# Keep only the panel rows whose 'pop' column is GWD.
panel = panel_all.loc[panel_all['pop'] == 'GWD']
test_segment(vcf_file, panel, chrom, start, end, reps)

hg19 (1KG) coordinates:  96985738 96988339 (2601)
Sample size       = 113
Segregating sites = 9
Tajimas D         = -1.9802965518807067
ρ                 = 2.012313816310838


ENSR00000120257 promoter Sub-region C, for population CEU

In [11]:
# Segment endpoints; test_segment_demog lifts them over from hg38 to hg19.
start, end = 96320000, 96322601
# Keep only the panel rows whose 'pop' column is CEU.
panel = panel_all.loc[panel_all['pop'] == 'CEU']
test_segment_demog(vcf_file, panel, chrom, start, end, reps)

Out of Africa demographic history.
hg19 (1KG) coordinates:  96985738 96988339 (2601)
Sample size       = 99
Segregating sites = 2
Tajimas D         = 0.2382482333571114
ρ                 = -0.3312400646377731


NEURL3 Sub-region D, for population GWD

In [12]:
# Segment endpoints; test_segment lifts them over from hg38 to hg19.
start, end = 96497642, 96508109
# Keep only the panel rows whose 'pop' column is GWD.
panel = panel_all.loc[panel_all['pop'] == 'GWD']
test_segment(vcf_file, panel, chrom, start, end, reps)

hg19 (1KG) coordinates:  97163379 97173846 (10467)
Sample size       = 113
Segregating sites = 30
Tajimas D         = -1.6974948585884426
ρ                 = 1.9998538910896837


NEURL3 Sub-region D, for population CEU

In [13]:
# Segment endpoints; test_segment_demog lifts them over from hg38 to hg19.
start, end = 96497642, 96508109
# Keep only the panel rows whose 'pop' column is CEU.
panel = panel_all.loc[panel_all['pop'] == 'CEU']
test_segment_demog(vcf_file, panel, chrom, start, end, reps)

Out of Africa demographic history.
hg19 (1KG) coordinates:  97163379 97173846 (10467)
Sample size       = 99
Segregating sites = 14
Tajimas D         = -1.8050690808287446
ρ                 = 1.9864976685825502
