In [1]:
import re, os, sys, pickle
from pathlib import Path
import numpy
import pandas
from pandas import DataFrame

# silence deprecation warnings (lots from seaborn)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=numpy.VisibleDeprecationWarning)

numpy.random.seed(7)

### Analysis dirs

In [2]:
# Project root and shared meta data location, both below $HOME
root_dir = Path(os.environ['HOME']).joinpath('simons/faststorage/people/kmt')
meta_data_dir = Path(os.environ['HOME']).joinpath('simons/faststorage/data/metadata')

# Folders directly below the project root
steps_dir = root_dir.joinpath('steps')
results_dir = root_dir.joinpath('results')
figures_dir = root_dir.joinpath('figures')
data_dir = root_dir.joinpath('data')

# Folders below the steps directory
argweaver_dir = steps_dir.joinpath('argweaver/output')
pi_dir = steps_dir.joinpath('pi_stores')
# (disabled alternative: pi_dir = root_dir / 'old_pi_stores')
dist_dir = steps_dir.joinpath('dist_stores')
male_x_haploid_dir = steps_dir.joinpath('male_x_haploids')

### Local code in the scripts dir on the cluster

In [3]:
# Make the project's own modules importable; the path is added only once so
# that re-running the cell does not keep growing sys.path.
scripts_dir = root_dir.joinpath('scripts')
if sys.path.count(str(scripts_dir)) == 0:
    sys.path.append(str(scripts_dir))

import simons_meta_data
import hg19_chrom_sizes


from ChromosomeWindows import window
import genominterv

### Plotting setup

In [4]:
%matplotlib inline

# Request the 'retina' and 'png' formats for inline figures.
# NOTE(review): both are raster formats, so this does not give vector graphics;
# 'svg' or 'pdf' would be needed for that — confirm which one is intended.
# NOTE(review): newer IPython releases deprecate importing
# set_matplotlib_formats from IPython.display — verify against the installed version.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina', 'png')

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
from mpl_toolkits.basemap import Basemap
#matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

import mpld3

import seaborn as sns
# Seaborn appearance used for all figures: "whitegrid" style, "paper" context.
sns.set_style("whitegrid")
sns.set_context("paper")

# lowess for plotting
from statsmodels.nonparametric.smoothers_lowess import lowess

# Load meta data

In [5]:
# Read the meta data through the shared helper so it is loaded the same way everywhere
individuals, populations, regions = simons_meta_data.get_meta_data(meta_data_dir=meta_data_dir)

# Chromosome lengths keyed by chromosome name with the 'chr' substring removed
chromosome_lengths = {
    chrom_name.replace('chr', ''): length
    for chrom_name, length in hg19_chrom_sizes.hg19_chrom_sizes.items()
}

Load sweep data:

In [6]:
# Each table is stored per chromosome (chrX and chr7); read both files and stack them.
abs_low_region_df = pandas.concat([
    pandas.read_hdf(results_dir / file_name)
    for file_name in ('abs_low_region_chrX_df.hdf', 'abs_low_region_chr7_df.hdf')
])
rel_low_region_df = pandas.concat([
    pandas.read_hdf(results_dir / file_name)
    for file_name in ('rel_low_region_chrX_df.hdf', 'rel_low_region_chr7_df.hdf')
])

Load minimum run length of sweep windows set in sweep calling:

In [7]:
# NOTE: pickle.load can execute arbitrary code; only read pickles written by this project.
with (results_dir / 'min_run_length.pkl').open('rb') as pickle_file:
    min_run_length = pickle.load(pickle_file)

Load auxiliary data:

In [8]:
def _read_result(file_name):
    """Read one HDF table stored under ``results_dir``."""
    return pandas.read_hdf(results_dir / file_name)


# Low ILS regions on chrX (human-chimp and human-orang)
human_chimp_low_ils_regions_chrX = _read_result('human_chimp_low_ils_regions_chrX.hdf')
human_orang_low_ils_regions_chrX = _read_result('human_orang_low_ils_regions_chrX.hdf')

# Other annotation tables compared against the low pi regions
ampliconic_regions = _read_result('ampliconic_regions.hdf')
akey_regions = _read_result('akey_regions.hdf')
admix_windows_chrX = _read_result('admix_windows_chrX.hdf')
refseq_genes = _read_result('refseq_genes.hdf')

# Overlap to _absolute_ low pi regions on chrX

### Merged low pi regions

Union of regions across populations

In [11]:
# Keep chrX rows whose run reaches the minimum run length, then collapse them
merged_abs_low_regions_chrX = genominterv.interval_collapse(
    abs_low_region_df.loc[lambda df: (df.chrom == 'X') & (df.run >= min_run_length)]
)

### Overlap with low ILS regions

In [14]:
@genominterv.bootstrap('hg19', samples=1000)
def jaccard_test(query, annot):
    """Jaccard statistic between two interval sets, wrapped for bootstrapping.

    The undecorated function only forwards both arguments to
    ``genominterv.jaccard``. It is decorated with ``genominterv.bootstrap``
    configured with 'hg19' and 1000 samples; the calls in this notebook display
    a two-element tuple, presumably (statistic, p-value).

    Parameters
    ----------
    query, annot
        The two interval sets to compare. Callers in this notebook pass data
        frames whose ``chrom`` values have been given a 'chr' prefix.
    """
    statistic = genominterv.jaccard(query, annot)
    return statistic

# Both inputs get 'chr'-prefixed chromosome names before the test
# (presumably to match the 'hg19' naming used by jaccard_test — confirm).
jaccard_test(
    merged_abs_low_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    human_chimp_low_ils_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
)

(0.3271537622682661, 0.002)

### Overlap with ampliconic regions

In [15]:
# Compare against the chrX subset of the ampliconic regions; both inputs get
# 'chr'-prefixed chromosome names first.
jaccard_test(
    merged_abs_low_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    ampliconic_regions
        .loc[ampliconic_regions.chrom == 'X']
        .assign(chrom=lambda regions: 'chr' + regions['chrom']),
)

(0.008155194631341988, 0.999)

### Non-random distance to ampliconic regions not overlapping low pi regions

Are ampliconic regions not overlapping low pi regions closer to the low pi regions than expected?

In [31]:
# chrX ampliconic regions with the merged absolute low pi regions taken out
ampl_reg_not_overlapping = genominterv.interval_diff(
    ampliconic_regions.loc[ampliconic_regions.chrom == 'X'],
    merged_abs_low_regions_chrX,
)
# Proximity of what is left to the merged low pi regions
genominterv.proximity_test(
    ampl_reg_not_overlapping,
    merged_abs_low_regions_chrX.assign(chrom='X'),
)

TestResult(statistic=0.6292727272727274, pvalue=0.0)

# Overlap to _absolute_ low pi regions on chr7

### Merged _absolute_ low pi regions

Union of regions across populations

In [19]:
# Keep chr7 rows whose run reaches the minimum run length, then collapse them
merged_abs_low_regions_chr7 = genominterv.interval_collapse(
    abs_low_region_df.loc[lambda df: (df.chrom == '7') & (df.run >= min_run_length)]
)

### Overlap between merged _absolute_ low pi regions and Akey regions

Probably heavily confounded with gene density

In [20]:
# Compare against the chr7 subset of the Akey regions; both inputs get
# 'chr'-prefixed chromosome names first.
jaccard_test(
    merged_abs_low_regions_chr7.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    akey_regions
        .loc[akey_regions.chrom == '7']
        .assign(chrom=lambda regions: 'chr' + regions['chrom']),
)

p-value is zero smaller than 0.001. Increase nr samples to get actual p-value.


(0.18945738685033495, 0.0)

### Overlap between merged _absolute_ low pi regions and refseq genes

In [21]:
# Collapse the chr7 refseq gene intervals before comparing
merged_refseq_genes_chr7 = genominterv.interval_collapse(
    refseq_genes.loc[lambda genes: genes.chrom == '7']
)

# Both inputs get 'chr'-prefixed chromosome names before the test
jaccard_test(
    merged_abs_low_regions_chr7.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    merged_refseq_genes_chr7.assign(chrom=lambda regions: 'chr' + regions['chrom']),
)

(0.17108541467334537, 0.019)

It seems that the regions called on chr7 are indeed subject to sweeps.

# Overlap to _relative_ low pi regions on chrX

### Merged low pi regions

Union of regions across populations

In [23]:
# Keep chrX rows whose run reaches the minimum run length, then collapse them
merged_rel_low_regions_chrX = genominterv.interval_collapse(
    rel_low_region_df.loc[lambda df: (df.chrom == 'X') & (df.run >= min_run_length)]
)

### Overlap with low ILS regions

In [26]:
# Both inputs get 'chr'-prefixed chromosome names before the test
jaccard_test(
    merged_rel_low_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    human_chimp_low_ils_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
)

(0.28060522696011003, 0.016)

### Overlap with ampliconic regions

In [28]:
# Compare against the chrX subset of the ampliconic regions; both inputs get
# 'chr'-prefixed chromosome names first.
jaccard_test(
    merged_rel_low_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    ampliconic_regions
        .loc[ampliconic_regions.chrom == 'X']
        .assign(chrom=lambda regions: 'chr' + regions['chrom']),
)

(0.0048858724997627315, 0.998)

### Non-random distance to ampliconic regions not overlapping low pi regions

Are ampliconic regions not overlapping low pi regions closer to the low pi regions than expected?

In [30]:
# chrX ampliconic regions with the merged relative low pi regions taken out
ampl_reg_not_overlapping = genominterv.interval_diff(
    ampliconic_regions.loc[ampliconic_regions.chrom == 'X'],
    merged_rel_low_regions_chrX,
)
# Proximity of what is left to the merged low pi regions
genominterv.proximity_test(
    ampl_reg_not_overlapping,
    merged_rel_low_regions_chrX.assign(chrom='X'),
)

TestResult(statistic=0.5874400000000005, pvalue=0.0)

# Overlap of low ILS regions on chrX

### Between human-chimp and human-orang low ILS regions

In [None]:
# Both inputs get 'chr'-prefixed chromosome names before the test
jaccard_test(
    human_chimp_low_ils_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    human_orang_low_ils_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
)

### Between ampliconic regions and low ILS regions on chrX

In [None]:
# Compare against the chrX subset of the ampliconic regions; both inputs get
# 'chr'-prefixed chromosome names first.
jaccard_test(
    human_chimp_low_ils_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    ampliconic_regions
        .loc[ampliconic_regions.chrom == 'X']
        .assign(chrom=lambda regions: 'chr' + regions['chrom']),
)

In [None]:
# Compare against the chrX subset of the ampliconic regions; both inputs get
# 'chr'-prefixed chromosome names first.
jaccard_test(
    human_orang_low_ils_regions_chrX.assign(chrom=lambda regions: 'chr' + regions['chrom']),
    ampliconic_regions
        .loc[ampliconic_regions.chrom == 'X']
        .assign(chrom=lambda regions: 'chr' + regions['chrom']),
)