# Title

In [4]:
import re, os, sys, pickle, pickle
from pathlib import Path
import numpy
import pandas
from pandas import DataFrame, Series
from sklearn.decomposition import PCA

# my own libraries
from GenomicWindows import window
import GenomicIntervals

numpy.random.seed(7)

In [2]:
# import ipyparallel
# from ipyparallel import depend, require

# # create client & view
# rc = ipyparallel.Client()
# dview = rc[:]
# bview = rc.load_balanced_view() # you can set block=True to have this apply globally

# # scatter 'id', so id=0,1,2 on engines 0,1,2
# dview.scatter('node_id', rc.ids, flatten=True)
# print("Engine IDs: ", dview['node_id'])
# # create a Reference to `id`. This will be a different value on each engine
# ref = ipyparallel.Reference('node_id')

# with dview.sync_imports(): # imports libs across dview
#     import re, os, sys, pickle
#     from pathlib import Path
#     import numpy
#     import pandas
#     from pandas import DataFrame
    
# dview.execute('numpy.random.seed(7)')

# def scatter_func(f):
#     dview.scatter(f.__name__, [f]*len(bview), flatten=True)    

# def getmem():
#     p = subprocess.Popen("echo $HOSTNAME", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#     stdout, stderr = p.communicate()
#     return node_id, stdout.decode().strip(), resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024

# def print_mem():
#     for node_id, host_name, mem in  dview.apply(getmem).get():
#         print("{}{: >10}{: 10.2f} Mb". format(node_id, host_name, mem))

# def parallel_apply(grouped_df, fun):
#     return pandas.concat(dview.map_sync(fun, (group for name, group in grouped_df)))

Plotting setup:

In [5]:
%matplotlib inline

# Make inline plots vector graphics instead of raster graphics
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D

from mpl_toolkits.basemap import Basemap
#matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

import mpld3

import seaborn as sns
sns.set() # sets seaborn default "prettyness:
sns.set_style("whitegrid")
sns.set_context("paper")

# lowess for plotting
from statsmodels.nonparametric.smoothers_lowess import lowess

set1 = {'red': '#e41a1c', 'blue': '#377eb8', 'green': '#4daf4a',
        'purple': '#984ea3', 'orange': '#ff7f00', 
        'yellow': '#ffff33', 'brown': '#a65628'}

solarized = dict(yellow='#b58900', orange='#cb4b16', red='#dc322f', 
                 magenta='#d33682', violet='#6c71c4', blue='#268bd2',
                 cyan='#2aa198', green='#859900')

def get_hex_colors(col_map_name, nr_colors):
    """Return the entries of a matplotlib colormap as hex colour strings.

    Parameters
    ----------
    col_map_name : colormap name handed to ``plt.cm.get_cmap``.
    nr_colors : number of entries requested from the colormap.

    Returns
    -------
    list
        One hex string per colormap entry (``cmap.N`` entries in total).
    """
    palette = plt.cm.get_cmap(col_map_name, nr_colors)
    # Indexing the colormap yields RGBA; only the first three components
    # (RGB) are passed on to the hex conversion.
    return [matplotlib.colors.rgb2hex(palette(idx)[:3]) for idx in range(palette.N)]

hex_colors = get_hex_colors('Paired', 12)

# Split the twelve colours into six consecutive two-element lists.
(blue_pair, green_pair, red_pair,
 orange_pair, purple_pair, brown_pair) = (hex_colors[lo:lo + 2] for lo in range(0, 12, 2))

import random
df = DataFrame({'x': random.sample(range(1, 100), 5), 
                'y': random.sample(range(1, 100), 5),
                'z': [1,0,0,1,0],
                'k': ['male','male','male','female','female'] })

# with sns.color_palette("Set1"):
#     with sns.axes_style('dark'): # whitegrid, ticks
#         with sns.plotting_context('paper', font_scale=1): # paper, notebook, talk”, poster
#             g = sns.lmplot('x', 'y', data=df, hue='z', col='k', fit_reg=False, size=3)

# mpld3.display(ax.fig) # if you want d3

In [None]:
#g = sns.lmplot('x', 'y', data=df, hue='z', col='k', fit_reg=False, size=3)

Ignore deprecation warnings (mainly from seaborn):

In [6]:
# silence deprecation warnings (lots from seaborn)
import warnings
# Ignore the built-in DeprecationWarning category everywhere ...
warnings.filterwarnings("ignore", category=DeprecationWarning) 
# ... and the numpy- and matplotlib-specific deprecation categories. Both are
# looked up as attributes, so they must exist in the installed versions or
# this cell raises AttributeError.
warnings.filterwarnings("ignore", category=numpy.VisibleDeprecationWarning)
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)

### Analysis dirs

In [7]:
# Both base paths are built from $HOME; os.environ['HOME'] raises KeyError if it is unset.
root_dir = Path(os.environ['HOME'], 'simons/faststorage/people/kmt')
meta_data_dir = Path(os.environ['HOME'], 'simons/faststorage/data/metadata')
# Everything below is a sub-directory of root_dir.
steps_dir = root_dir / 'steps'
argweaver_dir = steps_dir / 'argweaver/output'
results_dir = root_dir / 'results'
figures_dir = root_dir / 'figures'
data_dir = root_dir / 'data'
pi_dir = steps_dir / 'pi_stores'
dist_dir = steps_dir / 'dist_stores'
#pi_dir = root_dir / 'old_pi_stores'
male_x_haploid_dir = steps_dir / 'male_x_haploids'

Local code in the scripts dir on the cluster:

In [8]:
scripts_dir = root_dir / 'scripts'
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import simons_meta_data
import hg19_chrom_sizes

from toggle_code_and_errors import toggle_code_html, toggle_errors_html

Import variables global to the entire analysis:

In [9]:
import analysis_globals

## Convenience functions

In [10]:
def silent_nanmean(x):
    """Return ``numpy.nanmean(x)`` with all warnings suppressed during the call.

    The warning filter is only changed inside the ``catch_warnings`` context,
    so the surrounding filter configuration is restored afterwards.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        mean_value = numpy.nanmean(x)
    return mean_value
    
def ident_scalar(s):
    """Return the one distinct value contained in ``s``.

    ``s`` must provide ``.unique()`` (e.g. a pandas Series). An
    AssertionError is raised unless exactly one distinct value is present.
    """
    distinct = s.unique()
    assert len(distinct) == 1
    return distinct[0]

def flatten_column_index(df):
    """Flatten multi-level column labels of ``df`` in place.

    Each column label (a sequence of strings) is joined with ``'_'`` and
    stripped of surrounding whitespace. ``df`` is modified; nothing is returned.
    """
    flat_labels = []
    for levels in df.columns.values:
        flat_labels.append('_'.join(levels).strip())
    df.columns = flat_labels


## Load meta data

In [11]:
# Load individuals, populations and regions through the shared helper so the
# meta data is read the same way everywhere.
individuals, populations, regions = simons_meta_data.get_meta_data(
    meta_data_dir=meta_data_dir)

# Category objects stored under the key 'sr' in the results directory.
pop_categories = pandas.read_hdf(str(results_dir / 'population_categories.store'), 'sr')
region_categories = pandas.read_hdf(str(results_dir / 'region_categories.store'), 'sr')

# Earlier colour order, kept for reference:
# region_colors = dict(zip(list(region_categories), 
#                          ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', 
#                           '#ff7f00', '#ffff33', '#a65628']))

# Colours are paired positionally with the entries of region_categories.
region_colors = {
    region: colour
    for region, colour in zip(
        list(region_categories),
        ['#e41a1c', '#377eb8',  '#984ea3', '#4daf4a',
         '#ff7f00', '#ffff33', '#a65628'])
}

# Chromosome sizes keyed by name with every 'chr' substring removed.
chromosome_lengths = {
    name.replace('chr', ''): length
    for name, length in hg19_chrom_sizes.hg19_chrom_sizes.items()
}

In [12]:
import time
class Timer:
    """Context manager that times its ``with`` block and prints the duration.

    On exit the elapsed time in seconds is stored in ``self.interval`` and
    printed as ``"<minutes> min <seconds> sec"``. Exceptions raised inside the
    block are not suppressed.
    """

    def __enter__(self):
        # time.clock() was removed in Python 3.8. perf_counter() is one of the
        # two documented replacements and measures elapsed time.
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start
        print("{:.0f} min {:.1f} sec".format(self.interval // 60, self.interval % 60))

## Prepare data for Mica's iHH analysis

1. Get region
2. Filter SNPs: remove SNPs whose derived allele is found only in swept individuals, or maybe include only non-African SNPs that are also found in Africans without back flow.

Filter backflow out of genotypes_AF_formated and do inner join with genotypes_nonafricans_formated on SNP position.

3. Filter individuals: 

You may have to do a less stringent sweep calling to get haplotypes that break close to the selected variant. Try cutoff of 1e-4 and winsize 200 or 100 and step 100. 

Read Mica's table, remove excluded populations and extract positions that segregate in Africa

In [50]:
# Long-format African genotypes: one row per (SNP, individual) pair.
african_snps = (pandas.read_table(data_dir / 'mica_data/Formated_genotypes/genotypes_AF_formated')
                .melt(id_vars=['SNP_name', 'CHR', 'Position', 'Ancestral', 'Derived'], 
                      var_name='indiv', value_name='base')
                # Population name = individual id without the 'S_' prefix and '-<n>' suffix.
                # regex=True is spelled out because pandas >= 2.0 defaults to literal
                # matching and rejects a callable replacement in that mode.
                .assign(population=lambda df: df.indiv.str.replace(
                    r'S_(.+)-\d+', lambda m: m.group(1), regex=True))
                # keep only individuals present in the meta data
                .loc[lambda df: df.indiv.isin(individuals)]
                .groupby(['Position'])
                .filter(lambda df: len(df.base.unique()) == 2) # only positions that are SNPs in Africa
                # reset_index() without drop keeps the old row labels as an 'index' column
                .reset_index()
                .assign(group='African')
               )
african_snps.head()

Unnamed: 0,index,SNP_name,CHR,Position,Ancestral,Derived,indiv,base,population,group
0,2,X:X_400108,1,400108,G,A,S_BantuHerero-1,G,BantuHerero,African
1,3,X:X_400132,1,400132,G,T,S_BantuHerero-1,G,BantuHerero,African
2,5,X:X_400303,1,400303,G,C,S_BantuHerero-1,G,BantuHerero,African
3,6,X:X_401564,1,401564,C,T,S_BantuHerero-1,T,BantuHerero,African
4,8,X:X_401651,1,401651,G,C,S_BantuHerero-1,G,BantuHerero,African


In [74]:
# Wide non-African genotype table (one column per individual).
# reset_index(drop=True) discards whatever index read_table produced and
# replaces it with a fresh 0..n-1 range.
non_african_data = (pandas.read_table(data_dir / 'mica_data/Formated_genotypes/genotypes_nonafricans_formated')
                    .reset_index(drop=True)
                   )

**HACK: Mica did not add the position column to the non-African data, so we add it from the WE frame...**

In [75]:
# The WE table is read only to borrow its Position column.
we_data = (pandas.read_table(data_dir / 'mica_data/Formated_genotypes/genotypes_WE_formated')
           .reset_index(drop=True)
          )
# Positions are attached purely by row order (both frames carry a 0..n-1 index),
# so the two tables must have the same number of rows. Without this check a
# mismatch would be hidden by index alignment (missing or dropped positions).
assert len(we_data) == len(non_african_data), \
    "WE and non-African genotype tables differ in length; cannot borrow Position by row order"
non_african_data['Position'] = we_data.Position
non_african_data.head()

Unnamed: 0,S_Abkhasian-1,S_Abkhasian-2,S_Adygei-1,S_Aleut-1,S_Ami-1,S_Ami-2,S_Armenian-1,S_Armenian-2,S_Balochi-1,S_Balochi-2,...,S_Xibo-1,S_Xibo-2,S_Yadava-1,S_Yadava-2,S_Yakut-2,S_Yemenite_Jew-2,S_Yi-1,S_Zapotec-1,S_Zapotec-2,Position
0,A,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,365712
1,T,T,T,T,T,T,T,T,T,T,...,T,T,T,T,T,T,T,T,T,400000
2,A,A,A,A,A,G,A,A,A,A,...,A,G,A,G,A,A,A,A,A,400108
3,T,T,T,T,T,T,T,T,T,T,...,T,T,T,T,T,T,T,T,T,400132
4,A,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,400148


In [108]:
# Long-format non-African genotypes: one row per (position, individual) pair.
non_african_snps = (non_african_data
                    .melt(id_vars=['Position'], var_name='indiv', value_name='base')
                    # Population name = individual id without the 'S_' prefix and '-<n>' suffix.
                    # regex=True is spelled out because pandas >= 2.0 defaults to literal
                    # matching and rejects a callable replacement in that mode.
                    .assign(population=lambda df: df.indiv.str.replace(
                        r'S_(.+)-\d+', lambda m: m.group(1), regex=True))
                    # keep only individuals present in the meta data
                    .loc[lambda df: df.indiv.isin(individuals)]
                    .assign(group='non-African')
                   )
non_african_snps.head()

Unnamed: 0,Position,indiv,base,population,group
0,365712,S_Abkhasian-1,A,Abkhasian,non-African
1,400000,S_Abkhasian-1,T,Abkhasian,non-African
2,400108,S_Abkhasian-1,A,Abkhasian,non-African
3,400132,S_Abkhasian-1,T,Abkhasian,non-African
4,400148,S_Abkhasian-1,A,Abkhasian,non-African


In [109]:
# Back to wide format: one row per position, one column per individual.
afr_df = african_snps.pivot(index='Position', columns='indiv', values='base').reset_index()
# pivot() labels the *column* index with the name of the pivoted column ('indiv').
# Clear it by assignment: `del afr_df.columns.name` raises AttributeError where
# Index.name is a property without a deleter.
afr_df.columns.name = None

In [110]:
# Back to wide format: one row per position, one column per individual.
non_afr_df = non_african_snps.pivot(index='Position', columns='indiv', values='base').reset_index()
# pivot() labels the *column* index with the name of the pivoted column ('indiv').
# Clear it by assignment: `del non_afr_df.columns.name` raises AttributeError where
# Index.name is a property without a deleter.
non_afr_df.columns.name = None

In [111]:
# Number of rows (positions) in the African wide table.
afr_df.loc[:, ['Position']].shape[0]

224822

In [112]:
# Keep only the non-African rows whose Position also occurs in the African table.
# Only the Position column is taken from afr_df, so the inner join acts as a
# filter and adds no African genotype columns.
df = non_afr_df.merge(afr_df.loc[:, ['Position']], on=['Position'], how='inner')
df.head()

Unnamed: 0,Position,S_Abkhasian-1,S_Abkhasian-2,S_Adygei-1,S_Aleut-1,S_Ami-1,S_Ami-2,S_Armenian-1,S_Armenian-2,S_Balochi-1,...,S_Uygur-2,S_Xibo-1,S_Xibo-2,S_Yadava-1,S_Yadava-2,S_Yakut-2,S_Yemenite_Jew-2,S_Yi-1,S_Zapotec-1,S_Zapotec-2
0,400108,A,A,A,A,A,G,A,A,A,...,A,A,G,A,G,A,A,A,A,A
1,400132,T,T,T,T,T,T,T,T,T,...,T,T,T,T,T,T,T,T,T,T
2,400303,C,G,G,C,G,C,G,G,C,...,G,G,C,G,C,G,G,C,C,C
3,401564,T,T,T,T,T,T,T,T,T,...,T,T,T,T,T,T,T,T,T,T
4,401651,C,C,G,C,C,C,C,C,C,...,C,G,C,C,C,G,G,C,C,C


In [113]:
# Rows whose Position lies in the half-open window [start, end).
start, end = 100000000, 101000000
df[(start <= df.Position) & (df.Position < end)]


Unnamed: 0,Position,S_Abkhasian-1,S_Abkhasian-2,S_Adygei-1,S_Aleut-1,S_Ami-1,S_Ami-2,S_Armenian-1,S_Armenian-2,S_Balochi-1,...,S_Uygur-2,S_Xibo-1,S_Xibo-2,S_Yadava-1,S_Yadava-2,S_Yakut-2,S_Yemenite_Jew-2,S_Yi-1,S_Zapotec-1,S_Zapotec-2
140066,100000115,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G
140067,100000328,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
140068,100000411,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
140069,100000464,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
140070,100000500,T,T,A,A,A,T,A,T,A,...,T,A,A,T,T,T,T,T,A,T
140071,100000719,G,G,A,A,A,G,A,G,A,...,G,A,A,G,G,G,G,G,A,G
140072,100001336,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
140073,100001407,C,C,C,C,C,C,C,C,C,...,C,C,C,C,C,C,C,C,C,C
140074,100003042,T,T,T,T,T,T,T,T,T,...,T,T,T,T,T,T,T,T,T,T
140075,100003223,C,C,C,C,C,C,C,C,C,...,C,C,C,C,C,C,C,C,C,C
