In [0]:
from __future__ import division

import os
import os.path as op
import random
from collections import OrderedDict,Counter
from operator import itemgetter
from os import listdir as ls

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import vcf
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline

In [0]:
# This file was made in 6_pca.ipynb and is symlinked into the hierfstat folder.
# Genotypes are coded based on the minor allele.
hierf = '/home/lindb/wbp/hierfstat/imputed/imputed_z12_maf_swp_trans.txt'

In [0]:
# Tab-separated table: the first row is the header, the first column becomes the row index.
hierfdf = pd.read_csv(hierf,header=0,index_col=0,sep="\t")

In [0]:
hierfdf.head()

In [0]:
# Same table written without the row index; the R script below reads this *_NOIDX.txt file.
hierfdf.to_csv('/home/lindb/wbp/hierfstat/imputed/imputed_z12_maf_swp_trans_NOIDX.txt',header=True,index=False,sep="\t")

# Run the hierfstat analysis in R

```R

setwd("/home/lindb/wbp/hierfstat/imputed/")
get_varcomp <- function(x)
{
    # Variance components for one locus.
    #   x: the values of a single locus column, as handed over by parCapply().
    # Reads `levels` as a free variable, so it must exist wherever this runs
    # (the driver script exports it to the cluster workers).
    # The last expression is an assignment, so the `$overall` element of
    # varcomp() is returned invisibly.
    library(hierfstat)  # attach the package in whichever process evaluates this
    locus_df <- data.frame(x)
    hier_table <- cbind(levels, locus_df)
    res <- varcomp(hier_table, diploid=TRUE)$overall
}

# Combine per-locus variance components into overall components and a matrix
# of ratios built from them.
#   m: numeric matrix with one row per locus (rows are labelled with `lnames`)
#      and one column per component.
# Reads the globals `lnames` and `fnames`, which the driver script sets.
# Returns list(loc = labelled m, overall = column sums, F = ratio matrix).
finish_varcomp = function(m) 
{
    # Sum each component over all loci, ignoring NA entries.
    tot <- apply(m, 2, sum, na.rm = TRUE)
    nblevels <- length(tot)
    # Square (nblevels - 1) matrix, zero-filled; only entries with j >= i are set below.
    f <- matrix(rep(0, (nblevels - 1)^2), ncol = (nblevels - 1))
    for (i in 1:(nblevels - 1)) 
    {
        for (j in i:(nblevels - 1)) 
        {
            # Components i..j as a fraction of components i..last.
            f[i, j] <- sum(tot[i:j])/sum(tot[i:nblevels])
        }
    }
    row.names(m) <- lnames
    # Prints the names attached to the summed components (NULL when m has no column names).
    print(names(tot))
    # Labelling the rows of the transpose and transposing back names the
    # columns of f with `fnames`.
    tf <- t(f)
    row.names(tf) <- fnames
    f <- t(tf)
    # Rows are labelled "Total" followed by every entry of `fnames` except the last.
    row.names(f) <- c("Total", fnames[-length(fnames)])
    return(list(loc = m, overall = tot, F = f))
}


library(hierfstat)
library(data.table)
library(snow)

# Table written by the notebook: the first two columns are used as the
# hierarchy levels, every remaining column is treated as one locus.
data = data.frame(fread("imputed_z12_maf_swp_trans_NOIDX.txt", header=TRUE, sep="\t"))
levels = data.frame(data[,1:2])
loci = data[,3:ncol(data)]
lnames=names(loci)              # locus names; finish_varcomp() uses them as row labels
fnames=c(names(levels), "Ind")  # level names plus "Ind"; finish_varcomp() uses them as labels

# One get_varcomp() call per locus column, spread over 50 socket workers.
# The workers need `levels`, which get_varcomp() reads as a free variable.
cl = makeSOCKcluster(50)
clusterExport(cl, "levels", envir=environment())
# Stop the workers as soon as the parallel step ends, even if it fails;
# nothing after this point uses the cluster.
tryCatch(
    system.time(res <- matrix(parCapply(cl, loci, get_varcomp), nrow=length(names(loci)), byrow=TRUE)),
    finally = stopCluster(cl)
)
res = finish_varcomp(res)
saveRDS(res, "hierfstatRUN_imputed.rds")

# NOTE(review): `data` still contains both level columns here - confirm that
# basic.stats() is meant to receive the second level column along with the loci.
system.time(bs <- basic.stats(data))
saveRDS(bs, "bs_hierfstatRUN_imputed.rds")
print("Done!")


```

In [0]:
def setup_r():
    """Point the process environment at the R installation used by this notebook.

    Sets ``R_HOME`` and puts ``<R_HOME>/lib`` at the front of
    ``LD_LIBRARY_PATH``. An existing ``LD_LIBRARY_PATH`` is kept after the
    new entry; when the variable is unset or empty, only the R lib directory
    is written instead of raising ``KeyError``.
    """
    r_home = '/home/lindb/g/R3/lib64/R/'
    os.environ['R_HOME'] = r_home
    r_lib = "%s/lib" % r_home
    existing = os.environ.get('LD_LIBRARY_PATH')
    # Avoid a trailing ':' (an empty search-path entry) when nothing was set before.
    os.environ['LD_LIBRARY_PATH'] = "%s:%s" % (r_lib, existing) if existing else r_lib

In [0]:
# Set R_HOME / LD_LIBRARY_PATH first; the rpy2 imports below come after this call.
setup_r()
# NOTE(review): readline is imported but never referenced in the visible cells -
# presumably it has to load before rpy2; confirm before removing.
import readline
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri as pd2ri
# Turn on rpy2's pandas conversion layer.
pd2ri.activate()
# Handle used by get_r() to evaluate strings of R code.
r = robjects.r

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext rpy2.ipython
%matplotlib inline

In [0]:
%%R
# Show the R version and the packages attached/loaded in this session.
sessionInfo()

In [0]:
def get_r_series(key):
    """Evaluate the R expression ``key`` and return it as a labelled pandas Series.

    The values come from ``get_r(key)``; the index is whatever
    ``get_r("names(<key>)")`` returns.
    """
    values = get_r(key)
    series = pd.Series(values)
    series.index = get_r("names(%s)" % key)
    return series

def get_r_df(key):
    """Evaluate the R expression ``key`` and return it as a pandas DataFrame.

    Row and column labels are taken from ``rownames(<key>)`` and
    ``colnames(<key>)`` on a best-effort basis: if fetching or assigning
    either set of labels fails, the DataFrame keeps its default labels for
    that axis. Only ordinary exceptions are suppressed, so an interrupt
    (Ctrl-C) or interpreter exit still propagates.
    """
    df = pd.DataFrame(get_r(key))
    try:
        rname = get_r("rownames(%s)" % key)
        df.index = rname
    except Exception:
        # best-effort: keep the default row labels
        pass

    try:
        cname = get_r("colnames(%s)" % key)
        df.columns = cname
    except Exception:
        # best-effort: keep the default column labels
        pass

    return df

def get_r(key):
    """Evaluate ``key`` through the notebook-level ``r`` handle and return the result."""
    result = r(key)
    return result

In [0]:
%%R
# Load the list saved by the R script after finish_varcomp() (elements loc, overall, F).
res = readRDS("/home/lindb/wbp/hierfstat/imputed/hierfstatRUN_imputed.rds")

In [0]:
%%R
# Load the basic.stats() result saved by the R script.
bs = readRDS("/home/lindb/wbp/hierfstat/imputed/bs_hierfstatRUN_imputed.rds")

In [0]:
# F element of the finish_varcomp() result.
resF = get_r_df("res$F")
resF

In [0]:
# perloc element of the basic.stats() result; note this rebinds the Python name `bs`.
bs = get_r_df("bs$perloc")
bs.head()

In [0]:
# Components summed over all loci.
overall = get_r_df("res$overall")
overall

In [0]:
# Per-locus components, one row per locus; the row-wise F computations below read this.
loc = get_r_df("res$loc")
loc.head()

In [0]:
#multilocus FST
def compute_F_plotTotal(series):
    """Return the share of the total carried by the first two entries.

    ``series`` is one row of components; the result is
    ``(series[0] + series[1]) / sum(series)``.
    """
    total = sum(series)
    first_two = series[0] + series[1]
    return first_two / total

In [0]:
plot_total_fst = loc.apply(compute_F_plotTotal, axis=1)
plot_total_fst.shape,plot_total_fst.head()

In [0]:
plt.hist(plot_total_fst, bins = [x for x in np.linspace(-0.1,0.15,30)])[2]

In [0]:
def compute_F_popTotal(series):
    """Return ``series[0]`` as a fraction of ``sum(series)``."""
    first = series[0]
    return first / sum(series)

In [0]:
pop_total_fst = loc.apply(compute_F_popTotal, axis = 1)
pop_total_fst.shape,pop_total_fst.head()

In [0]:
plt.hist(pop_total_fst, bins = [x for x in np.linspace(-0.1,0.1,30)])[2]

In [0]:
plt.hist(bs['Fst'].tolist(),bins =[x for x in np.linspace(-0.1,0.1,30)])[2]

In [0]:
def compute_F_plotPop(series):
    """Return ``series[1]`` as a fraction of the sum of ``series[1:]`` (the first entry is left out)."""
    without_first = series[1:]
    return series[1] / sum(without_first)

In [0]:
plot_pop_fst = loc.apply(compute_F_plotPop,axis=1)
plot_pop_fst.shape,plot_pop_fst.head()

In [0]:
plt.hist(plot_pop_fst, bins = [x for x in np.linspace(-0.11,0.11,30)])[2]

In [0]:
# Build one table with a column per F statistic, joined on the shared row index.
dfs = [pop_total_fst,plot_total_fst,plot_pop_fst]
ds = ["pop_total","plot_total","plot_pop"]
df = None
for label, values in zip(ds, dfs):
    column = pd.DataFrame(values)
    column.columns = [label]
    if df is None:
        # the first statistic starts the table
        df = pd.DataFrame(column)
    else:
        df = pd.merge(df, column, left_index=True, right_index=True)
df.head()

In [0]:
# For every column after the first two, store the fraction of rows whose value is > 0.
locDict = OrderedDict()
n_rows = len(hierfdf.index)
for j, SNP in enumerate(hierfdf.columns[2:], 1):
    n_positive = sum(1 for x in hierfdf.loc[:, SNP] if x > 0)
    locDict[SNP] = n_positive / n_rows
    # progress marker every 1000 columns
    if j % 1000 == 0:
        print(j)

In [0]:
perc = pd.DataFrame()
perc[0] = locDict.values()
perc.index = locDict.keys()
perc.columns = ['perc data']
perc.head()

In [0]:
perc.shape

In [0]:
df = pd.merge(df,perc,left_index=True,right_index=True)
df.head()

In [0]:
# Checkpoint: write the merged table (with its index) as a tab-separated file.
filE = '/home/lindb/wbp/hierfstat/imputed/imputed_hierarchical_Fstats.txt'
df.to_csv(filE, header=True,index=True,sep="\t")

In [0]:
# Reload the checkpoint; the first column is restored as the index.
filE = '/home/lindb/wbp/hierfstat/imputed/imputed_hierarchical_Fstats.txt'
df = pd.read_csv(filE,header=0,index_col=0,sep='\t')
df.head()

In [0]:
# Observed range of the 'perc data' column.
min(df['perc data']),max(df['perc data'])

# figures

In [0]:
df.head()

In [0]:
plt.close('all')
fig  = plt.figure(figsize=(5,5),dpi=400)
a1 = plt.subplot(111)
a1.hist(df['plot_total'],bins = [x for x in np.linspace(-0.1,0.2,30)])
a1.spines['right'].set_visible(False)
a1.spines['top'].set_visible(False)
a1.yaxis.set_ticks_position('left')
a1.xaxis.set_ticks_position('bottom')
a1.set_xlabel(r'single-locus $F_{ST}$',fontsize=14)
a1.set_ylabel('Count',fontsize=14)

fig.set_size_inches(4,4)
plt.savefig('/home/lindb/wbp/figures/multilocus_fst_distribution.pdf',format='pdf',bbox_inches='tight')

In [0]:
# Same histogram as the previous cell, written through PdfPages to the same path.
with PdfPages('/home/lindb/wbp/figures/multilocus_fst_distribution.pdf') as pdf:
    plt.close('all')
    fig = plt.figure(figsize=(5,5),dpi=400)
    a1 = fig.add_subplot(111)
    a1.hist(df['plot_total'], bins=list(np.linspace(-0.1,0.2,30)))
    # hide the top/right frame lines and keep ticks on the left/bottom only
    for side in ('right', 'top'):
        a1.spines[side].set_visible(False)
    a1.yaxis.set_ticks_position('left')
    a1.xaxis.set_ticks_position('bottom')
    a1.set_xlabel(r'single-locus $F_{ST}$',fontsize=14)
    a1.set_ylabel('Count',fontsize=14)

    fig.set_size_inches(4,4)
    pdf.savefig(fig,bbox_inches='tight')

In [0]:
# Interactive help lookup left over from development; it has no effect on the analysis.
?plt.savefig

In [0]:
# Observed range of the plot_total column.
min(df['plot_total']),max(df['plot_total'])

In [0]:
# Summary statistics of the plot_total column.
df['plot_total'].describe()

In [0]:
# mean +/- 1.96 standard deviations of the plot_total column.
# The original cell hard-coded 0.006976 and 0.018286 - presumably the mean and
# std copied from the describe() output; computing them keeps the interval in
# step with the data.
fst_mean = df['plot_total'].mean()
fst_std = df['plot_total'].std()
print('low %s' % (fst_mean - (1.96 * fst_std)))
print('high %s' % (fst_mean + (1.96 * fst_std)))

# Overlay significant SNPs as vertical lines on the $F_{ST}$ histogram

### OutFLANK

In [0]:
# Tab-separated OutFLANK result file with a header row.
outflank = '/home/lindb/wbp/OutFLANK/OutFLANK_snps.txt'
outdf = pd.read_csv(outflank,header=0,sep='\t')
# The 'x' column is used below as lookup keys into the plot_total values,
# so it presumably holds SNP names - confirm against the file.
outflanksnps = outdf['x'].tolist()
len(outflanksnps)

In [0]:
dd['NODE_1000013_length_91_cov_1.802198_37']

In [0]:
dd = OrderedDict(df['plot_total'])

In [0]:
vals = [dd[snp] for snp in outflanksnps]
len(vals)

In [0]:
# plot_total histogram with one thin red vertical line per OutFLANK SNP.
with PdfPages('/home/lindb/wbp/figures/multilocus_fst_distribution_outflank_overlay.pdf') as pdf:
    plt.close('all')
    fig = plt.figure(figsize=(5,5),dpi=400)
    a1 = fig.add_subplot(111)
    a1.hist(df['plot_total'], bins=list(np.linspace(-0.1,0.2,30)))
    # hide the top/right frame lines and keep ticks on the left/bottom only
    for side in ('right', 'top'):
        a1.spines[side].set_visible(False)
    a1.yaxis.set_ticks_position('left')
    a1.xaxis.set_ticks_position('bottom')
    a1.set_xlabel(r'multilocus $F_{ST}$',fontsize=14)
    a1.set_ylabel('Count',fontsize=14)

    for snp in outflanksnps:
        a1.axvline(x=dd[snp],c='red',linewidth=0.25,zorder=1)

    fig.set_size_inches(4,4)
    pdf.savefig(fig,bbox_inches='tight')

### bayenv2

In [0]:
# os.listdir is a function, not a module, so it has to come in through a from-import.
from os import listdir as ls
import os.path as op

In [0]:
#get bayenv2 snps
# Each file in DIR is tab-separated with a header row. The text before the
# first "_" in the file name is used both as the environment key and as the
# name of the column to read.
DIR = '/home/lindb/wbp/bayenv2/results/sigsnps'
bayfs = [op.join(DIR,f) for f in ls(DIR)]
baydict = {}
for f in bayfs:
    env = op.basename(f).split("_")[0]
    # Use a dedicated name for the per-file table: rebinding `df` here would
    # discard the F-statistic table that the cells below still read.
    env_table = pd.read_csv(f,header=0,sep='\t')
    baydict[env] = env_table[env].tolist()
    print("%s %d" % (env, len(baydict[env])))

In [0]:
# NOTE(review): 116231 is presumably the number of rows in the table - confirm.
round(0.95*116231)

In [0]:
# 0.95 quantile of plot_total; lower threshold used to colour/count SNPs below.
n5th = df['plot_total'].quantile(0.95)
n5th

In [0]:
# Despite the name, this is the 0.999 quantile (not 0.99); upper threshold used below.
n99th = df['plot_total'].quantile(0.999)
n99th

In [0]:
# A bare path is not valid Python, so it is kept as a note. It matches the
# output-path pattern of the per-environment overlay cell:
# /home/lindb/wbp/figures/multilocus_fst_distribution_bayenv_Ann-ppt_overlay.pdf

In [0]:
# One PDF per environment: plot_total histogram plus one vertical line per
# SNP, coloured by which quantile threshold the SNP's value reaches.
for env in baydict:
    print(env)
    out_pdf = '/home/lindb/wbp/figures/multilocus_fst_distribution_bayenv_%s_overlay.pdf' % env
    with PdfPages(out_pdf) as pdf:
        plt.close('all')
        fig = plt.figure(figsize=(5,5),dpi=400)
        a1 = fig.add_subplot(111)
        a1.hist(df['plot_total'], bins=list(np.linspace(-0.1,0.2,30)))
        for side in ('right', 'top'):
            a1.spines[side].set_visible(False)
        a1.yaxis.set_ticks_position('left')
        a1.xaxis.set_ticks_position('bottom')
        a1.set_xlabel(r'multilocus $F_{ST}$',fontsize=14)
        a1.set_ylabel('Count',fontsize=14)

        for snp in baydict[env]:
            fst = dd[snp]
            # blue at or above n99th, purple at or above n5th, red otherwise
            if fst >= n99th:
                col = 'blue'
            elif fst >= n5th:
                col = 'purple'
            else:
                col = 'red'
            a1.axvline(x=fst,c=col,linewidth=0.25,zorder=1)

        fig.set_size_inches(4,4)
        pdf.savefig(fig,bbox_inches='tight')
        plt.show()


In [0]:
# Per environment: SNPs below n5th, between n5th and n99th, and at or above
# n99th, followed by the sum of the three and the number of SNPs listed.
for env in sorted(baydict):
    rcount = pcount = bcount = 0
    for snp in baydict[env]:
        fst = dd[snp]
        if fst < n5th:
            rcount += 1
        elif fst < n99th:
            pcount += 1
        elif fst >= n99th:
            bcount += 1
    counted = rcount + pcount + bcount
    print("%s %s %s %s %s %s" % (env, rcount, pcount, bcount, counted, len(baydict[env])))

In [0]:
# 3x3 grid, one panel per environment (first nine in sorted order). Each panel
# shows the plot_total histogram with one vertical line per SNP, coloured by
# the quantile threshold its value reaches.
with PdfPages('/home/lindb/wbp/figures/multilocus_fst_distribution_bayenv_all_env_overlay.pdf') as pdf:

    plt.close('all')
    fig, axes = plt.subplots(3, 3, figsize=(5,5),dpi=400)
    fst_bins = list(np.linspace(-0.1,0.2,30))

    plotdict = {}
    count = 1
    for env in sorted(baydict.keys())[0:9]:
        # panels are filled row by row: count 1..9 -> axes.flat[0..8]
        ax = axes.flat[count-1]
        plotdict[count] = ax

        ax.hist(df['plot_total'],bins = fst_bins)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        # NOTE(review): `lst` is not defined in the visible cells - presumably a
        # list of panel labels; confirm it exists before running.
        ax.set_title(lst[count-1],y=.9,loc='right',fontsize=17,fontweight='bold')

        # x label on the bottom row only, y label on the left column only
        if count in [7,8,9]:
            ax.set_xlabel(r'multilocus $F_{ST}$',fontsize=14)
        if count in [1,4,7]:
            ax.set_ylabel('Count',fontsize=14)

        for snp in baydict[env]:
            fst = dd[snp]
            col = 'red'
            if fst >= n5th:
                col = 'purple'
            if fst >= n99th:
                col = 'blue'
            ax.axvline(x=fst,c=col,linewidth=0.25,zorder=1)

        count += 1

    # Save once, after every panel is drawn. Saving inside the loop wrote a
    # new page of the half-finished grid on every iteration.
    fig.set_size_inches(13,10)
    pdf.savefig(fig,bbox_inches='tight')

In [0]:
sorted(baydict.keys())[9:]

In [0]:
for env in sorted(baydict):
    print env,len(baydict[env])

In [0]:
# Second 3x3 grid: the environments after the first nine (sorted order), drawn
# the same way as the first grid. The grid has nine panels, so more than nine
# remaining environments will raise an error rather than be dropped.
with PdfPages('/home/lindb/wbp/figures/multilocus_fst_distribution_bayenv_all_env_overlay_2.pdf') as pdf:

    plt.close('all')
    fig, axes = plt.subplots(3, 3, figsize=(5,5),dpi=400)
    fst_bins = list(np.linspace(-0.1,0.2,30))

    plotdict = {}
    count = 1
    for env in sorted(baydict.keys())[9:]:
        # panels are filled row by row: count 1..9 -> axes.flat[0..8]
        ax = axes.flat[count-1]
        plotdict[count] = ax

        ax.hist(df['plot_total'],bins = fst_bins)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        # NOTE(review): `lst` is not defined in the visible cells - presumably a
        # list of panel labels; confirm it exists before running.
        ax.set_title(lst[count-1],y=.9,loc='right',fontsize=17,fontweight='bold')

        # x label on the bottom row only, y label on the left column only
        if count in [7,8,9]:
            ax.set_xlabel(r'multilocus $F_{ST}$',fontsize=14)
        if count in [1,4,7]:
            ax.set_ylabel('Count',fontsize=14)

        for snp in baydict[env]:
            fst = dd[snp]
            col = 'red'
            if fst >= n5th:
                col = 'purple'
            if fst >= n99th:
                col = 'blue'
            ax.axvline(x=fst,c=col,linewidth=0.25,zorder=1)

        count += 1

    # Save once, after every panel is drawn. Saving inside the loop wrote a
    # new page of the half-finished grid on every iteration.
    fig.set_size_inches(13,10)
    pdf.savefig(fig,bbox_inches='tight')