In [0]:
import os
from collections import OrderedDict,Counter
from __future__ import division
import pandas as pd
import numpy as np
import vcf
from operator import itemgetter
import random
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.ticker as mtick
%matplotlib inline
import os.path as op
import os.listdir as ls

In [0]:
# Input genotype table for the hierfstat analysis. It was created in
# 6_pca.ipynb and is symlinked into the hierfstat folder.
# Genotypes are coded relative to the minor allele.
hierf = '/home/lindb/wbp/hierfstat/missing/missing_z12_maf_swp_trans.txt'

In [0]:
# Load the tab-separated genotype table: the first row supplies the column
# names and the first column becomes the row index.
hierfdf = pd.read_csv(
    hierf,
    sep="\t",
    header=0,
    index_col=0,
)

In [0]:
# Preview the first rows of the loaded table.
hierfdf.head()

In [0]:
# Write the table back out without its index column (header kept,
# tab-separated). The R script below reads this "_NOIDX" file.
hierfdf.to_csv(
    '/home/lindb/wbp/hierfstat/missing/missing_z12_maf_swp_trans_NOIDX.txt',
    sep="\t",
    header=True,
    index=False,
)

# Run the following script in R

```R

# The relative file names used below (input table, saved .rds results)
# resolve against this working directory.
setwd("/home/lindb/wbp/hierfstat/missing/")
# Variance components for a single locus.
#
# x: the values of one locus (one column of the loci table).
# Reads the data frame `levels` from the enclosing/global environment and
# binds it in front of the locus before calling hierfstat's varcomp().
# Returns, invisibly, the `overall` element of the varcomp() result.
get_varcomp <- function(x) {
    # Loaded inside the function -- presumably so the package is attached
    # wherever the function is executed (e.g. on cluster workers).
    library(hierfstat)
    locus_table <- cbind(levels, data.frame(x))
    invisible(varcomp(locus_table, diploid=T)$overall)
}

# Turn a matrix of per-locus variance components into the final result list.
#
# m: numeric matrix with one row per locus and one column per component.
# Reads the globals `lnames` (row labels for m) and `fnames` (labels for the
# ratio matrix).
# Returns list(loc = m with row names, overall = column totals of m,
#              F = upper-triangular matrix where
#                  F[i, j] = sum(tot[i:j]) / sum(tot[i:nblevels])).
finish_varcomp <- function(m) {
    tot <- apply(m, 2, sum, na.rm = TRUE)
    nblevels <- length(tot)
    k <- nblevels - 1

    f <- matrix(0, nrow = k, ncol = k)
    for (i in 1:k) {
        # The denominator only depends on the row, so compute it once per row.
        denom <- sum(tot[i:nblevels])
        for (j in i:k) {
            f[i, j] <- sum(tot[i:j]) / denom
        }
    }

    rownames(m) <- lnames
    print(names(tot))

    # Columns are labelled with fnames; rows with "Total" followed by all
    # but the last of fnames.
    colnames(f) <- fnames
    rownames(f) <- c("Total", fnames[-length(fnames)])

    list(loc = m, overall = tot, F = f)
}


# Driver script: per-locus variance components in parallel, then basic stats.
library(hierfstat)
library(data.table)
library(snow)
# Read the index-free genotype table written from the notebook.
# NOTE(review): data.frame() applies check.names by default, so column names
# may be altered relative to the file header -- confirm the locus names still
# match the ones used on the Python side.
data = data.frame(fread("missing_z12_maf_swp_trans_NOIDX.txt", header=T, sep="\t"))
# The first two columns are used as the hierarchy levels; every remaining
# column is treated as a locus.
levels = data.frame(data[,1:2])
loci = data[,3:ncol(data)]
# Globals read by finish_varcomp(): locus labels and level labels (+ "Ind").
lnames=names(loci)
fnames=c(names(levels), "Ind")
# Start 50 socket workers and export `levels`, which get_varcomp() reads as a
# global.
cl = makeSOCKcluster(50)
clusterExport(cl, "levels", envir=environment())
# Apply get_varcomp to each locus column in parallel; the flat result is
# reshaped to one row per locus.
system.time(res <- matrix(parCapply(cl, loci, get_varcomp), nrow=length(names(loci)),byrow=T))
res = finish_varcomp(res)
saveRDS(res, "hierfstatRUN_missing.rds")
# NOTE(review): basic.stats() receives the full table, including both level
# columns. hierfstat documents its input as population in the first column
# followed by loci -- confirm the second level column is not being treated as
# a locus here.
system.time(bs <- basic.stats(data))
saveRDS(bs, "bs_hierfstatRUN_missing.rds")
stopCluster(cl)
print("Done!")


```

In [0]:
def setup_r():
    """Point the process environment at the local R installation.

    Sets ``R_HOME`` and prepends R's ``lib`` directory to
    ``LD_LIBRARY_PATH``. If ``LD_LIBRARY_PATH`` is unset (or empty) it is
    set to the R ``lib`` directory alone instead of raising ``KeyError``.
    """
    os.environ['R_HOME'] = '/home/lindb/g/R3/lib64/R/'
    r_lib = "%s/lib" % os.environ['R_HOME']
    existing = os.environ.get('LD_LIBRARY_PATH')
    if existing:
        os.environ['LD_LIBRARY_PATH'] = "%s:%s" % (r_lib, existing)
    else:
        # No previous value: do not append ':' plus an empty entry.
        os.environ['LD_LIBRARY_PATH'] = r_lib

In [0]:
# Set R_HOME / LD_LIBRARY_PATH before the rpy2 imports below -- presumably so
# rpy2 picks up the intended R installation; keep this call first.
setup_r()
# NOTE(review): readline is not referenced in this cell; presumably it is
# imported ahead of rpy2 on purpose -- confirm before removing.
import readline
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri as pd2ri
# Enable rpy2's pandas conversion layer.
pd2ri.activate()
# `r` is the callable that get_r() uses to evaluate R code strings.
r = robjects.r

In [0]:
# Notebook extensions: autoreload and the rpy2 extension that provides %%R.
# NOTE(review): `%matplotlib inline` is issued twice in this cell (and once in
# the imports cell); a single occurrence is sufficient.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext rpy2.ipython
%matplotlib inline

In [0]:
%%R
# Record the R version and loaded packages of the embedded R session.
sessionInfo()

In [0]:
def get_r_series(key):
    """Evaluate the R expression ``key`` and return it as a pandas Series.

    The Series index is replaced with the result of evaluating
    ``names(<key>)`` in R.
    """
    values = get_r(key)
    series = pd.Series(values)
    labels = get_r("names(%s)" % key)
    series.index = labels
    return series

def get_r_df(key):
    """Evaluate the R expression ``key`` and return it as a pandas DataFrame.

    Row and column labels are copied from ``rownames(<key>)`` and
    ``colnames(<key>)`` on a best-effort basis: if a lookup or the label
    assignment fails, the DataFrame keeps its default labels for that axis.
    """
    df = pd.DataFrame(get_r(key))
    for axis_attr, r_accessor in (("index", "rownames"), ("columns", "colnames")):
        try:
            setattr(df, axis_attr, get_r("%s(%s)" % (r_accessor, key)))
        except Exception:
            # Best-effort labelling. Catching Exception (rather than using a
            # bare except) lets KeyboardInterrupt and SystemExit propagate.
            pass
    return df

def get_r(key):
    """Pass the string ``key`` to the module-level ``r`` callable and return its result."""
    return r(key)

In [0]:
%%R
# Load the result list saved by the R script into the embedded R session.
res = readRDS("/home/lindb/wbp/hierfstat/missing/hierfstatRUN_missing.rds")

In [0]:
# Pull the `F` element of the saved result (the matrix of variance-component
# ratios built in finish_varcomp) into pandas and display it.
resF = get_r_df("res$F")
resF

In [0]:
# Pull the `overall` element (column totals of the per-locus components) into
# pandas and display it.
overall = get_r_df("res$overall")
overall

In [0]:
# Pull the `loc` element (per-locus variance components, one row per locus)
# into pandas and preview it.
loc = get_r_df("res$loc")
loc.head()

In [0]:
def compute_F_plotTotal(series):
    """Return ``(series[0] + series[1]) / sum(series)``.

    ``series`` is one row of variance components. Presumably entries 0 and 1
    are the two grouping levels (population, plot) -- TODO confirm ordering.
    """
    total = sum(series)
    among_groups = series[0] + series[1]
    return among_groups / total

In [0]:
# Apply compute_F_plotTotal to every row of `loc`, then show the result's
# shape and first values.
plot_total_fst = loc.apply(compute_F_plotTotal, axis=1)
plot_total_fst.shape,plot_total_fst.head()

In [0]:
# Histogram of plot_total_fst with 30 evenly spaced bin edges from -0.1 to
# 0.15; the trailing [2] selects the third item of hist's return value.
plt.hist(plot_total_fst, bins = [x for x in np.linspace(-0.1,0.15,30)])[2]

In [0]:
def compute_F_popTotal(series):
    """Return ``series[0] / sum(series)``.

    ``series`` is one row of variance components. Presumably entry 0 is the
    population-level component -- TODO confirm ordering.
    """
    first_component = series[0]
    total = sum(series)
    return first_component / total

In [0]:
# Apply compute_F_popTotal to every row of `loc`, then show the result's
# shape and first values.
pop_total_fst = loc.apply(compute_F_popTotal, axis = 1)
pop_total_fst.shape,pop_total_fst.head()

In [0]:
# Histogram of pop_total_fst with 30 evenly spaced bin edges from -0.1 to
# 0.1; the trailing [2] selects the third item of hist's return value.
plt.hist(pop_total_fst, bins = [x for x in np.linspace(-0.1,0.1,30)])[2]

In [0]:
def compute_F_plotPop(series):
    """Return ``series[1] / sum(series[1:])``.

    The first entry is excluded from the denominator. Presumably entry 1 is
    the plot-level component -- TODO confirm ordering.
    """
    without_first = series[1:]
    denominator = sum(without_first)
    return series[1] / denominator

In [0]:
# Apply compute_F_plotPop to every row of `loc`, then show the result's shape
# and first values.
plot_pop_fst = loc.apply(compute_F_plotPop,axis=1)
plot_pop_fst.shape,plot_pop_fst.head()

In [0]:
# Histogram of plot_pop_fst with 30 evenly spaced bin edges from -0.11 to
# 0.11; the trailing [2] selects the third item of hist's return value.
plt.hist(plot_pop_fst, bins = [x for x in np.linspace(-0.11,0.11,30)])[2]

In [0]:
# Combine the three per-locus ratio Series into one table, one named column
# per Series, joined on the shared index.
dfs = [pop_total_fst, plot_total_fst, plot_pop_fst]
ds = ["pop_total", "plot_total", "plot_pop"]
df = None
for label, ratios in zip(ds, dfs):
    frame = pd.DataFrame(ratios)
    frame.columns = [label]
    if df is None:
        # The first Series seeds the combined table.
        df = pd.DataFrame(frame)
    else:
        df = pd.merge(df, frame, left_index=True, right_index=True)
df.head()

In [0]:
# For every SNP column (columns from position 2 onward), record the fraction
# of rows whose value is > 0. Presumably values that are not > 0 mark missing
# genotypes, making this the share of individuals with data -- TODO confirm.
locDict = OrderedDict()
# float() keeps the ratio a true division even if `__future__.division` is
# not active in the running kernel.
n_rows = float(len(hierfdf.index))
for j, SNP in enumerate(hierfdf.columns[2:], 1):
    n_positive = sum(1 for x in hierfdf.loc[:, SNP] if x > 0)
    locDict[SNP] = n_positive / n_rows
    # Progress report every 1000 SNPs; print(j) is valid on Python 2 and 3.
    if j % 1000 == 0:
        print(j)

In [0]:
# One-column table of the per-SNP fractions, indexed by SNP name.
# The frame is built in a single step from explicit lists, so it does not
# depend on dict.values()/dict.keys() returning lists.
perc = pd.DataFrame(
    {'perc data': list(locDict.values())},
    index=list(locDict.keys()),
    columns=['perc data'],
)
perc.head()

In [0]:
# Number of SNPs (rows) and columns in the fraction table.
perc.shape

In [0]:
# Attach the 'perc data' column to the ratio table by SNP name.
# Columns that `perc` already contributed are dropped first, so re-running
# this cell does not produce suffixed duplicates ('perc data_x'/'perc data_y').
# NOTE(review): the join is on the index with pandas' default (inner) merge,
# so SNPs whose names differ between the two tables are dropped -- confirm the
# row count is as expected.
df = pd.merge(
    df[[c for c in df.columns if c not in perc.columns]],
    perc,
    left_index=True,
    right_index=True,
)
df.head()

In [0]:
# Preview of the merged table (same view as the end of the previous cell).
df.head()

In [0]:
# Save the combined per-locus table as tab-separated text, keeping both the
# header row and the index.
filE = '/home/lindb/wbp/hierfstat/missing/missing_hierarchical_Fstats.txt'
df.to_csv(
    filE,
    sep="\t",
    header=True,
    index=True,
)

In [0]:
# Smallest and largest value in the 'perc data' column (builtin min/max).
min(df['perc data']),max(df['perc data'])