In [0]:
from __future__ import division
import os
from collections import OrderedDict,Counter
import pandas as pd
import numpy as np
import vcf
from operator import itemgetter
import random
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.ticker as mtick
%matplotlib inline
import math
from scipy.stats import ks_2samp
from scipy.stats import anderson_ksamp
import os.path as op
import os.listdir as ls
from scipy.stats import spearmanr
import skbio
from scipy.stats import pearsonr

# make SNPmat

In [0]:
#get the hierftrans for all SNPS, created above as infile for multilocus FST
#z12 file created in 06_pca.ipyn
filE = '/home/lindb/wbp/OutFLANK/imputed_z12_maf_swp_trans_z12.txt'
imp012 = pd.read_csv(filE,header=0,index_col=0,sep="\t")
imp012.head()

In [0]:
#get pop assignment for each samp
filE = '/home/lindb/wbp/sampsTOpop.txt'
stp = pd.read_csv(filE,header=0,index_col='sampID',sep="\t")
stp.head()

In [0]:
# join genotypes and population labels on their indexes, keep the 'pop' label plus
# the SNP ('NODE...') columns, and order the rows by index
merged = pd.merge(imp012,stp,left_index=True,right_index=True)
cols = ['pop'] + [col for col in merged.columns if 'NODE' in col]
merged = merged[cols].sort_index()
merged.head()

In [0]:
merged.shape

In [0]:
#make sure there aren't any weird data in the dataframe - like 'NAs' or 'np.nan'
#a genotype column may only contain one of these sets of 0/1/2 calls
allowed = ([0,1], [0,1,2], [1,2], [0,2])
for col in merged.columns:
    uni = np.unique(merged[col].tolist()).tolist()
    if uni not in allowed:
        print("%s %s" % (col, uni)) #should only print the pop column

In [0]:
cols = [col for col in merged.columns if 'NODE' in col]
snpmat = merged[cols]
snpmat.head()

In [0]:
pops = pd.DataFrame(merged['pop'].tolist())
pops.head()

In [0]:
filE = '/home/lindb/wbp/OutFLANK/SNPmat_HEADERIDX.txt'
filE2 = '/home/lindb/wbp/OutFLANK/SNPmat_noHEADERIDX.txt'
print 'making 1st snpmat' #so I don't have to watch ls -lt
snpmat.to_csv(filE,header=True,index=True,sep="\t")
print 'making 2nd snpmat'
snpmat.to_csv(filE2,header=None,index=False,sep="\t")

popfile = '/home/lindb/wbp/OutFLANK/SNPmat_popNames.txt'
print 'making popfile'
pops.to_csv(popfile,header=False,index=False,sep="\t")

print 'making locfile'
locfile = '/home/lindb/wbp/OutFLANK/SNPmat_locusNames.txt'
cols = pd.DataFrame(snpmat.columns)
cols.to_csv(locfile,header=False,index=False,sep="\t")

# Put into R

```R

library(OutFLANK)
library(data.table)

SNPmat = data.frame(fread('/home/lindb/wbp/OutFLANK/SNPmat_noHEADERIDX.txt',header=F,sep="\t"))

locusNames = read.csv('/home/lindb/wbp/OutFLANK/SNPmat_locusNames.txt',header=F,sep="\t")

popNames = read.csv('/home/lindb/wbp/OutFLANK/SNPmat_popNames.txt',header=F,sep="\t")

FstDataFrame = MakeDiploidFSTMat(SNPmat,locusNames,popNames)

out = OutFLANK(FstDataFrame = FstDataFrame,NumberOfSamples = 8)

df = out$results

outliers = df[which(df$OutlierFlag == 'TRUE'),]

loci = outliers$LocusName

write.table(df,'/home/lindb/wbp/OutFLANK/OutFLANK_results.txt',row.names=F,col.names=T,sep='\t')

write.table(loci,'/home/lindb/wbp/OutFLANK/OutFLANK_snps.txt',row.names=F,sep='\t')

print("DONE!")



```

In [0]:
snps = pd.read_csv('/home/lindb/wbp/OutFLANK/OutFLANK_snps.txt',header=0,sep="\t")
snps.head()

In [0]:
res = pd.read_csv('/home/lindb/wbp/OutFLANK/OutFLANK_results.txt',header=0,sep='\t')
res.head()

In [0]:
fstnocorr = res['FSTNoCorr'].tolist()
min(fstnocorr)

In [0]:
len(snps.index)

# are outliers caused by effects from missing data?

In [0]:
#I want 'missing' because the 'imputed' will all have perc data (%missing data at a snp) == 1.0
#this file was made in 07_hierfstat_missing.ipynb
percdata = pd.read_csv('/home/lindb/wbp/hierfstat/missing/missing_hierarchical_Fstats.txt',header=0,index_col=0,sep="\t")
percdata = pd.DataFrame(percdata['perc data'])
percdata.head()

In [0]:
#use as index to merge
snps.index = [x for x in snps['x']]
snps.head()

In [0]:
merged = pd.merge(snps,percdata,left_index=True,right_index=True)
merged.head()

In [0]:
len(merged.index)

In [0]:
plt.hist(merged['perc data'])

In [0]:
plt.hist(percdata['perc data'])

In [0]:
results = pd.read_csv('/home/lindb/wbp/OutFLANK/OutFLANK_results.txt',header=0,index_col='LocusName',sep="\t")
results.head()

In [0]:
np.unique(results['OutlierFlag'])

In [0]:
trues = results[results[u'OutlierFlag'] == True]
len(trues.index)

In [0]:
trues.head()

In [0]:
merged2 = pd.merge(merged,pd.DataFrame(trues[['FST','FSTNoCorr']]),left_index=True,right_index=True)

In [0]:
merged2.head()

In [0]:
plt.scatter(merged2['perc data'],merged2['FSTNoCorr'])

In [0]:
plt.scatter(merged2['perc data'],merged2['FST'])

In [0]:
merged2['FSTNoCorr'].describe()

In [0]:
len(merged2.index)

In [0]:
#get multilocus f stats for the imputed outlier snps
#this file was made in 07_hierfstat_imputed.ipynb
impfstats = pd.read_csv('/home/lindb/wbp/hierfstat/imputed/imputed_hierarchical_Fstats.txt',header=0,index_col=0,sep="\t")
impfstats = impfstats.loc[:,[col for col in impfstats.columns if not 'perc' in col]]
impfstats.head()

In [0]:
#get the important columns
#drop only the helper column literally named 'x' (the locus IDs, which are already the index);
#a substring test ('x' in col) would also drop any other column whose name contains an 'x'
allstats = pd.merge(impfstats,merged2.loc[:,[col for col in merged2.columns if col != 'x']],
                    left_index=True,right_index=True)
allstats.head()

In [0]:
#rename so I don't have to search my script to find what they mean or where they come from
cols = ['pop_total_hierfstat', 'plot_total_hierfstat', 'plot_pop_hierfstat', 'perc missing genotypes', 'FST_outflank', 
        'FSTNoCorr_outflank']
allstats.columns = [col for col in cols]
allstats.head()

In [0]:
#write to file
filE = '/home/lindb/wbp/OutFLANK/hierarchical_Fstats_outflankoutliers.txt'
allstats.to_csv(filE,header=True,index=True,sep='\t')

In [0]:
allstats.head()

# covariances using H_exp

In [0]:
#imputed_z12_maf_swp_trans_z12.txt was made in 06_pca.ipynb and is symlinked in /OutFLANK
filE = '/home/lindb/wbp/OutFLANK/imputed_z12_maf_swp_trans_z12.txt'
imp012 = pd.read_csv(filE,header=0,index_col=0,sep="\t")
imp012.head()

In [0]:
imp012.shape

In [0]:
#get expected heterozygosity (2pq) per SNP from the counts of 0, 1 and 2 genotype calls
Hexp = OrderedDict()
count = 0
for snp in imp012.columns:
    genotypes = imp012[snp].tolist()
    zero, one, two = [genotypes.count(g) for g in (0, 1, 2)]
    n_alleles = 2*(zero+one+two)

    p = ((2*zero)+one)/n_alleles
    q = ((2*two)+one)/n_alleles

    Hexp[snp] = 2*p*q
    count += 1
    if count % 10000 == 0:
        print(count)
len(Hexp.keys())

In [0]:
#write expected heterozygosity per SNP, one "locus<TAB>value" line each
#the value column is named 'h_exp' (lower case) because the cells that read this file
#back all index it as H['h_exp']; writing 'H_exp' made those lookups raise KeyError
filE = '/home/lindb/wbp/OutFLANK/Hexp_by_snp.txt'
with open(filE,'w') as o:
    text = 'locus\th_exp\n'
    o.write("%s" % text)
    for snp in Hexp.keys():
        text = '\t'.join([snp,str(Hexp[snp])])+'\n'
        o.write("%s" % text)

In [0]:
filE = '/home/lindb/wbp/OutFLANK/Hexp_by_snp.txt'
H = pd.read_csv(filE,header=0,sep='\t')
H.index = [snp for snp in H['locus'].tolist()]
H.head()

In [0]:
plt.hist(H['h_exp'],bins = [x for x in np.arange(0,.51,0.01)])[2]

In [0]:
#(stray debug expression removed: `h` is only assigned inside the bin-assignment loop,
# so evaluating it at this point raises NameError on a fresh kernel)

In [0]:
len(np.arange(0,0.51,0.01))

In [0]:
#see what will happen
#1st bin is the j=0th bin, 50th bin is the j=49th bin
for Bin,j in enumerate(np.arange(0,.51,.01)):
    #print Bin,j
    if 0.50>j: #0.50 don't need their own bin
        print "heyo",Bin,j

In [0]:
#H is indexed by locus name, so take row 1043 by position rather than by label
np.round(H['h_exp'].iloc[1043],decimals=4)

In [0]:
#assign bins to samps
#1st bin is the j=0th bin, 50th bin is the j=49th bin
#each SNP gets the index of the last bin edge that is strictly below its 2pq (rounded to
#3 decimals), so 2pq=0.5 lands in bin 49 rather than in a group of its own
edges = np.arange(0,0.51,0.01)
hvals = np.round(H['h_exp'].values.astype(float),decimals=3)
#searchsorted(side='left') returns, for every value, how many edges are strictly below it;
#this is the same comparison the row-by-row loop made, done for all SNPs in one pass
bins = np.searchsorted(edges,hvals,side='left')-1
#2pq will never be equal to 0 for a SNP, but values at/below the first edge (or NaN) go to bin 0
bins[(bins < 0) | np.isnan(hvals)] = 0
binDict = OrderedDict(zip(H.index,bins.tolist()))
count = len(H.index)

In [0]:
len(np.unique(binDict.values()).tolist()) # how many bins

In [0]:
binDict.values()[:10],binDict.values()[-10:]

In [0]:
H['bin'] = binDict.values()

In [0]:
H.head()

In [0]:
np.unique(H['bin'])

In [0]:
#write the file
filE = '/home/lindb/wbp/OutFLANK/Hexp_by_snp_withbins.txt'
H.to_csv(filE,header=True,index=True,sep='\t')

In [0]:
#read back the file
filE = '/home/lindb/wbp/OutFLANK/Hexp_by_snp_withbins.txt'
H = pd.read_csv(filE, header=0,index_col=0,sep='\t')

In [0]:
from matplotlib.backends.backend_pdf import PdfPages

In [0]:
H.head()

In [0]:
with PdfPages('/home/lindb/wbp/OutFLANK/figures/expected_heterozygosit_all_SNPs.pdf') as pdf:
    fig = plt.figure()
    plt.hist(H['h_exp'],bins = [x for x in np.arange(0,0.51,0.01)])[2]
    plt.xlabel('Expected Heterozygosity')
    plt.ylabel('Count')
    #set_size_inches(5,5)
    pdf.savefig(fig,bbox_inches='tight')

In [0]:
#histogram of the bin assignments; saved under its own file name so it does not
#overwrite the expected-heterozygosity histogram PDF, which used the same path
with PdfPages('/home/lindb/wbp/OutFLANK/figures/expected_heterozygosit_bins_all_SNPs.pdf') as pdf:
    fig = plt.figure()
    plt.hist(H['bin'],bins = [x for x in range(51)])[2]
    plt.xlabel('Expected Heterozygosity bin')
    plt.ylabel('Count')
    #set_size_inches(5,5)
    pdf.savefig(fig,bbox_inches='tight')

In [0]:
len(H.index)

# make minor allele freq dataframe

In [0]:
#get allele counts by pop - first locus = counts of 0 allele, second = counts of 2 allele
    #012 counts global minor allele
counts = pd.read_csv('/home/lindb/wbp/bayenv2/UnbinnedImputedSNPSFILE.txt',header=0,index_col=0,sep="\t")
counts.head()

In [0]:
#first row + second row of DataFrame(counts)
43+91+43+42+87+42+41+40+7+5+7+8+11+6+7+8

In [0]:
df = pd.DataFrame(counts.loc['NODE_1000013_length_91_cov_1.802198_37',:])
df.index = ['major','minor']
df

In [0]:
sum(df['Dicks_Pass'])

In [0]:
df.loc['minor','Dicks_Pass']/sum(df['Dicks_Pass'])

In [0]:
#(my notebook is on a slow node)
#make minor allele freq dataframe
text = '''from __future__ import division
import os
from collections import OrderedDict,Counter
import pandas as pd
import numpy as np
import random

f2 = '/home/lindb/wbp/OutFLANK/update_maf.txt'
with open(f2,'w') as o:
    text = 'starting\\n'
    o.write("%s" % text)

counts = pd.read_csv('/home/lindb/wbp/bayenv2/UnbinnedImputedSNPSFILE.txt',header=0,index_col=0,sep="\\t")
loci = np.unique(counts.index).tolist()

loccount = 0
mafDict = OrderedDict()
for locus in loci:
    mafDict[locus] = OrderedDict()
    data = pd.DataFrame(counts.loc[locus,:])
    data.index = ['major','minor']
    for pop in data.columns:
        MAF = data.loc['minor',pop]/sum(data[pop]) #get allele freq corresponding to global minor allele
        mafDict[locus][pop] = MAF
    loccount += 1
    if loccount % 1000 == 0:
        with open(f2,'a') as o:
            text = "%s" % str(loccount)
            o.write("%s\\n" % text)
        print loccount

with open(f2,'a') as o:
    text = 'writing file\\n'
    o.write("%s" % text)

filE = '/home/lindb/wbp/OutFLANK/imputed_MAF.txt'
with open(filE,'w') as o:
    text = '\\t'.join([x for x in mafDict[mafDict.keys()[0]].keys()]) + '\\n'
    o.write("%s" % text)
    print text
    count = 0
    for locus in mafDict.keys():
        text = locus + '\\t' + '\\t'.join([str(x) for x in mafDict[locus].values()]) + '\\n'
        o.write("%s" % text)
        count += 1
'''

In [0]:
filE = '/home/lindb/wbp/OutFLANK/maf.py'
with open(filE,'w') as o:
    o.write("%s" % text)

In [0]:
shtext = '''#!/bin/bash
#$ -N maf
#$ -V
#$ -j y
#$ -cwd

cd /home/lindb/wbp/OutFLANK/
python maf.py

'''
filE = '/home/lindb/wbp/OutFLANK/get_maf.sh'
with open(filE,'w') as o:
    o.write("%s" % shtext)

In [0]:
!qsub /home/lindb/wbp/OutFLANK/get_maf.sh

In [0]:
outliersnps = snps['x'].tolist()
len(outliersnps)

In [0]:
outliersnps

In [0]:
impMAF = pd.read_csv('/home/lindb/wbp/OutFLANK/imputed_MAF.txt',header=0,index_col=0,sep="\t")
impMAF.head()

In [0]:
filE= '/home/lindb/wbp/OutFLANK/imputed_MAF.txt'
impMAF.to_csv(filE,header=True,index=True,sep='\t')

In [0]:
impMAF.head()

In [0]:
impMAF.shape

In [0]:
for pop in impMAF.columns:
    print pop,min(impMAF[pop]),max(impMAF[pop])

# get global maf allele freqs

In [0]:
#global minor allele frequency per SNP: the smaller of the two allele frequencies
#computed from the counts of 0, 1 and 2 genotype calls
glob = OrderedDict()
count = 0
for snp in imp012.columns:
    genotypes = imp012[snp].tolist()
    zero, one, two = [genotypes.count(g) for g in (0, 1, 2)]
    n_alleles = 2*(zero+one+two)

    a1 = ((2*zero)+one)/n_alleles
    a2 = ((2*two)+one)/n_alleles

    q = min(a1,a2)
    glob[snp] = q

    count += 1
    if count % 10000 == 0:
        print(count)

In [0]:
len(glob.keys())

In [0]:
filE = '/home/lindb/wbp/OutFLANK/global_mafs.txt'
with open(filE,'w') as o:
    text = '\t'.join(['locus','maf'])+'\n'
    o.write("%s" % text)
    for snp in glob.keys():
        text = '\t'.join([snp,str(glob[snp])])+'\n'
        o.write("%s" % text)
globmafs = pd.read_csv(filE,header=0,sep='\t')
globmafs.head()

In [0]:
H.head()

In [0]:
#make sure global allele freq and H_exp make sense
H.loc['NODE_1000031_length_98_cov_2.000000_30','h_exp']

In [0]:
2*0.151639*(1-0.151639)

In [0]:
len(globmafs.index)

# get pop sizes

In [0]:
filE = '/home/lindb/wbp/sampsTOpop.txt'
stp = pd.read_csv(filE,header=0,sep="\t")
stp.head()

In [0]:
#pops matched to samps
ptsDict = OrderedDict() #pop to samp dictionary
for row in stp.index:
    pop = stp.loc[row,'pop']
    #setdefault creates the list the first time a pop is seen
    ptsDict.setdefault(pop,[]).append(stp.loc[row,'sampID'])
for pop,samps in ptsDict.items():
    print("%s %s" % (pop,len(samps)))

In [0]:
#get a dict with num indiv in pop, plus the total number of individuals
popDict = OrderedDict((pop,len(samps)) for pop,samps in ptsDict.items())
for pop in popDict:
    print("%s %s" % (pop,popDict[pop]))
total = sum(popDict.values())
total

# covariances using H_exp

In [0]:
snps = pd.read_csv('/home/lindb/wbp/OutFLANK/OutFLANK_snps.txt',header=0,sep='\t')
outliersnps = snps['x'].tolist()

In [0]:
filE = '/home/lindb/wbp/OutFLANK/OutFLANK_snps.txt'
snps.to_csv(filE,header=True,index=False,sep='\t')

In [0]:
len(outliersnps)

    D_ij = sum { (nk/n)*((qik*qjk)-(qi*qj)) } for 1:k pops
    qik = snp i maf for pop k
    qjk = snp j maf for pop k
    
    qi = global maf of snp i
    qj = global maf of snp j
    nk = number of individuals in pop k, n = total number of individuals

In [0]:
#do pairwise to get D
#D_ij = sum over pops k of (nk/n)*((qik*qjk)-(qi*qj)); only the lower triangle (i > j)
#is computed, every other cell of the matrix is NaN
#n is taken from popDict instead of a hard-coded 244, matching the script generated for
#the random SNP sets (which divides by sum(popDict.values()))
ntotal = float(sum(popDict.values()))
dijDict = OrderedDict()
icount = 0
for i,locusi in enumerate(outliersnps):
    dijDict[locusi] = OrderedDict()
    qi = glob[locusi] #global maf

    for j,locusj in enumerate(outliersnps):
        if i > j: #i=row, j=col : lower triangle
            qj = glob[locusj] #global maf

            sums = 0
            for pop in impMAF.columns:
                qik = impMAF.loc[locusi,pop] #get pop maf
                qjk = impMAF.loc[locusj,pop] #get pop maf
                nk = popDict[pop]

                sums += (nk/ntotal)*((qik*qjk)-(qi*qj))

            dijDict[locusi][locusj] = sums
        else:
            dijDict[locusi][locusj] = np.nan
    icount += 1
    if icount % 10 == 0:
        print(icount)

In [0]:
#write out the file
rowcount = 0
filE = '/home/lindb/wbp/OutFLANK/covariances/dvals/imputed_dvals.txt'
if not op.exists(op.dirname(filE)):
    os.makedirs(op.dirname(filE))
with open(filE,'w') as o:
    key0 = dijDict.keys()[0]
    line = '\t'.join(dijDict[key0].keys()) + str('\n')
    o.write("%s" % line)
    for locusi in dijDict.keys():
        line = str(locusi)+'\t'+'\t'.join([str(x) for x in dijDict[locusi].values()]) + str('\n')
        o.write("%s" % line)

In [0]:
dvals = pd.read_csv(filE,header=0,index_col=0,sep="\t")
dvals.head()

In [0]:
#get a dataframe with the outlier loci and their bins
outlierdata = pd.DataFrame(H[H['locus'].isin(outliersnps)])
outlierdata.index = [snp for snp in outlierdata['locus'].tolist()]
outlierdata.head()

In [0]:
with PdfPages('/home/lindb/wbp/OutFLANK/figures/expected_heterozygosit_outlier_SNPs.pdf') as pdf:
    fig = plt.figure()
    plt.hist(outlierdata['h_exp'].tolist(),bins = [binn for binn in np.arange(0,0.51,0.01)])[2]
    plt.xlabel('Expected Heterozygosity')
    plt.ylabel('Count')
    #set_size_inches(5,5)
    pdf.savefig(fig,bbox_inches='tight')

In [0]:
plt.hist(outlierdata['h_exp'].tolist(),bins = [binn for binn in np.arange(0,0.51,0.01)])[2]

In [0]:
nonsigs = set(H.index.tolist()) - set(outlierdata.index.tolist())
nonsigs = [x for x in nonsigs]
len(nonsigs)

In [0]:
nonsigdata = pd.DataFrame(H[H['locus'].isin(nonsigs)])
nonsigdata.shape

In [0]:
#how many random snps from each bin?
binCounter = Counter()
for row in outlierdata.index:
    binCounter[outlierdata.loc[row,'bin']] += 1
for b in binCounter.keys():
    print b,binCounter[b]

In [0]:
#make 1000 dataframes with a set of snps == 110 = len(outliersnps)
for i in range(20):                                  #make 20 .py files
    for j in range(50):                              #each .py file makes 50 matrices
        snps = []        
        for binn in binCounter.keys():
            data = nonsigdata[nonsigdata['bin'] == binn]
            
            [snps.append(snp) for snp in random.sample(data.index,binCounter[binn])]
        
        print len(snps)
        DIR = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/randsnps'
        if not op.exists(DIR):
            os.makedirs(DIR)
        filE = op.join(DIR,"outflank_%s_%s_randsnps.txt" % (str(i).zfill(2),str(j).zfill(2)))
        df = pd.DataFrame(snps)
        df.to_csv(filE,header=False,index=False,sep="\t")

In [0]:
stp.head()

In [0]:
#I'm using all of my engines for bayenv2 at the moment, ran these with GNU parallel
#get dvals for 1000 sets of random snps of len=110
#one stand-alone script is written per random set; the script body is the same for every
#set, only the two pairs of %s placeholders (set ids k,l in the input and output paths) differ
template = '''from __future__ import division
import os
from collections import OrderedDict,Counter
import pandas as pd
import numpy as np
import vcf
from operator import itemgetter
import random
import math
from scipy.stats import ks_2samp
from scipy.stats import anderson_ksamp
from os import path as op
from os import listdir as ls

#get pop assignment for each samp
filE = '/home/lindb/wbp/sampsTOpop.txt'
stp = pd.read_csv(filE,header=0,sep="\\t")


#pops matched to samps
ptsDict = OrderedDict() #pop to samp dictionary
for row in stp.index:
    pop = stp.loc[row,'pop']
    if not pop in ptsDict.keys():
        ptsDict[pop] = []
    ptsDict[pop].append(stp.loc[row,'sampID'])


#get a dict with num indiv in pop
popDict = OrderedDict()
total = 0
for pop in ptsDict.keys():
    popDict[pop] = len(ptsDict[pop])
    print pop,popDict[pop]
    total += popDict[pop]


filE = '/home/lindb/wbp/OutFLANK/global_mafs.txt'
globs = pd.read_csv(filE,header=0,sep='\\t')
glob = OrderedDict()
for row in globs.index:
    snp = globs.loc[row,'locus']
    maf = globs.loc[row,'maf']
    glob[snp] = maf

impMAF = pd.read_csv('/home/lindb/wbp/OutFLANK/imputed_MAF.txt',header=0,index_col=0,sep="\\t")

filE= '/home/lindb/wbp/OutFLANK/covariances/randmatrices/randsnps/outflank_%s_%s_randsnps.txt' 
df = pd.read_csv(filE,header=None,sep="\\t")
randomsnps = df[0].tolist()

dijDict = OrderedDict() 
for i,locusi in enumerate(randomsnps):
    dijDict[locusi] = OrderedDict()
    qi = glob[locusi] #global maf

    for j,locusj in enumerate(randomsnps):
        if i > j: #i=row, j=col : lower triangle 
            qj = glob[locusj] #global maf

            sums = 0
            for pop in impMAF.columns:
                qik = impMAF.loc[locusi,pop] #get pop maf
                qjk = impMAF.loc[locusj,pop] #get pop maf
                nk = popDict[pop]

                sums += (nk/sum(popDict.values()))*((qik*qjk)-(qi*qj))

            dijDict[locusi][locusj] = sums
        else:
            dijDict[locusi][locusj] = np.nan

filE = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0outfiles/outflank_%s_%s_imputedDVALS.txt'
DIR = op.dirname(filE)
if not op.exists(DIR):
    try:
        os.makedirs(DIR)
    except OSError:
        #a job running in parallel may have created it first; only fail if it is really missing
        if not op.isdir(DIR):
            raise

with open(filE,'w') as o:
    key0 = dijDict.keys()[0]
    line = '\\t'.join(dijDict[key0].keys()) + str('\\n')
    o.write("%%s" %% line)
    for locusi in dijDict.keys():
        line = str(locusi)+'\\t'+'\\t'.join([str(x) for x in dijDict[locusi].values()]) + str('\\n')
        o.write("%%s" %% line)


'''
#make sure the directory the scripts are written to exists before opening files in it
PYDIR = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0pyfiles'
if not op.exists(PYDIR):
    os.makedirs(PYDIR)
for k in range(20):
    for l in range(50):
        ids = (str(k).zfill(2),str(l).zfill(2))
        text = template % (ids + ids)   #ids are used twice: input file name, output file name
        filE = op.join(PYDIR,'get_rand_dvals_%s_%s.py' % ids)
        with open(filE,'w') as o:
            o.write("%s" % text)

In [0]:
#make qsub files
#bundle the generated get_rand_dvals scripts into run files of PERFILE `python <script>`
#calls each; a final, shorter batch is written too instead of being silently dropped
PERFILE = 40
header = '''#!/bin/bash
#$ -N snpsfile
#$ -V
#$ -j y
#$ -cwd
'''
RUNDIR = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0runfiles'
if not op.exists(RUNDIR):
    os.makedirs(RUNDIR)
pyfiles = ['/home/lindb/wbp/OutFLANK/covariances/randmatrices/0pyfiles/get_rand_dvals_%s_%s.py' % (str(i).zfill(2),
                                                                                                    str(j).zfill(2))
           for i in range(20) for j in range(50)]
shcount = 0
for start in range(0,len(pyfiles),PERFILE):
    batch = pyfiles[start:start+PERFILE]
    text = header + ''.join(['\npython %s\n' % f for f in batch])
    filE = op.join(RUNDIR,'%s_run.sh' % str(shcount).zfill(2))
    with open(filE,'w') as o:
        o.write("%s" % text)
    shcount += 1

### place median observed dij in distribution of median dij for random snps

In [0]:
# get observed dvals: absolute values of the lower triangle of the outlier-SNP matrix
DF = pd.read_csv('/home/lindb/wbp/OutFLANK/covariances/dvals/imputed_dvals.txt',header=0,index_col=0,sep="\t")
dvals = []
for i,row in enumerate(DF.index):
    for j,col in enumerate(DF.columns):
        if i > j:
            dvals.append(abs(DF.loc[row,col]))

# one matrix file per random SNP set
DIR = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0outfiles'
files = [f for f in os.listdir(DIR) if f.startswith('outflank')]

# median absolute lower-triangle value of each random matrix
fcount = 0
medvals = []
for f in files:
    df = pd.read_csv(op.join(DIR,f),header=0,index_col=0,sep="\t") #op.join (was the undefined name opjoin)
    rvals = [] #random dij values
    for i,row in enumerate(df.index):
        for j,col in enumerate(df.columns):
            if i>j:
                rvals.append(df.loc[row,col])

    medvals.append(np.median([abs(x) for x in rvals]))

    fcount += 1
    if fcount % 100 == 0:
        print(fcount)

# write the medians, one per line, and keep them as a one-column DataFrame
filE = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0randmedvals/outflank_imputed_randmedvalues.txt'
if not op.exists(op.dirname(filE)):
    os.makedirs(op.dirname(filE))
medvals = pd.DataFrame(medvals)
medvals.to_csv(filE,header=False,index=False,sep="\t")

In [0]:
1+1

In [0]:
medDF = pd.read_csv(filE,header=None,sep='\t')
medDF.head()

In [0]:
sorts = sorted(medvals[0].tolist())
#95th percentile = value at rank ceil(0.95*n), computed with integer arithmetic;
#for 1000 random sets this is sorts[949], the index that was hard-coded before
n5th = sorts[(95*len(sorts)+99)//100 - 1] #95th percentile

In [0]:
#this is the distribution of median values of random SNPs
#red line is the 95th percentile (95th = 0.0006213)
plt.hist(sorts)[2] 
plt.axvline(x=n5th,c="red",linewidth=2,zorder=0) #should be zorder=1 

In [0]:
#this is the distribution of observed Dvals
#red line is the median value (median = 0.0043892)
med = np.median(dvals)
fig = plt.hist(dvals)[2]
plt.axvline(x=med,c="red",linewidth=5,zorder=0)

In [0]:
#how much bigger is the empirical median dvalue than the 100th percentile of random SNPs?
np.median(dvals)/max(sorts)

In [0]:
np.median(dvals)/n5th

In [0]:
len(dvals)

In [0]:
med

In [0]:
#index (and proportion) of the first sorted observed dval that is not less than the
#maximum of the random-set medians; nothing is printed if every dval is below it
hit = next((i for i,medi in enumerate(sorted(dvals)) if not medi < max(sorts)),None)
if hit is not None:
    print("%s %s" % (hit,hit/len(dvals)))

In [0]:
#below what percentile of the observed dvals are the values less than the 95th perc randdvals
#compares against n5th, the 95th-percentile value defined from `sorts`, rather than
#sorts[950], which is one rank higher than the sorts[949] used everywhere else
for i,D in enumerate(sorted(dvals)):
    if not D < n5th: #if the observed D-value isn't less than the 95th percentile of the random distribution of D
        print("%s %s" % (i,i/len(dvals)))
        break

In [0]:
max(sorts),med

In [0]:
n5th,med

# allele frequency shifts

### using median abs Dij

In [0]:
#dataframe to get population MAF across 8 pops using all n=244 samples (can't use this for GEMMA since pop sizes r diff
impMAF.head()

In [0]:
#population sizes
popDict

In [0]:
len(outliersnps)

In [0]:
#do pairwise pops for empirical SNPs IDed by OutFLANK and calculate median |Dij| for each pop pair
#for a pair (popm,popl): Dij = sum over the two pops k of (nk/(nm+nl))*((qik*qjk)-(qi*qj)),
#where qi,qj are the minor allele frequencies pooled over just those two pops
shiftDict = OrderedDict()
kcount = 0
nloci = len(outliersnps)
for m,popm in enumerate(impMAF.columns):
    print(popm)
    shiftDict[popm] = OrderedDict()
    nm = popDict[popm]                                  #N individuals in popm
    for l,popl in enumerate(impMAF.columns):
        if m>l: #only need to do the lower triangle
            nl = popDict[popl]                          #N individuals in popl
            npair = float(nm+nl)
            #these values depend only on the pop pair and one locus, so they are looked up
            #once per locus here instead of being recomputed for every locus pair
            fm = [impMAF.loc[locus,popm] for locus in outliersnps] #minor allele freq in popm
            fl = [impMAF.loc[locus,popl] for locus in outliersnps] #minor allele freq in popl
            #pooled maf: minor allele counts (freq*2N, rounded) over the number of alleles in both pops
            q = [(round(a*2*nm)+round(b*2*nl))/((2*nm)+(2*nl)) for a,b in zip(fm,fl)]

            dijlist = []
            for i in range(nloci):
                for j in range(i): #i=row, j=col : lower triangle, no redundancies, no diagonal
                    kcount += 1
                    sums = 0
                    sums += (nm/npair)*((fm[i]*fm[j])-(q[i]*q[j])) #popm term
                    sums += (nl/npair)*((fl[i]*fl[j])-(q[i]*q[j])) #popl term
                    dijlist.append(sums) #each pairwise pop comparison has a matrix of Dij
            shiftDict[popm][popl] = np.median([abs(d) for d in dijlist])
        else:
            shiftDict[popm][popl] = np.nan #no redundancies,no diagonal. will be faster to reflect across diag later

In [0]:
#write the pop-by-pop matrix of median |Dij|: header row of pop names, then one row per pop
filE = '/home/lindb/wbp/OutFLANK/freqshifts/pop_pairwise_dij.text'
#make sure the output directory exists before opening the file in it
if not op.exists(op.dirname(filE)):
    os.makedirs(op.dirname(filE))
with open(filE,'w') as o:
    key0 = shiftDict.keys()[0]
    line = '\t'.join(shiftDict[key0].keys()) + str('\n')
    o.write("%s" % line)
    for popk in shiftDict.keys():
        text = str(popk)+'\t'+'\t'.join([str(d) for d in shiftDict[popk].values()])+'\n'
        o.write("%s" % text)

In [0]:
filE = '/home/lindb/wbp/OutFLANK/freqshifts/pop_pairwise_dij.text'
shiftDF = pd.read_csv(filE,header=0,index_col=0,sep='\t')
shiftDF

In [0]:
shifts = []
for i,row in enumerate(shiftDF.index):
    for j,col in enumerate(shiftDF.columns):
        if i > j:
            shifts.append(shiftDF.loc[row,col])

In [0]:
#make the matrix symmetric: zero on the diagonal, and every NaN cell takes the value of
#its transposed cell
for i,popi in enumerate(shiftDF.index):
    for j,popj in enumerate(shiftDF.columns):
        if i == j:
            shiftDF.loc[popi,popj] =0
            continue
        if math.isnan(shiftDF.loc[popi,popj]):
            shiftDF.loc[popi,popj] = shiftDF.loc[popj,popi]
shiftDF

In [0]:
filE = '/home/lindb/wbp/bayenv2/distance_matrices/geographic_distances.txt'
geodist = pd.read_csv(filE,header=0,index_col=0,sep='\t')
geodist

In [0]:
geodist.loc['Freel_Peak','Dicks_Pass'][:-3]

In [0]:
#lower-triangle geographic distances as floats
#entries read from the raw file are text with a 3-character unit suffix (the same [:-3]
#strip the 'get rid of the km' cell applies); entries that are already numeric are used as is
geos = []
for i,row in enumerate(geodist.index):
    for j,col in enumerate(geodist.columns):
        if i > j:
            val = geodist.loc[row,col]
            try:
                val = float(val)
            except ValueError:
                val = float(val[:-3])
            geos.append(val)

In [0]:
#get rid of the km
#only strip the 3-character suffix when the entry is still text, so the cell can be re-run
#(or run after the cleaned matrix has been written back to the same file) without
#cutting digits off, or failing on, values that are already numbers
for i,popi in enumerate(geodist.index):
    for j,popj in enumerate(geodist.columns):
        if i>j:
            val = geodist.loc[popi,popj]
            try:
                val = float(val)
            except ValueError:
                val = float(val[:-3])
            geodist.loc[popi,popj] = val
geodist

In [0]:
for i,popi in enumerate(geodist.index):
    for j,popj in enumerate(geodist.columns):
        if i == j:
            geodist.loc[popi,popj] = 0
        elif math.isnan(geodist.loc[popi,popj]) == True:
            geodist.loc[popi,popj] = geodist.loc[popj,popi]
geodist

In [0]:
filE = '/home/lindb/wbp/bayenv2/distance_matrices/geographic_distances.txt'
geodist.to_csv(filE,header=True,index=True,sep='\t')

In [0]:
plt.scatter(shifts,geos)

In [0]:
#Mantel shift vs geodist
skbio.stats.distance.mantel(shiftDF,geodist,permutations=9999)

In [0]:
#file made in 08_bayen2
envdist = pd.read_csv('/home/lindb/wbp/bayenv2/matrices/environmental_distances.txt',header=0,index_col=0,sep='\t')
envdist

In [0]:
envs = []
for i,row in enumerate(envdist.index):
    for j,col in enumerate(envdist.columns):
        if i > j:
            envs.append(float(envdist.loc[row,col]))
envs

In [0]:
plt.scatter(envs,shifts)

In [0]:
for i,popi in enumerate(envdist.index):
    for j,popj in enumerate(envdist.columns):
        if i == j:
            envdist.loc[popi,popj] = 0
        elif math.isnan(envdist.loc[popi,popj]) == True:
            envdist.loc[popi,popj] = envdist.loc[popj,popi]
envdist

In [0]:
#write the symmetric environmental distance matrix, keeping the pop names as row labels
filE = '/home/lindb/wbp/bayenv2/distance_matrices/environmental_distances.txt'
#to_csv takes `index`; `index_col` is a read_csv argument and raises TypeError here
envdist.to_csv(filE,header=True,index=True,sep='\t')

In [0]:
#file made in 08_bayen2
envdist = pd.read_csv('/home/lindb/wbp/bayenv2/distance_matrices/environmental_distances.txt',header=0,index_col=0,sep='\t')
envdist

In [0]:
#Mantel shift vs total environmental distance
skbio.stats.distance.mantel(shiftDF,envdist,permutations=9999)

In [0]:
envdf = pd.read_csv('/home/lindb/wbp/bayenv2/ENVIRONFILE_headerIDX.txt',header=0,index_col=0,sep='\t')
envdf = envdf.loc[:,[col for col in envdf.columns[:8]]]
envdf.head()

In [0]:
#get distance matrices for each of the environmental variables
#entry (popi,popj) is the absolute difference in that variable between the two pops,
#with 0 on the diagonal; rows/columns are labelled like shiftDF
envdDict = OrderedDict()
for env in envdf.index:
    envdDict[env] = pd.DataFrame(index=[pop for pop in shiftDF.index],columns=[pop for pop in shiftDF.columns])
    for i,popi in enumerate(envdf.columns):
        for j,popj in enumerate(envdf.columns):
            if i != j:
                dist = abs(envdf.loc[env,popi]-envdf.loc[env,popj])
                envdDict[env].loc[popi,popj] = dist
            else:
                #.loc instead of chained [popi][popj] indexing, which can assign to a temporary copy
                envdDict[env].loc[popi,popj] = 0

In [0]:
envdDict['Ann-ppt']

In [0]:
#only create the directory when it is missing; a bare os.makedirs raises OSError on re-run
if not op.exists('/home/lindb/wbp/distance_matrices/'):
    os.makedirs('/home/lindb/wbp/distance_matrices/')

In [0]:
for env in envdDict.keys():
    print env
    filE = '/home/lindb/wbp/distance_matrices/%s_dist_symm.txt' % env
    envdDict[env].to_csv(filE,header=True,index=True,sep='\t')

In [0]:
#mantel vs individual envs
#called through the skbio package, like the other Mantel tests on shiftDF in this notebook,
#because the bare name `mantel` has not been imported yet when this cell runs
for env in envdDict:
    mant = skbio.stats.distance.mantel(shiftDF,envdDict[env],permutations = 9999)
    print("%s %s" % (env,mant))

In [0]:
mant

In [0]:
#lower-triangle (row index > column index) 'Ann-ppt' distances, plotted against shifts
annppt = envdDict['Ann-ppt']
appt = [annppt.loc[popi,popj]
        for i,popi in enumerate(annppt.index)
        for j,popj in enumerate(annppt.columns)
        if i>j]
plt.scatter(shifts,appt)

In [0]:
envdDict.keys()

In [0]:
#lower-triangle (row index > column index) 'Lon' distances, plotted against shifts
londist = envdDict['Lon']
longs = [londist.loc[popi,popj]
         for i,popi in enumerate(londist.index)
         for j,popj in enumerate(londist.columns)
         if i>j]
plt.scatter(shifts,longs)

In [0]:
pearsonr(appt,shifts)

In [0]:
#lower-triangle (row index > column index) 'Max-rad-input' distances vs shifts
raddist = envdDict['Max-rad-input']
prmi = [raddist.loc[popi,popj]
        for i,popi in enumerate(raddist.index)
        for j,popj in enumerate(raddist.columns)
        if i>j]
plt.scatter(prmi,shifts)

In [0]:
#lower-triangle (row index > column index) 'Sand' distances vs shifts
sanddist = envdDict['Sand']
sand = [sanddist.loc[popi,popj]
        for i,popi in enumerate(sanddist.index)
        for j,popj in enumerate(sanddist.columns)
        if i>j]
plt.scatter(sand,shifts)

In [0]:
#lower-triangle (row index > column index) 'Silt' distances vs shifts
siltdist = envdDict['Silt']
silt = [siltdist.loc[popi,popj]
        for i,popi in enumerate(siltdist.index)
        for j,popj in enumerate(siltdist.columns)
        if i>j]
plt.scatter(silt,shifts)

In [0]:
#lower-triangle (row index > column index) 'WC3rdbar' distances vs shifts
wcdist = envdDict['WC3rdbar']
WC3rdbar = [wcdist.loc[popi,popj]
            for i,popi in enumerate(wcdist.index)
            for j,popj in enumerate(wcdist.columns)
            if i>j]
plt.scatter(WC3rdbar,shifts)

In [0]:
#test against phenotypic distances
# Read every file in DIR as a tab-delimited matrix, keyed by the part of the
# file name before the first underscore; the last one read is displayed.
DIR = '/home/lindb/wbp/gemma/distance_matrices'
files = [op.join(DIR,fname) for fname in ls(DIR)]
phendf = OrderedDict()
for f in files:
    pheno = op.basename(f).split("_")[0]
    phendf[pheno] = pd.read_csv(f, sep='\t', index_col=0, header=0)
phendf[pheno]

In [0]:
from skbio.stats.distance import mantel

In [0]:
shiftDF

In [0]:
#get a shiftDF with only those pops in common gardens
pops = ['Dicks_Pass','Freel_Peak','Little_Round_Top','Mt_Rose_Ophir','Rifle_Peak','Snow_Valley_Peak']
# keep the listed populations on the column axis first, then on the row axis
cgcols = [col for col in shiftDF.columns if col in pops]
cgshiftDF = pd.DataFrame(shiftDF[cgcols])
cgrows = cgshiftDF.index.isin(pops)
cgshiftDF = cgshiftDF[cgrows]
cgshiftDF

In [0]:
#test against phenotypic distances
for pheno in phendf:
    mant = mantel(cgshiftDF,phendf[pheno],permutations=9999)
    if mant[1] <= 0.05:
        print pheno,mant

### using 95th percentile Dij

In [0]:
#where is the 95th percentile for 110 choose 2 Dij?
math.floor(5995*0.95)

In [0]:
#do pairwise pops for empirical SNPs IDed by OutFLANK and calculate median Dij for each pop pair
n5thshiftDict = OrderedDict() 
kcount = 0
for m,popm in enumerate(impMAF.columns):
    print popm
    n5thshiftDict[popm] = OrderedDict()
    for l,popl in enumerate(impMAF.columns):
        if m>l:
            dijlist = []
            for i,locusi in enumerate(outliersnps):
                for j,locusj in enumerate(outliersnps):
                    if i > j: #i=row, j=col : lower triangle 
                        sums =0
                        kcount += 1
                        for popk in [popm,popl]:
                            qik = impMAF.loc[locusi,popk] #get locusi maf for pop k
                            qjk = impMAF.loc[locusj,popk] #get locusj maf for pop k
                            nk = popDict[popk]            #N individuals in pop k
                            
                            globN = 2*(popDict[popm]+popDict[popl]) # number of alleles across 2 pops
                            
                            #get global mafs
                            fqim = impMAF.loc[locusi,popm]        #minor allele locusi freq  in popm
                            nqim = round(fqim*2*popDict[popm])    #minor allele locusi count in popm
                            fqil = impMAF.loc[locusi,popl]        #minor allele locusi freq  in popl
                            nqil = round(fqil*2*popDict[popl])    #minor allele locusi count in popl
                            
                            fqjm = impMAF.loc[locusj,popm]        #minor allele locusj freq  in popm
                            nqjm = round(fqjm*2*popDict[popm])    #minor allele locusj count in popm
                            fqjl = impMAF.loc[locusj,popl]        #minor allele locusj freq  in popl
                            nqjl = round(fqjl*2*popDict[popl])    #minor allele locusj count in popl
                            
                            qi = (nqim+nqil)/((2*popDict[popm])+(2*popDict[popl])) #global maf locusi
                            qj = (nqjm+nqjl)/((2*popDict[popm])+(2*popDict[popl])) #global maf locusj

                            sums += (nk/(popDict[popm]+popDict[popl]))*((qik*qjk)-(qi*qj))

                        dijlist.append(sums) #each pairwise pop comparison has a matrix of Dij
                    else:
                        pass #no redundancies, no diagonal.
            n5thshiftDict[popm][popl] = sorted([abs(d) for d in dijlist])[5695]
        else:
            n5thshiftDict[popm][popl] = np.nan #no redundancies,no diagonal. will be faster to reflect across diag later
#    if kcount > 1:
#        break

In [0]:
# Dump the nested dict as a tab-delimited table: a header of column pops
# (no index label), then one row per pop prefixed by its name.
filE = '/home/lindb/wbp/OutFLANK/freqshifts/pop_pairwise_95th_dij.text'
with open(filE,'w') as o:
    key0 = n5thshiftDict.keys()[0]
    o.write('\t'.join(n5thshiftDict[key0].keys()) + '\n')
    for popk,rowvals in n5thshiftDict.items():
        o.write(str(popk)+'\t'+'\t'.join([str(d) for d in rowvals.values()])+'\n')

In [0]:
# Read the lower-triangle table back and make it a full symmetric matrix:
# zero on the diagonal, NaN cells filled from their mirror cell.
filE = '/home/lindb/wbp/OutFLANK/freqshifts/pop_pairwise_95th_dij.text'
n5thshiftDF = pd.read_csv(filE,header=0,index_col=0,sep='\t')
for i,popi in enumerate(n5thshiftDF.index):
    for j,popj in enumerate(n5thshiftDF.columns):
        if i == j:
            n5thshiftDF.loc[popi,popj] = 0
            continue
        if math.isnan(n5thshiftDF.loc[popi,popj]):
            n5thshiftDF.loc[popi,popj] = n5thshiftDF.loc[popj,popi]
n5thshiftDF

In [0]:
for env in envdDict.keys():
    print env,skbio.stats.distance.mantel(n5thshiftDF,envdDict[env],permutations=9999)

### using max abs Dij values instead of median

In [0]:
#do pairwise pops for empirical SNPs IDed by OutFLANK and calculate median Dij for each pop pair
maxshiftDict = OrderedDict() 
kcount = 0
for m,popm in enumerate(impMAF.columns):
    print popm
    maxshiftDict[popm] = OrderedDict()
    for l,popl in enumerate(impMAF.columns):
        if m>l:
            dijlist = []
            for i,locusi in enumerate(outliersnps):
                for j,locusj in enumerate(outliersnps):
                    if i > j: #i=row, j=col : lower triangle 
                        sums =0
                        kcount += 1
                        for popk in [popm,popl]:
                            qik = impMAF.loc[locusi,popk] #get locusi maf for pop k
                            qjk = impMAF.loc[locusj,popk] #get locusj maf for pop k
                            nk = popDict[popk]            #N individuals in pop k
                            
                            globN = 2*(popDict[popm]+popDict[popl]) # number of alleles across 2 pops
                            
                            #get global mafs
                            fqim = impMAF.loc[locusi,popm]        #minor allele locusi freq  in popm
                            nqim = round(fqim*2*popDict[popm])    #minor allele locusi count in popm
                            fqil = impMAF.loc[locusi,popl]        #minor allele locusi freq  in popl
                            nqil = round(fqil*2*popDict[popl])    #minor allele locusi count in popl
                            
                            fqjm = impMAF.loc[locusj,popm]        #minor allele locusj freq  in popm
                            nqjm = round(fqjm*2*popDict[popm])    #minor allele locusj count in popm
                            fqjl = impMAF.loc[locusj,popl]        #minor allele locusj freq  in popl
                            nqjl = round(fqjl*2*popDict[popl])    #minor allele locusj count in popl
                            
                            qi = (nqim+nqil)/((2*popDict[popm])+(2*popDict[popl])) #global maf locusi
                            qj = (nqjm+nqjl)/((2*popDict[popm])+(2*popDict[popl])) #global maf locusj

                            sums += (nk/(popDict[popm]+popDict[popl]))*((qik*qjk)-(qi*qj))

                        dijlist.append(sums) #each pairwise pop comparison has a matrix of Dij
                    else:
                        pass #no redundancies, no diagonal.
            maxshiftDict[popm][popl] = max([abs(d) for d in dijlist])
        else:
            maxshiftDict[popm][popl] = np.nan #no redundancies,no diagonal. will be faster to reflect across diag later
#    if kcount > 1:
#        break

In [0]:
# Dump the nested dict as a tab-delimited table: a header of column pops
# (no index label), then one row per pop prefixed by its name.
filE = '/home/lindb/wbp/OutFLANK/freqshifts/pop_pairwise_max_dij.text'
with open(filE,'w') as o:
    key0 = maxshiftDict.keys()[0]
    o.write('\t'.join(maxshiftDict[key0].keys()) + '\n')
    for popk,rowvals in maxshiftDict.items():
        o.write(str(popk)+'\t'+'\t'.join([str(d) for d in rowvals.values()])+'\n')

In [0]:
# Read the lower-triangle table back and make it a full symmetric matrix:
# zero on the diagonal, NaN cells filled from their mirror cell.
filE = '/home/lindb/wbp/OutFLANK/freqshifts/pop_pairwise_max_dij.text'
maxshiftDF = pd.read_csv(filE,header=0,index_col=0,sep='\t')
for i,popi in enumerate(maxshiftDF.index):
    for j,popj in enumerate(maxshiftDF.columns):
        if i == j:
            maxshiftDF.loc[popi,popj] = 0
            continue
        if math.isnan(maxshiftDF.loc[popi,popj]):
            maxshiftDF.loc[popi,popj] = maxshiftDF.loc[popj,popi]
maxshiftDF

In [0]:
for env in envdDict.keys():
    print env,skbio.stats.distance.mantel(maxshiftDF,envdDict[env],permutations=9999)

In [0]:
# Flatten the strict lower triangle of maxshiftDF (row by row) into a list.
maxs = [maxshiftDF.loc[popi,popj]
        for i,popi in enumerate(maxshiftDF.index)
        for j,popj in enumerate(maxshiftDF.columns)
        if i > j]
len(maxs)

In [0]:
plt.scatter(maxs,appt)

### using raw Dij values instead of absolute values

In [0]:
#do pairwise pops for empirical SNPs IDed by OutFLANK and calculate median Dij for each pop pair
rawshiftDict = OrderedDict() 
kcount = 0
for m,popm in enumerate(impMAF.columns):
    print popm
    rawshiftDict[popm] = OrderedDict()
    for l,popl in enumerate(impMAF.columns):
        if m>l:
            dijlist = []
            for i,locusi in enumerate(outliersnps):
                for j,locusj in enumerate(outliersnps):
                    if i > j: #i=row, j=col : lower triangle 
                        sums =0
                        kcount += 1
                        for popk in [popm,popl]:
                            qik = impMAF.loc[locusi,popk] #get locusi maf for pop k
                            qjk = impMAF.loc[locusj,popk] #get locusj maf for pop k
                            nk = popDict[popk]            #N individuals in pop k
                            
                            globN = 2*(popDict[popm]+popDict[popl]) # number of alleles across 2 pops
                            
                            #get global mafs
                            fqim = impMAF.loc[locusi,popm]        #minor allele locusi freq  in popm
                            nqim = round(fqim*2*popDict[popm])    #minor allele locusi count in popm
                            fqil = impMAF.loc[locusi,popl]        #minor allele locusi freq  in popl
                            nqil = round(fqil*2*popDict[popl])    #minor allele locusi count in popl
                            
                            fqjm = impMAF.loc[locusj,popm]        #minor allele locusj freq  in popm
                            nqjm = round(fqjm*2*popDict[popm])    #minor allele locusj count in popm
                            fqjl = impMAF.loc[locusj,popl]        #minor allele locusj freq  in popl
                            nqjl = round(fqjl*2*popDict[popl])    #minor allele locusj count in popl
                            
                            qi = (nqim+nqil)/((2*popDict[popm])+(2*popDict[popl])) #global maf locusi
                            qj = (nqjm+nqjl)/((2*popDict[popm])+(2*popDict[popl])) #global maf locusj

                            sums += (nk/(popDict[popm]+popDict[popl]))*((qik*qjk)-(qi*qj))

                        dijlist.append(sums) #each pairwise pop comparison has a matrix of Dij
                    else:
                        pass #no redundancies, no diagonal. will be faster to reflect across diagonal later on
            rawshiftDict[popm][popl] = np.median([d for d in dijlist])
        else:
            rawshiftDict[popm][popl] = np.nan #no redundancies,no diagonal. will be faster to reflect across diag later
#    if kcount > 1:
#        break

In [0]:
#write to file
# Tab-delimited table: a header of column pops (no index label), then one row
# per pop prefixed by its name.
filE = '/home/lindb/wbp/OutFLANK/freqshifts/pop_pairwise_raw_median_dij.text'
with open(filE,'w') as o:
    key0 = rawshiftDict.keys()[0]
    o.write('\t'.join(rawshiftDict[key0].keys()) + '\n')
    for popk,rowvals in rawshiftDict.items():
        o.write(str(popk)+'\t'+'\t'.join([str(d) for d in rowvals.values()])+'\n')

In [0]:
# Read the lower-triangle table back and make it a full symmetric matrix:
# zero on the diagonal, NaN cells filled from their mirror cell.
filE = '/home/lindb/wbp/OutFLANK/freqshifts/pop_pairwise_raw_median_dij.text'
rawshiftDF = pd.read_csv(filE,header=0,index_col=0,sep='\t')
for i,popi in enumerate(rawshiftDF.index):
    for j,popj in enumerate(rawshiftDF.columns):
        if i == j:
            rawshiftDF.loc[popi,popj] = 0
            continue
        if math.isnan(rawshiftDF.loc[popi,popj]):
            rawshiftDF.loc[popi,popj] = rawshiftDF.loc[popj,popi]
rawshiftDF

In [0]:
for env in envdDict.keys():
    print env,skbio.stats.distance.mantel(rawshiftDF,envdDict[env],permutations=9999)

In [0]:
#below scripts aren't used

# visualizing allele frequency shifts

In [0]:
# Largest non-NaN value found in mirshiftDF over the columns named in shiftDF
# (stays at the -1000 sentinel if every column is empty or all-NaN).
maxx = -1000
for col in shiftDF.columns:
    finite = [x for x in mirshiftDF[col].tolist() if not math.isnan(x)]
    if finite:
        maxx = max(maxx, max(finite))
maxx

In [0]:
#same geographic arrangement around lake tahoe
pops = ['Heavenly',
        'Freel_Peak',
        'Little_Round_Top',
        'Dicks_Pass',
        'West_Shore_Peaks',
        'Rifle_Peak',
        'Mt_Rose_Ophir',
        'Snow_Valley_Peak']

In [0]:
# All index pairs (i,j) with i > j over `pops`, plus the matching name pairs.
lst = [(i,j) for i in range(len(pops)) for j in range(i)]
comps = [(pops[i],pops[j]) for i,j in lst]
lst

In [0]:
comps

In [0]:
# Build a graph from the index pairs in `lst`, name/label its vertices with
# `pops`, and compute a circular layout for plotting.
# NOTE(review): `Graph` is not imported in the visible part of this notebook —
# presumably igraph's Graph; confirm the import exists before running.
g = Graph(lst)
g.vs["name"] = pops
g.vs["label"] = g.vs["name"]
layout = g.layout_circle()

In [0]:
comps[0]

In [0]:
# Plot `g` with the circular layout; each edge width is 100x the mirshiftDF
# entry for the corresponding population pair in `comps`.
# NOTE(review): `plot` is not imported in the visible part of this notebook — confirm.
style = {}
#style["edge_width"] = [(4*(shiftDF.loc[comps[i][0],comps[i][1]]/maxx)) for i in range(len(comps))]
style["edge_width"] = [100*(mirshiftDF.loc[comps[i][0],comps[i][1]]) for i in range(len(comps))]
style["layout"] = layout
plot(g,**style)

In [0]:
shiftDict.keys()

In [0]:
shiftDF.index

In [0]:
# Build the jgraph input: one node per population and one edge per unordered
# population pair, sized by shiftDF.loc[target, source] / maxx.
# Edges are generated from the node list instead of being typed out by hand;
# the hand-written version sized the Dicks_Pass -> Mt_Rose_Ophir edge with the
# Little_Round_Top value (copy-paste error).
nodenames = ['Dicks_Pass',
             'Freel_Peak',
             'Heavenly',
             'Little_Round_Top',
             'Mt_Rose_Ophir',
             'Rifle_Peak',
             'Snow_Valley_Peak',
             'West_Shore_Peaks']

graph = {
    'nodes': dict((name, {}) for name in nodenames),
    'edges': [],
}
for idx,source in enumerate(nodenames):
    for target in nodenames[idx+1:]:
        graph['edges'].append({'source': source,
                               'target': target,
                               'size': shiftDF.loc[target,source]/maxx})

jgraph.draw(graph)

In [0]:
# NOTE(review): if the cells are run in order, `graph` is the plain dict built
# for jgraph in the preceding cell, which has no `.layout` method — this cell
# looks like a leftover experiment; confirm whether it should use `g` or be removed.
layout = graph.layout("circle")
plot(graph,layout=layout)

In [0]:
?jgraph

# old way of calculating covariance using random snps

In [0]:
#this section was done before using H_exp (above)
#the folders here were moved from /covariances to /covariances_unweighted_allele_freqs

In [0]:
#do pairwise to get D
dDict = OrderedDict()
icount = 0
for i,locusi in enumerate(outliersnps):
    dDict[locusi] = OrderedDict()
    for j,locusj in enumerate(outliersnps):
        if i > j: #i=row, j=col : lower triangle
            asums = 0
            bsums = 0
            csums = 0
            for pop in impMAF.columns:
                p_i = impMAF.loc[locusi,pop]
                p_j = impMAF.loc[locusj,pop]

                #calc "a"
                product = p_i*p_j
                asums = asums + product

                #calc "b"
                bsums = bsums + p_i

                #calc "c" 
                csums = csums + p_j

            a = asums/len(impMAF.columns)
            b = bsums/len(impMAF.columns)
            c = csums/len(impMAF.columns)

            d = a - (b*c)
            dDict[locusi][locusj] = d
        else:
            dDict[locusi][locusj] = np.nan
    icount += 1
    if icount % 10 == 0:
        print icount

In [0]:
#write out the file
# Tab-delimited: a header of column loci (no index label), then one row per
# locus prefixed by its name. The parent directory is created if missing.
rowcount = 0
filE = '/home/lindb/wbp/OutFLANK/covariances/dvals/imputed_dvals.txt'
outdir = opdirname(filE)
if not opexists(outdir):
    os.makedirs(outdir)
with open(filE,'w') as o:
    key0 = dDict.keys()[0]
    o.write('\t'.join(dDict[key0].keys()) + '\n')
    for locusi,drow in dDict.items():
        o.write(str(locusi)+'\t'+'\t'.join([str(x) for x in drow.values()]) + '\n')

In [0]:
dvals = pd.read_csv(filE,header=0,index_col=0,sep="\t")
dvals.head()

In [0]:
len(loci)

In [0]:
#get a list of snps not IDed as outliers
bucket = set(loci) - set(outliersnps)
len(bucket)

In [0]:
#save the list of snps not IDed as outliers
filE = '/home/lindb/wbp/OutFLANK/covariances/drawbuckets/outflank_bucket.txt'
snpbucket = pd.DataFrame(list(bucket))
outdir = opdirname(filE)
if not opexists(outdir):
    os.makedirs(outdir)
snpbucket.to_csv(filE, sep="\t", index=True, header=True)

In [0]:
snpbucket = [x for x in snpbucket[0].tolist()]

In [0]:
#make 20 x 50 = 1000 random SNP sets, each the same size as outliersnps,
#and write each set to its own one-column file
# NOTE(review): the draws are unseeded, so re-running this cell produces
# different SNP sets — consider seeding `random` if reproducibility matters.
DIR = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/randsnps'
if not opexists(DIR):   # loop-invariant, so check once up front
    os.makedirs(DIR)
for i in range(20):
    for j in range(50):
        snps = random.sample(snpbucket,len(outliersnps))           #select random snps
        filE = opjoin(DIR,"outflank_%s_%s_randsnps.txt" % (str(i).zfill(2),str(j).zfill(2)))
        df = pd.DataFrame(snps)
        df.to_csv(filE,header=False,index=False,sep="\t")

In [0]:
df = pd.read_csv(filE,header=None,sep="\t")
df.head()

In [0]:
filE

In [0]:
#make .py files
# Write one stand-alone script per random SNP set (20 x 50). Each script reads
# its SNP list, computes the lower-triangle D matrix from imputed_MAF.txt, and
# writes it under 0outfiles/.
# The generated script uses os.path.dirname/os.path.exists: it only imports
# `os`, so the notebook-level aliases opdirname/opexists would be undefined
# there and raise NameError after the computation had finished.
template = '''from __future__ import division
import sys, os, time
import pandas as pd
import numpy as np
import vcf
import random
from collections import OrderedDict
from operator import itemgetter
from collections import defaultdict
import math
import shutil

filE= '/home/lindb/wbp/OutFLANK/covariances/randmatrices/randsnps/outflank_%s_%s_randsnps.txt'
df = pd.read_csv(filE,header=None,sep="\\t")
snps = df[0].tolist()

newdf = pd.read_csv('/home/lindb/wbp/OutFLANK/imputed_MAF.txt',header=0,index_col=0,sep="\\t")

icount = 0
rDict = OrderedDict()
for i,locusi in enumerate(snps):
    rDict[locusi] = OrderedDict()
    for j,locusj in enumerate(snps):
        if i > j: #i=row, j=col : lower tri
            asums = 0
            bsums = 0
            csums = 0
            for pop in newdf.columns:
                p_i = newdf.loc[locusi,pop]
                p_j = newdf.loc[locusj,pop]

                #calc "a"
                product = p_i*p_j
                asums = asums + product

                #calc "b"
                bsums = bsums + p_i

                #calc "c"
                csums = csums + p_j

            a = asums/len(newdf.columns)
            b = bsums/len(newdf.columns)
            c = csums/len(newdf.columns)

            d = a - (b*c)
            rDict[locusi][locusj] = d
        else:
            rDict[locusi][locusj] = np.nan
    icount += 1
    if icount %% 10 == 0:
        print icount

filE = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0outfiles/outflank_%s_%s_imputedDVALS.txt'
DIR = os.path.dirname(filE)
if not os.path.exists(DIR):
    os.makedirs(DIR)
with open(filE,'w') as o:
    line = '\\t'.join(snps) + str('\\n')
    o.write("%%s" %% line)
    for locusi in rDict.keys():
        line = str(locusi) + '\\t' + '\\t'.join([str(x) for x in rDict[locusi].values()]) + str('\\n')
        o.write("%%s" %% line)

'''
for i in range(20):
    for j in range(50):
        tag = (str(i).zfill(2),str(j).zfill(2))
        # the template has two (i,j) slots: the input file and the output file
        text = template % (tag + tag)

        filE = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0pyfiles/outflank_%s_%s_imputed.py' % tag
        DIR = opdirname(filE)
        if not opexists(DIR):
            os.makedirs(DIR)
        # the with-block closes the file; no explicit close() needed
        with open(filE,'w') as o:
            o.write(text)

In [0]:
DIR = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0pyfiles/'
files = ls(DIR)
files = [opjoin(DIR,f) for f in files]
len(files)

In [0]:
files[0]

In [0]:
#make <=198 sh files to include imputed py files too (my qsub limit is 200, I have 2 things going at the moment
fcount =0
shcount =0
tcount =0
newsh = True
for f in sorted(files):
    if newsh == True:
        text = '''#!/bin/bash
#$ -N run%s 
#$ -V
#$ -j y
#$ -cwd
''' % str(shcount).zfill(3)
    newtext = '''
cd %s
python %s
''' % (opdirname(f),opbasename(f))
    text = text + newtext
    
    fcount += 1
    tcount += 1
    newsh = False
    if (fcount == 6) or (tcount == 1000):
        newsh = True
        fcount =0
        filE = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0runfiles/%s_run.sh' % str(shcount).zfill(3)
        DIR = opdirname(filE)
        if not opexists(DIR):
            os.makedirs(DIR)
        with open(filE,'w') as o:
            o.write(text)
        o.close()
        
        shcount += 1
    if '%max-rad-input_00_00_imputed.py' in f:
        print "shcount",shcount

In [0]:
opdirname(f)

In [0]:
#check one to make sure it worked
df = pd.read_csv('/home/lindb/wbp/OutFLANK/covariances/randmatrices/0outfiles/outflank_18_46_imputedDVALS.txt',header=0,
                index_col = 0, sep="\t")
df.head()

# place observed median in distribution of medians made from random SNPs

In [0]:
DF.head()

In [0]:
# get observed dvals
DF = pd.read_csv('/home/lindb/wbp/OutFLANK/covariances/dvals/imputed_dvals.txt',header=0,index_col=0,sep="\t")
dvals = []
for i,row in enumerate(DF.index):
    for j,col in enumerate(DF.columns):
        if i > j:
            dvals.append(DF.loc[row,col])
            
DIR = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0outfiles'
files = ls(DIR)
files = [f for f in files if f.startswith('outflank')]

pvals = []
fcount = 0
allvals = []
medvals = []
for f in files:
    df = pd.read_csv(opjoin(DIR,f),header=0,index_col=0,sep="\t")
    rvals = [] #random dij values
    for lst in df.values.tolist():
        for x in lst:
            if math.isnan(x) == False:
                rvals.append(x)
                allvals.append(x)
    
    medvals.append(np.median([abs(x) for x in rvals]))
    pvals.append(ks_2samp(rvals,dvals)[1])
    fcount += 1
    if fcount % 10 == 0:
        print fcount

filE = '/home/lindb/wbp/OutFLANK/covariances/randmatrices/0OBSpvals/outflank_imputed_observedpvalues.txt'
pvals = pd.DataFrame(pvals)
pvals.to_csv(filE,header=None,index=False,sep="\t")

In [0]:
sorts = sorted([abs(x) for x in medvals])

In [0]:
len(sorts)*.95

In [0]:
sorts[950]

In [0]:
max(sorts)

In [0]:
plt.hist(medvals)[2] # this is the 1000 median values

In [0]:
med = np.median(dvals)

In [0]:
n5th = sorts[950]

In [0]:
#this is the distribution of median values of random SNPs
#red line is the 95th percentile
plt.hist(sorts)[2] 
plt.axvline(x=n5th,c="red",linewidth=2,zorder=0) 

In [0]:
#this is the distribution of observed Dvals
#red line is the median value
fig = plt.hist([abs(x) for x in dvals])[2]
plt.axvline(x=med,c="red",linewidth=5,zorder=0)

In [0]:
np.median(dvals)

In [0]:
np.median(dvals) > max(sorts)

In [0]:
len(dvals)

In [0]:
np.median(dvals)/max(sorts)

In [0]:
len(dvals)

In [0]:
#below what percentile of observed OBSdvals are the values less than the maximum randranddvals
for i,med in enumerate(sorted(dvals)):
    if not med < max(sorts):
        print i,i/len(dvals)
        break

In [0]:
sorts[950]

In [0]:
for i,med in enumerate(sorted(dvals)):
    if not med < sorts[950]:
        print i,i/len(dvals)
        break

In [0]:
len(loci),len(snpbucket)

# ignore: effect distributions - effects pulled from 11_GEMMA.ipynb

In [0]:
from IPython.display import display

In [0]:
combined_dfs = {}
DIR = '/home/lindb/wbp/gemma/infiles/bslmm/output/'
for f in [op.join(DIR,f) for f in ls(DIR) if 'combined_df.txt' in f]:
    pheno = op.basename(f).split("_")[0]
    combined_dfs[pheno] = pd.read_csv(f,header=0,index_col=0,sep='\t')
    print pheno
    display(combined_dfs[pheno].head())

In [0]:
#get a list of effects across phenotypes for each of the outlier snps
# (list name in outfx, column name in the combined tables)
effectcols = [('alpha','alpha_hmean'),
              ('beta','beta_hmean'),
              ('total','total_effect'),
              ('gamma','gamma_hmean')]
outfx = OrderedDict()
for snp in outliersnps:
    snp = str(snp)
    outfx[snp] = OrderedDict((key,[]) for key,col in effectcols)
    for pheno in combined_dfs:
        for key,col in effectcols:
            outfx[snp][key].append(combined_dfs[pheno].loc[snp,col])

In [0]:
#get an average for each effect for each snp
for snp in outfx:
    fx = outfx[snp]
    fx['mean alpha'] = np.mean([float(val) for val in fx['alpha']])
    fx['mean beta']  = np.mean(list(fx['beta']))
    fx['mean total'] = np.mean(list(fx['total']))
    fx['mean gamma'] = np.mean(list(fx['gamma']))

In [0]:
outfx['NODE_1001690_length_90_cov_2.000000_37']

In [0]:
#put the means into a list
# The four '... dist' keys live in outfx next to the per-snp entries; they are
# the only keys containing a space, which is how the loop skips them.
for distkey in ['total dist','gamma dist','alpha dist','beta dist']:
    outfx[distkey] = []
for snp in outfx:
    if ' ' in snp:
        continue
    outfx['alpha dist'].append(outfx[snp]['mean alpha'])
    outfx['beta dist'].append(outfx[snp]['mean beta'])
    outfx['gamma dist'].append(outfx[snp]['mean gamma'])
    outfx['total dist'].append(outfx[snp]['mean total'])

In [0]:
# Overlaid histograms of the four mean-effect distributions for the outlier
# snps, on shared bins, saved as a single-page PDF. Each legend entry shows the
# rounded median of that distribution.
with PdfPages('/home/lindb/wbp/OutFLANK/figures/effects_4_criteria.pdf') as pdf:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    a = outfx['alpha dist']
    b = outfx['beta dist']
    c = outfx['gamma dist']
    d = outfx['total dist']
    # common bin edges computed from all four distributions together
    bins=np.histogram(np.hstack((a,b,c,d)), bins=40)[1]
    for vals,effect,ndigits in [(a,'alpha',8),(b,'beta',5),(c,'gamma',5),(d,'total',5)]:
        ax.hist(vals,label='%s (%s)' % (effect,str(round(np.median(vals),ndigits))),alpha=0.5,bins=bins)

    plt.legend()
    for side in ['right','top']:
        ax.spines[side].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_xlabel('effect size')
    ax.set_ylabel('count')
    ax.locator_params(axis='x',nbins=4)

    fig.set_size_inches(7,5)
    pdf.savefig(fig,bbox_inches='tight')


In [0]:
plt.hist(a,label='alpha dist',alpha=0.5,bins=bins)[2]

In [0]:
plt.hist(b,label='beta dist',alpha=0.5,bins=bins)[2]

In [0]:
plt.hist(c,label='gamma dist',alpha=0.5,bins=bins)[2]

In [0]:
plt.hist(d,label='total dist',alpha=0.5,bins=bins)[2]

In [0]:
nonoutlier = {}
for pheno in combined_dfs:
    loci = combined_dfs[pheno].index.tolist()
    nonoutlier[pheno] = set(loci)-set(outliersnps)
    print len(nonoutlier[pheno]),len(loci),len(loci)-110

In [0]:
len(H.index)

In [0]:
nonoutliers = set(H.index.tolist()) - set(outliersnps)
len(nonoutliers)

In [0]:
snpcount = 0
nonfx = OrderedDict()
for snp in nonoutliers:
    if not snp in nonfx.keys():
        nonfx[snp] = OrderedDict()
        nonfx[snp]['alpha'] = []
        nonfx[snp]['beta']  = []
        nonfx[snp]['total'] = []
        nonfx[snp]['gamma'] = []

    [nonfx[snp]['alpha'].append(combined_dfs[pheno].loc[snp,'alpha_hmean']) for pheno in combined_dfs if snp in combined_dfs[pheno].index]

    [nonfx[snp]['beta'].append(combined_dfs[pheno].loc[snp,'beta_hmean']) for pheno in combined_dfs if snp in combined_dfs[pheno].index]

    [nonfx[snp]['total'].append(combined_dfs[pheno].loc[snp,'total_effect']) for pheno in combined_dfs if snp in combined_dfs[pheno].index]

    [nonfx[snp]['gamma'].append(combined_dfs[pheno].loc[snp,'gamma_hmean']) for pheno in combined_dfs if snp in combined_dfs[pheno].index]
    
    snpcount += 1
    if snpcount % 1000 == 0:
        print snpcount

In [0]:
#write out the file since it took so GD long to make
import pickle
with open('/home/lindb/wbp/OutFLANK/nonoutlier_effects.pkl',"wb") as o:
    pickle.dump(nonfx, o, pickle.HIGHEST_PROTOCOL)

In [0]:
#get an average for each effect for each snp
for snp in nonfx:
    fx = nonfx[snp]
    fx['mean alpha'] = np.mean([float(val) for val in fx['alpha']])
    fx['mean beta']  = np.mean(list(fx['beta']))
    fx['mean total'] = np.mean(list(fx['total']))
    fx['mean gamma'] = np.mean(list(fx['gamma']))

In [0]:
# Gather the per-SNP means into one list per effect, stored in nonfx itself
# under '<effect> dist'. These aggregate keys contain a space, which is how
# they are told apart from SNP keys while iterating.
for effect in ('total', 'gamma', 'alpha', 'beta'):
    nonfx['%s dist' % effect] = []
for snp in nonfx:
    if ' ' in snp:
        continue
    snp_means = nonfx[snp]
    for effect in ('alpha', 'beta', 'gamma', 'total'):
        nonfx['%s dist' % effect].append(snp_means['mean %s' % effect])

In [0]:
# Overlay histograms of the four non-outlier effect distributions on shared
# bins and save the figure to a one-page PDF. Each legend entry shows the
# median of its series in parentheses.
with PdfPages('/home/lindb/wbp/OutFLANK/figures/nonoutlier_effects_4_criteria.pdf') as pdf:
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # drop NaN means before plotting
    a = [A for A in nonfx['alpha dist'] if not math.isnan(A)]
    b = [B for B in nonfx['beta dist'] if not math.isnan(B)]
    c = [C for C in nonfx['gamma dist'] if not math.isnan(C)]
    d = [D for D in nonfx['total dist'] if not math.isnan(D)]

    # 40 bin edges computed over all four series so the histograms line up
    bins = np.histogram(np.hstack((a, b, c, d)), bins=40)[1]

    # (values, legend name, digits the median is rounded to)
    for values, name, digits in ((a, 'alpha', 8),
                                 (b, 'beta', 5),
                                 (c, 'gamma', 5),
                                 (d, 'total', 5)):
        ax.hist(values,
                label='%s (%s)' % (name, str(round(np.median(values), digits))),
                alpha=0.5,
                bins=bins)

    plt.legend()

    # keep only the left and bottom axes, with ticks on those sides
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_xlabel('effect size')
    ax.set_ylabel('count')
    ax.locator_params(axis='x', nbins=4)

    fig.set_size_inches(7, 5)
    pdf.savefig(fig, bbox_inches='tight')


In [0]:
# Largest value in `b`, the NaN-filtered 'beta dist' list built in the
# plotting cell above.
max(b)

In [0]:
from scipy.stats import wilcoxon

In [0]:
for crit in ['alpha dist','beta dist','gamma dist','total dist']:
    print crit
    o = [B for B in outfx[crit] if math.isnan(B)==False]
    n = [A for A in nonfx[crit] if math.isnan(A)==False]
    print "o=",np.median(o)
    print "n=",np.median(n)

In [0]:
# Sizes of the last `o` and `n` lists built by the loop above.
(len(o), len(n))

In [0]:
from scipy.stats.mstats import kruskalwallis

In [0]:
for crit in ['alpha dist','beta dist','gamma dist','total dist']:
    print crit
    o = [B for B in outfx[crit] if math.isnan(B)==False]
    n = [A for A in nonfx[crit] if math.isnan(A)==False]
    k = kruskalwallis(o,n)
    print crit,k
    print "max o",max(o)
    print "max n",max(n)

In [0]:
# Load the OutFLANK results table (tab-separated, first row is the header)
# and preview the first rows.
df = pd.read_csv(
    '/home/lindb/wbp/OutFLANK/OutFlank_results2.txt',
    sep='\t',
    header=0,
)
df.head()

In [0]:
# Smallest value in the 'He' column. Series.min() skips NaN, whereas the
# builtin min() gives an order-dependent answer when NaN is present because
# every comparison against NaN is False.
df['He'].min()

In [0]:
# Rows of the results table whose OutlierFlag equals True, followed by how
# many such rows there are.
df2 = pd.DataFrame(df.loc[df['OutlierFlag'] == True])
len(df2)

In [0]:
df2['He'].describe()