In [0]:
import os
from collections import OrderedDict,Counter
from __future__ import division
import pandas as pd
import numpy as np
import vcf
from operator import itemgetter
import random
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.ticker as mtick
%matplotlib inline

# Make SNPmat

In [0]:
#load the hierfstat-formatted genotypes for all (imputed) SNPs - the table
#that was created earlier as the infile for multilocus FST.
#first row is the header, first column becomes the index
filE = '/home/lindb/eckertlab/wbp/hierfstat/hiertransIDXHEADER_imputed.txt'
imptrans = pd.read_csv(filE, sep="\t", header=0, index_col=0)
imptrans.head(n=5)

In [0]:
#sample -> population lookup table, indexed by the 'samp' column
filE = '/home/lindb/eckertlab/wbp/hierfstat/sampsTOpop.txt'
stp = pd.read_csv(filE, sep="\t", header=0, index_col='samp')
stp.head(n=5)

In [0]:
#join genotypes and population labels on their indexes
#(default merge type, i.e. only index values present in both tables are kept)
merged = imptrans.merge(stp, left_index=True, right_index=True)
#population label first, then every SNP column (names containing 'NODE')
cols = ['pop']
cols.extend(name for name in merged.columns if 'NODE' in name)
merged = merged[cols]
merged.head(n=5)

In [0]:
#number of columns in merged ('pop' + SNP columns)
merged.shape[1]

In [0]:
#recode hierfstat genotype codes to 0/1/2: 11 -> 0, 12 -> 1, 22 -> 2
#NOTE(review): the replacement is applied to every column of merged, including
#'pop' - confirm that no population label is itself 11, 12 or 22
merged = merged.replace({11: 0, 12: 1, 22: 2})
merged.head(n=5)

In [0]:
#make sure there aren't any weird data in the dataframe - like 'NAs' or 'np.nan'
for col in merged.columns:
    uni = np.unique(merged[col].tolist()).tolist()
    if (uni == [0,1]) or (uni == [0,1,2]) or (uni == [1,2]) or (uni == [0,2]):
        1+1
    else:
        print col, np.unique(merged[col].tolist()).tolist()

In [0]:
#SNP matrix: only the locus columns (names containing 'NODE'), no 'pop' column
cols = list(filter(lambda name: 'NODE' in name, merged.columns))
snpmat = merged.loc[:, cols]
snpmat.head(n=5)

In [0]:
#population label of every row of merged, as a one-column frame
#(built from a plain list, so the column is named 0 and the index is 0..n-1)
pop_labels = merged.loc[:, 'pop'].tolist()
pops = pd.DataFrame(pop_labels)
pops.head(n=5)

In [0]:
#write the SNP matrix twice: once with header + sample index (human-readable copy)
#and once bare (no header, no index) - the bare file is the one the R step reads
filE = '/home/lindb/wbp/OutFLANK/imputed/SNPmat_HEADERIDX.txt'
filE2 = '/home/lindb/wbp/OutFLANK/imputed/SNPmat_noHEADERIDX.txt'
snpmat.to_csv(filE,header=True,index=True,sep="\t")
snpmat.to_csv(filE2,header=False,index=False,sep="\t")

#population labels, one per row of the SNP matrix.
#written WITHOUT a header: the R step reads this file with header=F, so a header
#line (the default column name "0") would be read as an extra population label
#and leave popNames one row longer than SNPmat
popfile = '/home/lindb/wbp/OutFLANK/imputed/SNPmat_popNames.txt'
pops.to_csv(popfile,header=False,index=False,sep="\t")

#path for the locus-name file (the file itself is written in the next cell)
locfile = '/home/lindb/wbp/OutFLANK/imputed/SNPmat_locusNames.txt'

In [0]:
#one locus name per line, in SNP-matrix column order (no header, no index)
locfile = '/home/lindb/wbp/OutFLANK/imputed/SNPmat_locusNames.txt'
cols = pd.DataFrame(snpmat.columns)
cols.to_csv(locfile, sep="\t", index=False, header=False)

In [0]:
#number of locus names written
cols.shape[0]

# Put into R
```R
library(OutFLANK)
# fread() is provided by the data.table package (there is no package called "data.frame")
library(data.table)

# genotype matrix: no header, no row names; converted to a plain data.frame
SNPmat = data.frame(fread('/home/lindb/wbp/OutFLANK/imputed/SNPmat_noHEADERIDX.txt',header=F,sep="\t"))

# locus names, one per line, read without a header
locusNames = read.csv('/home/lindb/wbp/OutFLANK/imputed/SNPmat_locusNames.txt',header=F,sep="\t")

# population labels, read without a header - the file must not contain a header line
popNames = read.csv('/home/lindb/wbp/OutFLANK/imputed/SNPmat_popNames.txt',header=F,sep="\t")

FstDataFrame = MakeDiploidFSTMat(SNPmat,locusNames,popNames)

# NumberOfSamples = 8: presumably the number of sampled populations - TODO confirm
out = OutFLANK(FstDataFrame = FstDataFrame,NumberOfSamples = 8)

df = out$results

# keep only the loci that OutFLANK flagged as outliers
outliers = df[which(df$OutlierFlag == 'TRUE'),]

loci = outliers$LocusName

# full per-locus results table
write.table(df,'/home/lindb/wbp/OutFLANK/imputed/OutFLANK_results.txt',row.names=F,col.names=T,sep='\t')

# outlier locus names only; col.names defaults to TRUE, so the single column is headed "x"
write.table(loci,'/home/lindb/wbp/OutFLANK/imputed/OutFLANK_snps.txt',row.names=F,sep='\t')

```

In [0]:
#outlier loci written by the R step (single column with header 'x')
outlier_path = '/home/lindb/wbp/OutFLANK/imputed/OutFLANK_snps.txt'
outliers = pd.read_csv(outlier_path)
outliers

In [0]:
#plain list of the OutFLANK outlier locus names
outlierloci = outliers.loc[:, 'x'].tolist()
outlierloci

# Get bayenv2 loci

In [0]:
#full paths of the files holding SNPs IDed by bayenv2
#(every entry of DIR except those whose name starts with 'imputed')
DIR = '/home/lindb/wbp/bayenv2/Final/imputed/'
files = []
for fname in os.listdir(DIR):
    if fname.startswith('imputed'):
        continue
    files.append(os.path.join(DIR, fname))
len(files)

In [0]:
#make a list of unique snps across envs (first-seen order is preserved).
#each file has a column named after its environment, which is the part of the
#file name before the first underscore
snpLST = []
_seen_bayenv = set()  #set gives O(1) membership tests; the list alone made this quadratic
for f in files:
    env = os.path.basename(f).split("_")[0]
    df = pd.read_csv(f,header=0,sep="\t")
    loci = df[env].tolist()
    for locus in loci:
        if locus not in _seen_bayenv:
            _seen_bayenv.add(locus)
            snpLST.append(locus)

In [0]:
#number of unique loci collected from the bayenv2 files
len(snpLST)

In [0]:
#see if the outlier loci are in the bayenv2 list (snpLST)
for locus in outlierloci:
    if locus in snpLST:
        print locus,'in snpLST'  

# Get piMASS loci

In [0]:
#full paths of the files holding SNPs IDed by piMASS
#(only entries of DIR whose name contains 'imp')
DIR = '/home/lindb/wbp/piMASS/analyses2/7xstringent'
files = []
for fname in os.listdir(DIR):
    if 'imp' in fname:
        files.append(os.path.join(DIR, fname))
len(files)

In [0]:
#make a list of unique snps (column 'rs') across all piMASS files,
#preserving first-seen order
psnpLST = []
_seen_pimass = set()  #set gives O(1) membership tests; the list alone made this quadratic
for f in files:
    df = pd.read_csv(f,header=0,sep="\t")
    loci = df['rs'].tolist()
    for locus in loci:
        if locus not in _seen_pimass:
            _seen_pimass.add(locus)
            psnpLST.append(locus)
len(psnpLST)

In [0]:
#see if the outlier loci are in the bayenv2 list (snpLST)
for locus in outlierloci:
    if locus in psnpLST:
        print locus,'in psnpLST'

# Look at imputed/missing overlap

In [0]:
#OutFLANK outlier loci from the 'missing' run (single column with header 'x')
missing_path = '/home/lindb/wbp/OutFLANK/missing/OutFLANK_snps.txt'
missing = pd.read_csv(missing_path)
misloci = missing.loc[:, 'x'].tolist()
misloci

In [0]:
for mlocus in misloci:
    for ilocus in outlierloci:
        if ilocus == mlocus:
            print ilocus

# Covariance among identified SNPs