In [0]:
import os
from collections import OrderedDict,Counter
from __future__ import division
import pandas as pd
import numpy as np
import vcf
from operator import itemgetter
import random
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import matplotlib.patches as mpatches
%matplotlib inline
import os.listdir as ls
import os.path as op

In [0]:
#get pop info
# Tab-separated lookup table; the first row is used as the header.
test = pd.read_csv('/home/lindb/wbp/WBP_IDS_MATCHED_POP_FINAL_02162016.csv',header=0,sep="\t")
test.head()

In [0]:
# Number of distinct values in the UCD_ID column.
len(np.unique(test['UCD_ID']))

In [0]:
#get pop info
# Build {sample ID: {'pop': ..., 'plot': ...}}. The sample ID is the last three
# characters of UCD_ID followed by the literal suffix 'compiled'.
# NOTE(review): rows whose UCD_ID share the same last three characters map to
# the same key, and the later row overwrites the earlier one.
stpDict = OrderedDict()
for row in test.index:
    ID = test.loc[row,'UCD_ID'][-3:] + 'compiled'
    if ID not in stpDict.keys():
        stpDict[ID] = OrderedDict()
    stpDict[ID]['pop'] = test.loc[row,'Population_ID']
    # Plot_id is stored as a string so it can be tab-joined when written out.
    stpDict[ID]['plot'] = str(test.loc[row,'Plot_id'])
len(stpDict.keys())    

In [0]:
#write out for future use
# Writes one header row plus one tab-separated row per sample (sorted by ID),
# then reads the file straight back as a sanity check.
filE = '/home/lindb/wbp/sampsTOpop.txt'
with open(filE,'w') as o:
    text = '\t'.join(['sampID','pop','plot'])+'\n'
    o.write("%s" % text)
    for samp in sorted(stpDict.keys()):
        text = '\t'.join([samp,stpDict[samp]['pop'],stpDict[samp]['plot']])+'\n'
        o.write("%s" % text)
stp = pd.read_csv(filE,header=0,sep="\t")
stp.head()

In [0]:
# Entry point for later sessions: load the cached sample -> pop/plot table.
stp = pd.read_csv('/home/lindb/wbp/sampsTOpop.txt',header=0,sep="\t")
stp.head()

In [0]:
# Rebuild stpDict from the cached table (replaces the dict built from the raw CSV).
# NOTE(review): 'plot' here has whatever dtype read_csv inferred, whereas the
# earlier build stored str(Plot_id) -- confirm downstream code copes with both.
stpDict = OrderedDict()
for row in stp.index:
    samp = stp.loc[row,'sampID']
    stpDict[samp] = OrderedDict()
    stpDict[samp]['pop'] = stp.loc[row,'pop']
    stpDict[samp]['plot'] = stp.loc[row,'plot']

In [0]:
#convert vcf.012 to data.frame
def get_z12_df(z12_file):
    """Load a ``.012`` genotype matrix and its side-car files into a DataFrame.

    Three files are read:

    * ``z12_file``       -- tab-separated integers, one row per individual; the
      first field of each row is dropped (presumably a row counter -- confirm
      against the tool that wrote the file).
    * ``z12_file.indv``  -- one individual ID per line, used as the index.
    * ``z12_file.pos``   -- tab-separated ``contig``/``pos`` pairs, one per
      locus, used to build ``"<contig>_<pos>"`` column names.

    Parameters
    ----------
    z12_file : str
        Path to the ``.012`` file; the other two paths are derived from it.

    Returns
    -------
    pandas.DataFrame
        Individuals x loci matrix of integer genotype codes.
    """
    indv_file = "%s.indv" % z12_file
    pos_file = "%s.pos" % z12_file

    # Context managers guarantee the handles are closed (the original leaked both).
    with open(z12_file) as handle:
        z12_data = np.array([[int(x) for x in line.strip().split("\t")]
                             for line in handle])
    with open(indv_file) as handle:
        samples = [x.strip() for x in handle]

    p = pd.read_csv(pos_file, sep="\t", names=['contig', 'pos'])
    df = pd.DataFrame(z12_data)
    df = df.drop(0, axis=1)
    # Plain string formatting over the two columns; avoids a row-wise apply.
    df.columns = ["%s_%s" % (contig, pos) for contig, pos in zip(p['contig'], p['pos'])]
    df.index = samples
    return df

In [0]:
# Directories holding the un-imputed ("mis") and imputed ("imp") SNP sets.
misdir = '/home/lindb/wbp/concatenated/snps/'
impdir = '/home/lindb/wbp/concatenated/snps/beagle40/'
misz12 = op.join(misdir,'isect_one_per_contig.recode.vcf.gz_sorted.vcf.gz.012')
impz12 = op.join(impdir,'isect_one_per_contig.recode.vcf.gz_sorted.vcf.gz.012')
# Order matters for every list built from this one: index 0 = imputed, index 1 = missing.
z12s = [impz12,misz12]

In [0]:
z12_dfs = [get_z12_df(z) for z in z12s]

In [0]:
# Peek at the last three columns of the imputed frame.
z12_dfs[0][z12_dfs[0].columns[-3:]].head()

In [0]:
# Peek at the last three columns of the missing-data frame.
z12_dfs[1][z12_dfs[1].columns[-3:]].head()

In [0]:
# Spot check that a known sample ID is present in the index.
'249compiled' in z12_dfs[0].index

In [0]:
[z.shape for z in z12_dfs]

In [0]:
# Append each sample's population label as a trailing 'population' column.
# One column assignment per frame replaces the original per-sample
# `z.loc[samp,'population'] = ...` writes, which enlarge the frame one cell at
# a time; the resulting column is the same.
for z in z12_dfs:
    z['population'] = [stpDict[samp]['pop'] for samp in z.index]

In [0]:
# Visual checks after adding the 'population' column.
z12_dfs[0].head()

In [0]:
# The last column should now be 'population' (imputed set).
z12_dfs[0][z12_dfs[0].columns[-3:]].head()

In [0]:
# Same check for the missing-data set.
z12_dfs[1][z12_dfs[1].columns[-3:]].head()

In [0]:
# Each frame should be one column wider than before.
[z.shape for z in z12_dfs]

In [0]:
def get_percent_missing(col):
    """Return the fraction of entries in `col` equal to -1.

    Despite the name the result is a proportion in [0, 1], not a percentage.
    Raises ZeroDivisionError for an empty column.
    """
    n_missing = int((col == -1).sum())
    return float(n_missing) / len(col)

In [0]:
# Per-column fraction of -1 entries in the missing-data frame (index 1).
# NOTE(review): the frame still contains the 'population' column here, so it is
# scored as well.
%time p_mis = z12_dfs[1].apply(get_percent_missing,axis=0)

In [0]:
# The commented lines below are the output recorded by the author.
p_mis.describe()
#count    116232.000000
#mean          0.350268
#std           0.090316
#min           0.000000
#25%           0.282787
#50%           0.356557
#75%           0.426230
#max           0.500000
#dtype: float64

In [0]:
# Same statistic for the imputed frame (index 0).
%time p_imp = z12_dfs[0].apply(get_percent_missing,axis=0)

In [0]:
p_imp.describe() #should be 0 since it's the imputed set
#count    116232
#mean          0
#std           0
#min           0
#25%           0
#50%           0
#75%           0
#max           0
#dtype: float64

In [0]:
# Persist both frames; flz is ordered like z12_dfs (imputed first, missing second).
flz = ['/home/lindb/wbp/workingsnps/imputed/imputed_z12.txt','/home/lindb/wbp/workingsnps/missing/missing_z12.txt']
for i,z in enumerate(z12_dfs):
    z.to_csv(flz[i],header=True,index=True,sep="\t")

In [0]:
def get_correction(n):
    """Return the finite-sample-size correction factor 2n / (2n - 1).

    The arithmetic is done in floating point explicitly, so the result does not
    depend on `from __future__ import division` being in effect (with Python 2
    integer division the original expression collapses to 1 for any n >= 1).

    Raises ZeroDivisionError when 2n - 1 == 0.
    """
    #for finite sample size
    two_n = 2.0 * n
    return two_n / (two_n - 1)

def get_allele_freqs(locus, debug=False):
    """Compute allele frequencies and heterozygosity statistics for one locus.

    Parameters
    ----------
    locus : pandas.Series
        Genotype codes for one locus: 0 and 2 are the two homozygotes, 1 is the
        heterozygote, -1 is missing (ignored).
    debug : bool, optional
        When true, print the result before returning it. Defaults to False
        (the original had no default; existing keyword/positional calls still work).

    Returns
    -------
    pandas.Series or None
        Keys: ``P``/``Q`` (allele counts), ``p``/``q`` (allele frequencies),
        ``He`` (2pq times the finite-sample correction), ``Ho`` (observed
        heterozygote fraction) and ``Fis`` (1 - Ho/He). ``None`` when the locus
        has no non-missing genotypes.
    """
    c = locus[locus != -1].value_counts()
    num_individuals = c.sum()
    total_alleles = 2.0 * num_individuals
    if total_alleles == 0:
        return None

    # Each homozygote contributes two copies of its allele, each heterozygote one of each.
    PQ = c[1] if 1 in c else 0
    P = (2 * c[0] if 0 in c else 0) + PQ
    Q = (2 * c[2] if 2 in c else 0) + PQ

    p = P/total_alleles
    q = Q/total_alleles
    # Tolerance-based check: exact float equality can fail from rounding alone.
    # Still trips if genotype codes other than 0/1/2/-1 are present.
    assert np.isclose(p + q, 1.0)
    He = 2 * p * q * get_correction(num_individuals)
    Ho = PQ*1.0/num_individuals
    # A monomorphic locus has He == 0; report Fis as NaN explicitly rather than
    # relying on a 0/0 division.
    Fis = 1 - (Ho/He) if He > 0 else np.nan

    ret = pd.Series({"p":p,
                     "q":q,
                     "P":P,
                     "Q":Q,
                     "He":He,
                     "Ho":Ho,
                     "Fis":Fis})
    if debug:
        print(ret)
    return ret

In [0]:
# Per-locus frequency table for each dataset (imputed first, missing second).
# The last column ('population') is excluded by position; `.iloc` replaces the
# removed `.ix` indexer and selects the same columns.
z12_freqs = [z.iloc[:,:-1].apply(get_allele_freqs,debug=False) for z in z12_dfs]

In [0]:
[z.shape for z in z12_freqs]

In [0]:
z12_freqs[0].head()

In [0]:
# Minor-allele frequency per locus: the smaller of p and q in each column.
mafs = [z.apply(lambda x: min(x["p"], x["q"])) for z in z12_freqs]

In [0]:
mafs[0].head()

In [0]:
mafs[1].head()

In [0]:
# Range of MAF in the imputed set.
min(mafs[0].tolist()),max(mafs[0].tolist())

In [0]:
# Range of MAF in the missing-data set.
min(mafs[1].tolist()),max(mafs[1].tolist())

In [0]:
# Write locus -> MAF tables without a header row; flz is ordered like mafs
# (imputed first, missing second).
flz = ['/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf.txt','/home/lindb/wbp/workingsnps/missing/missing_z12_maf.txt']
for i,z in enumerate(mafs):
    print i
    m = pd.DataFrame(z)
    m.to_csv(flz[i],header=False,index=True,sep="\t")

In [0]:
# mafs[0] comes from the imputed set and mafs[1] from the un-imputed set
# (z12s = [impz12, misz12]), so x is "imputed" and y is "not imputed".
# The original axis labels were the other way round.
plt.scatter(mafs[0], mafs[1])
plt.title("MAF")
plt.xlabel("imputed")
plt.ylabel("not imputed")
plt.show()

In [0]:
#swap for minor allele
def swap_alleles(locus, af):
    """Recode a locus so that genotype 2 is the homozygote of the minor allele.

    `locus` is a Series named after its column; `af` maps that name to an
    object exposing "p" and "q" frequencies. When p is the smaller (or equal)
    frequency the 0 and 2 codes are exchanged; 1 and -1 are never touched.
    Columns whose name contains no underscore are returned unchanged.
    """
    if "_" not in locus.name:
        return locus
    freqs = af[locus.name]
    p_is_minor = min(freqs["p"], freqs["q"]) == freqs["p"]
    return locus.replace({0:2,2:0}) if p_is_minor else locus

In [0]:
# Apply the minor-allele recoding column by column, pairing each dataset with
# its own frequency table.
z12_swapped = []
for i, z in enumerate(z12_dfs):
    print i
    z12_swapped.append(z.apply(swap_alleles, args=(z12_freqs[i],)))

In [0]:
# Before/after comparison for the imputed set.
z12_dfs[0].head()

In [0]:
z12_swapped[0].head()

In [0]:
# Persist the recoded frames (imputed first, missing second).
swaps = ['/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp.txt','/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp.txt']
for i,z in enumerate(z12_swapped):
    print i
    z12_swapped[i].to_csv(swaps[i],header=True,index=True,sep="\t")

In [0]:
# Entry point for later sessions: reload the recoded frames from disk.
swaps = ['/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp.txt','/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp.txt']
z12_swapped = []
for i,s in enumerate(swaps):
    z12_swapped.append(pd.read_csv(s,header=0,index_col=0,sep="\t"))

In [0]:
z12_swapped[0].head()

In [0]:
z12_swapped[1].head()

In [0]:
#while i'm at this point, might as well make files for hierfstat
# Genotype code -> two-character genotype string used in the hierfstat input;
# -1 (missing) becomes 'NA'.
hierf_trans = {-1:'NA', 0:'11', 1:'12', 2:'22'}
def apply_hierf_trans(series):
    """Translate every genotype code in `series` through `hierf_trans`.

    Returns a plain list in the same order; an unknown code raises KeyError.
    """
    translated = []
    for genotype in series:
        translated.append(hierf_trans[genotype])
    return translated

In [0]:
#while i'm at this point, might as well make files for hierfstat
%time ztrans = [z.ix[:,:-1].apply(apply_hierf_trans) for z in z12_swapped]

In [0]:
len(ztrans[0].columns)

In [0]:
np.unique(ztrans[1][ztrans[1].columns[0]]) #missing

In [0]:
np.unique(ztrans[0][ztrans[0].columns[0]]) #imputed should not have "NA"

In [0]:
# Population names as found in the data, sorted.
pops = sorted(np.unique(z12_swapped[0]['population']))
pops

In [0]:
# Hard-coded copy of the population list; this overrides the value computed in
# the previous cell.
pops = ['Dicks_Pass',
 'Freel_Peak',
 'Heavenly',
 'Little_Round_Top',
 'Mt_Rose_Ophir',
 'Rifle_Peak',
 'Snow_Valley_Peak',
 'West_Shore_Peaks']

In [0]:
# Number the populations 1..len(pops) in list order.
popids = OrderedDict()
for i,pop in enumerate(pops):
    i+=1
    popids[pop] = i
    print pop,i

In [0]:
# Show the record of the first sample (indexing .keys() works on Python 2 only).
stpDict[stpDict.keys()[0]]

In [0]:
ztrans[0].head()

In [0]:
ztrans[1].head()

In [0]:
#while i'm at this point, might as well make files for hierfstat
# Append two metadata columns to each translated frame: the numeric population
# ID and the plot ID of every sample.
# NOTE(review): values are written one cell at a time via .loc, which is slow
# on wide frames; the dtype of the new columns is whatever pandas infers from
# this incremental fill -- confirm before changing to a vectorised assignment.
for z in ztrans:
    for samp in z.index:
        z.loc[samp,'popid']  = popids[stpDict[samp]['pop']]
        z.loc[samp,'plotid'] = stpDict[samp]['plot']

In [0]:
# The last two columns should now be popid and plotid.
ztrans[0][ztrans[0].columns[-3:]].head()

In [0]:
ztrans[1][ztrans[1].columns[-3:]].head()

In [0]:
ztrans[0].columns[-10:-2]

In [0]:
#this was executed
# Sort samples by index label, in place.
for i,z in enumerate(ztrans):
    z.sort_index(inplace=True)

In [0]:
# Move popid/plotid to the front: cols[i] becomes ['popid','plotid', <all other columns>].
cols = [['popid','plotid'],['popid','plotid']]
for i,z in enumerate(ztrans):
    cols[i].extend([c for c in z.columns[:-2]])
    ztrans[i] = ztrans[i][[x for x in cols[i]]]

In [0]:
len(cols[0])

In [0]:
ztrans[0].head()

In [0]:
ztrans[1].head()

In [0]:
# Dry run: show which frame index goes to which file.
fs = ['/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans.txt',
      '/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans.txt']
for i,z in enumerate(ztrans):
    print i,fs[i]

In [0]:
#while i'm at this point, might as well make files for hierfstat
fs = ['/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans.txt',
      '/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans.txt']
for i,z in enumerate(ztrans):
    print i
    z.to_csv(fs[i],header=True,index=True,sep="\t")

In [0]:
# Reload the translated frames, keyed by dataset name instead of list position.
# The keys are spelled out here: the original indexed a name `dset` that is not
# defined at this point in a top-to-bottom run (NameError on a fresh kernel).
fs = ['/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans.txt',
      '/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans.txt']
dsets = ['imp','mis']  # same order as fs
ztrans = OrderedDict()
for key,f in zip(dsets,fs):
    ztrans[key] = pd.read_csv(f,header=0,index_col=0,sep="\t")

In [0]:
ztrans.keys()

In [0]:
# Restore the string genotype codes after the CSV round trip: NaN cells are
# filled with "NA" and the integers 11/12/22 are turned back into strings.
ztrans['mis'].fillna("NA",inplace=True)
ztrans['mis'].replace([11,12,22],['11','12','22'],inplace=True)
ztrans['mis'].head()

In [0]:
# Same integer -> string repair for the imputed set (no fillna is applied here).
ztrans['imp'].replace([11,12,22],['11','12','22'],inplace=True)
ztrans['imp'].head()

In [0]:
#rewrite just in case
# Dataset keys are paired with their files explicitly; the original indexed a
# name `dset` that is undefined at this point in a top-to-bottom run.
fs = ['/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans.txt',
      '/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans.txt']
for key,f in zip(['imp','mis'],fs):
    ztrans[key].to_csv(f,header=True,index=True,sep="\t")

In [0]:
# Writes the imputed frame to the same path the previous cell already wrote.
ztrans['imp'].to_csv('/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans.txt',header=True,index=True,sep="\t")

In [0]:
# Distinct genotype strings in one column of the imputed set.
np.unique(ztrans['imp'][ztrans['imp'].columns[3]])

In [0]:
np.unique(ztrans['mis'][ztrans['mis'].columns[10]]) #missing should have "NAs"

In [0]:
#get back to -1,0,1,2 from hierfstat_trans
# Inverse of the hierfstat translation: genotype string -> integer code.
z12_trans = {'11':0, '12':1, '22':2, 'NA':-1}
def convert_to_z12(col):
    """Map every genotype string in `col` back to its integer code.

    Returns a plain list in the same order; an unknown string raises KeyError.
    """
    codes = []
    for genotype in col:
        codes.append(z12_trans[genotype])
    return codes

In [0]:
z12_trans[ztrans['mis'].loc['009compiled','NODE_1000013_length_91_cov_1.802198_37']]

In [0]:
hierf_z12 = OrderedDict()
%time hierf_z12['mis'] = ztrans['mis'].ix[:,2:].apply(convert_to_z12)

In [0]:
hierf_z12['mis'].head()

In [0]:
%time hierf_z12['imp'] = ztrans['imp'].ix[:,2:].apply(convert_to_z12)

In [0]:
for dset in hierf_z12.keys():
    print hierf_z12[dset].shape

In [0]:
np.unique(hierf_z12['mis'][hierf_z12['mis'].columns[0]]) #should be -1 in missing

In [0]:
np.unique(hierf_z12['imp'][hierf_z12['imp'].columns[0]])

In [0]:
hierf_z12['imp'].head()

In [0]:
fs = OrderedDict()
fs['imp'] = '/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans_z12.txt'
fs['mis'] = '/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans_z12.txt'
for dset in hierf_z12:
    print dset
    hierf_z12[dset].to_csv(fs[dset],header=True,index=True,sep="\t")

In [0]:
fs = OrderedDict()
fs['imp'] = '/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans_z12.txt'
fs['mis'] = '/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans_z12.txt'
hierf_z12 = OrderedDict()
for dset in fs.keys():
    hierf_z12[dset] = pd.read_csv(fs[dset],header=0,index_col=0,sep="\t")

In [0]:
hierf_z12['imp'].head()

In [0]:
hierf_z12['mis'].head()

# now the actual PCA

In [0]:
popids

In [0]:
# Append the numeric population ID and the plot ID to each integer-coded frame.
# After this cell the frames hold two non-genotype columns ('population',
# 'plot') in addition to the SNP columns.
for dset in hierf_z12:
    for samp in hierf_z12[dset].index:
        hierf_z12[dset].loc[samp,'population'] = popids[stpDict[samp]['pop']]
        hierf_z12[dset].loc[samp,'plot'] = stpDict[samp]['plot']

In [0]:
'plot' in hierf_z12['imp'].columns

In [0]:
'plot' in hierf_z12['mis'].columns

In [0]:
hierf_z12['imp'].head()

In [0]:
hierf_z12['mis'].head()

In [0]:
def center_and_standardize_value(val, u, var):
    """Center one genotype value on `u` and scale it by sqrt(`var`).

    Parameters
    ----------
    val : number
        Genotype code; -1 means missing.
    u : float
        Mean used for centering.
    var : float
        Variance used for scaling.

    Returns
    -------
    float
        ``(val - u) / sqrt(var)``; 0.0 for a missing value (i.e. it is placed
        at the mean), and 0.0 when `var` is not strictly positive.
    """
    if val == -1:
        return 0.0
    if not var > 0:
        # Zero variance (monomorphic locus) would divide by zero and inject
        # nan/inf into the matrix handed to the PCA; such a locus carries no
        # signal, so it is scored at the mean like a missing value.
        return 0.0
    return (val-u)/np.sqrt(var)

def center_and_standardize(locus, af):
    """Center and scale one genotype column for PCA.

    Columns whose name does not contain "NODE" are returned unchanged. For SNP
    columns the centering mean is the mean of the non-missing (!= -1) values
    and the scaling variance is maf * (1 - maf), where maf is the smaller of
    the "p" and "q" entries that `af` holds under the column's name. Each value
    is then passed through `center_and_standardize_value`.
    """
    if "NODE" not in locus.name:
        return locus
    freqs = af[locus.name]
    maf = min(freqs["p"], freqs["q"])
    observed = [genotype for genotype in locus if genotype != -1]
    centre = np.mean(observed)
    spread = maf*(1-maf)
    return locus.apply(center_and_standardize_value, args=(centre, spread))

In [0]:
# Re-key the per-dataset frequency tables by name ('imp' was index 0, 'mis'
# index 1). The guard makes the cell safe to re-run: once z12_freqs is already
# a dict, indexing it with 0/1 would raise KeyError.
if not isinstance(z12_freqs, dict):
    temp_freqs = z12_freqs
    z12_freqs = OrderedDict()
    z12_freqs['imp'] = temp_freqs[0]
    z12_freqs['mis'] = temp_freqs[1]

In [0]:
pca_std = OrderedDict()
%time
for dset in hierf_z12.keys():
    pca_std[dset] = hierf_z12[dset].apply(center_and_standardize, args=(z12_freqs[dset],))

In [0]:
hierf_z12.keys()

In [0]:
%time
fs = OrderedDict()
fs['imp'] = '/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans_z12_pca_std.txt'
fs['mis'] = '/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans_z12_pca_std.txt'
for i,dset in enumerate(pca_std.keys()):
    pca_std[dset].to_csv(fs[dset],header=True,index=True,sep="\t")

In [0]:
%time
pca_std = OrderedDict()
dsets = ['imp','mis']
for i,dset in enumerate(dsets):
    pca_std[dset] = pd.read_csv(fs[dset],header=0,index_col=0,sep="\t")

In [0]:
pca_std['mis'].head()

In [0]:
pca_std['imp'].head()

In [0]:
len(pca_std['mis'].columns)

In [0]:
len(pca_std['imp'].columns)

In [0]:
pca_std.keys()

In [0]:
pca_std_data = OrderedDict()
for dset in pca_std.keys():
    cols = [col for col in pca_std[dset].columns if 'NODE' in col]
    pca_std_data[dset] = pca_std[dset].ix[:,[col for col in cols]]

In [0]:
pca_std_data['imp'].shape

In [0]:
pca_std_data['imp'].head()

In [0]:
pca_std_data['mis'].head()

In [0]:
len(pca_std_data['imp'].columns)

In [0]:
fs = OrderedDict()
fs['imp'] = '/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans_z12_pca_std.txt'
fs['mis'] = '/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans_z12_pca_std.txt'
for i,dset in enumerate(pca_std_data.keys()):
    pca_std_data[dset].to_csv(fs[dset],header=True,index=True,sep="\t")

In [0]:
pca_std_data.keys()

## R magic voodoo

In [0]:
def setup_r(r_home='/home/lindb/g/R3/lib64/R/'):
    """Point the process environment at an R installation.

    Sets ``R_HOME`` to `r_home` and prepends ``<r_home>/lib`` to
    ``LD_LIBRARY_PATH``.

    Parameters
    ----------
    r_home : str, optional
        R installation directory. Defaults to the path the notebook was
        written against, so ``setup_r()`` behaves as before.

    Notes
    -----
    ``LD_LIBRARY_PATH`` may be unset; the original indexed ``os.environ``
    directly and raised KeyError in that case. Each call prepends again, so
    calling this repeatedly lengthens the variable.
    """
    os.environ['R_HOME'] = r_home
    r_lib = "%s/lib" % r_home
    current = os.environ.get('LD_LIBRARY_PATH', '')
    if current:
        os.environ['LD_LIBRARY_PATH'] = "%s:%s" % (r_lib, current)
    else:
        # Avoid a trailing ':' (an empty path entry) when nothing was set before.
        os.environ['LD_LIBRARY_PATH'] = r_lib

In [0]:
# The environment is configured before rpy2 is imported.
setup_r()
import readline
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri as pd2ri
# Turn on rpy2's pandas <-> R conversion.
pd2ri.activate()
# Handle used below to evaluate R code and fetch R objects by name.
r = robjects.r

In [0]:
# Notebook extensions: module autoreload and the rpy2 %%R cell magic.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext rpy2.ipython
%matplotlib inline

In [0]:
%%R
# Record the R version and loaded packages.
sessionInfo()

In [0]:
%%R
# Read the standardized SNP matrices written from Python as plain data.frames.
library(data.table)
data_mis = fread('/home/lindb/wbp/workingsnps/missing/missing_z12_maf_swp_trans_z12_pca_std.txt', sep="\t", data.table=FALSE)
data_imp = fread('/home/lindb/wbp/workingsnps/imputed/imputed_z12_maf_swp_trans_z12_pca_std.txt', sep="\t", data.table=FALSE)

In [0]:
%%R
mis_dir = '/home/lindb/wbp/workingsnps/missing/'
imp_dir = '/home/lindb/wbp/workingsnps/imputed/'
# Column V1 is used as the row names and then dropped from the data.
# NOTE(review): presumably V1 is the sample-ID column written by pandas with an
# empty header -- confirm.
rownames(data_mis) = data_mis$V1

rownames(data_imp) = data_imp$V1
drops = c("V1")
data_mis = data_mis[,!(names(data_mis) %in% drops)]
data_imp = data_imp[,!(names(data_imp) %in% drops)]
# prcomp is told not to center or scale: the input was standardized upstream.
res_mis = prcomp(data_mis, scale=F, center=F)
res_imp = prcomp(data_imp, scale=F, center=F)
rownames(res_mis$x) = rownames(data_mis)

rownames(res_imp$x) = rownames(data_imp)
# Save each result next to its input data.
fname = 'pca_res.rds'
mis = paste(mis_dir, "/", fname, sep='')
imp = paste(imp_dir, "/", fname, sep='')
saveRDS(res_mis, mis)
saveRDS(res_imp, imp)

In [0]:
# Python-side copies of the result directories used in the R cell.
mis_dir = '/home/lindb/wbp/workingsnps/missing/'
imp_dir = '/home/lindb/wbp/workingsnps/imputed/'

In [0]:
# Reload the saved prcomp results into the R session; the trailing ';'
# suppresses the cell output.
r("res_mis = readRDS('%s/pca_res.rds')" % mis_dir);
r("res_imp = readRDS('%s/pca_res.rds')" % imp_dir);

In [0]:
def get_pca_x(res):
    """Return the "x" component of an R result object as a labelled DataFrame.

    `res` is an rpy2 object exposing ``rx2``; its "x" element is converted with
    ``pd2ri.ri2py`` and its first/second ``names`` entries become the index and
    the columns respectively.
    """
    scores = res.rx2("x")
    row_names, col_names = scores.names[0], scores.names[1]
    x = pd.DataFrame(pd2ri.ri2py(scores))
    x.index = row_names
    x.columns = col_names
    return x

In [0]:
# Row names of the score matrix in the missing-data result.
print(r('res_mis').rx2('x').names[0])

In [0]:
# Handle to R's summary() function.
summary = r('summary')

In [0]:
# Fetch the R result objects, keyed by dataset name.
prcomp_res = OrderedDict()
prcomp_res['imp'] = r['res_imp']
prcomp_res['mis'] = r['res_mis']

In [0]:
# Score matrices as DataFrames. The index is then replaced with the index of
# the standardized input frame.
# NOTE(review): this assumes both are in the same row order -- confirm.
pca_x = OrderedDict()
for dset in prcomp_res.keys():
    pca_x[dset] = get_pca_x(prcomp_res[dset])
    pca_x[dset].index = pca_std_data[dset].index

In [0]:
pca_x['mis']

In [0]:
pca_x['imp']

In [0]:
pca_x['mis'].shape

In [0]:
# Population name -> 1-based number, in sorted name order.
ntpDict = OrderedDict()
for i,pop in enumerate(sorted(pops)):
    i+=1
    ntpDict[pop] = i
    print pop, i

In [0]:
pca_std['imp'].head()

In [0]:
pca_std['mis'].head()

In [0]:
# Print the first index label only.
for ro in pca_std['imp'].index:
    print ro
    break

In [0]:
len(pca_std_data['imp'].columns)

In [0]:
# Maps population numbers onto [0, 1] for colormap lookup.
norm = mcolors.Normalize(min(popids.values()), max(popids.values()))
def plot_pca(key, pca_std, pca_std_data, pca_x, prcomp_res):
    """Scatter PC1 against PC2, coloured by population, and show the figure.

    Parameters
    ----------
    key : str
        Dataset name; 'imp' selects the "imputed w beagle" title, anything
        else the "imputed w mean" title.
    pca_std : pandas.DataFrame
        Frame indexed by sample ID; joined with `pca_x` on the index.
    pca_std_data : pandas.DataFrame
        SNP-only frame; only its column count is used (for the title).
    pca_x : pandas.DataFrame
        PCA scores with at least 'PC1' and 'PC2' columns.
    prcomp_res : rpy2 object
        R result passed to R's ``summary``; row 2 of its "importance" table is
        printed on the axis labels (presumably the proportion of variance --
        confirm against R's summary output for prcomp).

    Relies on the notebook globals `norm`, `ntpDict`, `stpDict` and `summary`.
    """
    joined = pca_std.join(pca_x)

    # One colour per sample, plus one legend entry per population seen.
    legend = {}
    colors = []
    for samp in joined.index:
        pop_name = stpDict[samp]['pop']
        color = cm.rainbow(norm(ntpDict[pop_name]))
        legend[pop_name] = color
        colors.append(color)

    # A single scatter call with a colour list replaces one call per sample.
    plt.scatter(joined['PC1'], joined['PC2'], s=50, c=colors)
    fig = plt.gcf()
    fig.set_size_inches(10,8)

    if key == 'imp':
        title = "PCA of n=%d samples (imputed w beagle) on %d loci" % (len(joined), len(pca_std_data.columns))
    else:
        title = "PCA of n=%d samples (imputed w mean) on %d loci" % (len(joined), len(pca_std_data.columns))
    plt.title("%s" % title)

    imp = summary(prcomp_res).rx("importance")[0]
    plt.xlabel("PC1 (%g)" % imp.rx(2,1)[0])
    plt.ylabel("PC2 (%g)" % imp.rx(2,2)[0])

    handles = [mpatches.Patch(color=legend[pop], label=pop) for pop in sorted(legend)]
    # Legend is anchored outside the axes, to the right.
    plt.legend(handles=handles,loc=2,bbox_to_anchor=(1, 1))
    plt.show()

In [0]:
# One PCA plot per dataset.
for key in pca_std.keys():
    plot_pca(key,pca_std[key],pca_std_data[key],pca_x[key],prcomp_res[key])

In [0]:
# NOTE(review): `pop` here is whatever an earlier loop left behind; this cell
# depends on leaked loop state.
ntpDict[pop]

In [0]:
%%R
# Run TWcalc (defined in the sourced script) on each standardized matrix with 20
# as its second argument.
source("/home/lindb/g/ipython/tw_calc.R")
twtable=read.table("/home/lindb/g/ipython/twTable.txt", header=F)
tw_mis = TWcalc(as.matrix(data_mis),20)
tw_imp = TWcalc(as.matrix(data_imp),20)

In [0]:
# Pull the second element of each TWcalc result into Python.
# NOTE(review): presumably the per-component p-values -- confirm against tw_calc.R.
tws = OrderedDict()
tws['mis'] = r("tw_mis[[2]]")
tws['imp'] = r("tw_imp[[2]]")

In [0]:
def get_sig_tracywidom(tw_p):
    """Collect the leading run of p-values that are not greater than 0.05.

    Iteration stops at the first value above 0.05, whose position and value
    are printed. Returns ``(count, values)`` for the values collected before
    that point.
    """
    significant = []
    for idx, pval in enumerate(tw_p):
        if not pval > 0.05:
            significant.append(pval)
            continue
        # First value above the threshold: report it and stop scanning.
        print(idx, pval)
        break
    return len(significant), significant

In [0]:
#tw_num = [get_sig_tracywidom(x) for x in tws]
# (count, p-values) of the leading significant components, per dataset.
tw_num = OrderedDict()
for dset in tws.keys():
    tw_num[dset] = get_sig_tracywidom(tws[dset])

In [0]:
# The commented lines below are the output recorded by the author.
for dset in tw_num.keys():
    print dset,tw_num[dset]
#mis (3, [8.0000000000000005e-09, 8.0000000000000005e-09, 8.0000000000000005e-09])
#imp (4, [8.0000000000000005e-09, 8.0000000000000005e-09, 8.0000000000000005e-09, 8.0000000000000005e-09])

### Tracy Widom
    imputed = 4
    missing = 3

In [0]:
tw_num

In [0]:
# NOTE(review): the next four cells display entire frames; .head() would keep
# the output small.
pca_std_data['imp']

In [0]:
pca_std['imp']

In [0]:
pca_x['imp']

In [0]:
pca_x['mis']

In [0]:
# PC1 scores: missing-data set (x) against imputed set (y).
plt.scatter(pca_x['mis']['PC1'],pca_x['imp']['PC1'])

In [0]:
# PC2 scores: missing-data set (x) against imputed set (y).
plt.scatter(pca_x['mis']['PC2'],pca_x['imp']['PC2'])

In [0]:
from scipy.stats import spearmanr

In [0]:
# Rank correlation of PC1 between the two datasets.
spearmanr(pca_x['imp']['PC1'],pca_x['mis']['PC1'])

In [0]:
#figure out amount of missing data per individual across snps
# Only SNP columns (names containing 'NODE') are counted. By this point the
# frame also carries the appended 'population' and 'plot' columns; the original
# included them in both the -1 count and the denominator.
df = hierf_z12['mis']
snp_cols = [col for col in df.columns if 'NODE' in col]
pmissing = (df[snp_cols] == -1).sum(axis=1) / float(len(snp_cols))
# sample ID -> fraction of SNPs missing, in frame order
misDict = OrderedDict(zip(df.index, pmissing))

In [0]:
len(misDict.keys())

In [0]:
# Spot check one sample.
misDict['009compiled']

In [0]:
max(misDict.values())

In [0]:
for samp in misDict.keys():
    print samp, misDict[samp]

In [0]:
# Summary statistics of per-sample missingness.
pd.DataFrame([float(x) for x in misDict.values()]).describe()

In [0]:
plt.hist([float(x) for x in misDict.values()])

In [0]:
# Order check against the PCA score index.
# NOTE(review): comparing a list with a pandas Index is elementwise, so this
# displays an array rather than a single True/False.
misDict.keys() == pca_x['imp'].index

In [0]:
misDict.keys() == pca_x['mis'].index

In [0]:
# Per-sample missingness, used as the point colour below (relies on misDict
# and pca_x sharing the same sample order).
color = misDict.values()

In [0]:
# PC1 (missing vs imputed), coloured by per-sample missingness.
plt.scatter(pca_x['mis']['PC1'],pca_x['imp']['PC1'],c=color)

In [0]:
# Numeric population ID per sample, in pca_x['imp'] index order.
popcols = []
for samp in pca_x['imp'].index:
    popcols.append(popids[stpDict[samp]['pop']])

In [0]:
# Same scatter, coloured by population.
plt.scatter(pca_x['mis']['PC1'],pca_x['imp']['PC1'],c=popcols)