In [None]:
from glob import glob
from os import path
import re
from skbio import DistanceMatrix
import pandas as pd
import numpy as np

from kwipexpt import *
%matplotlib inline
%load_ext rpy2.ipython

In [None]:
# Every sub-directory of data/ is treated as one experiment; its name is the
# directory's base name once the trailing slash from the glob pattern is removed.
expts = [path.basename(dirpath.rstrip('/')) for dirpath in glob('data/*/')]
print("Expts:", expts[:10])


In [None]:
def process_expt(expt):
    """Score every kWIP distance matrix of one experiment against its truth.

    Each ``data/<expt>/kwip/*.dist`` file is read as a ``DistanceMatrix`` and
    compared, via ``spearmans_rho_distmats``, with the matrix loaded by
    ``load_sample_matrix_to_runs`` from ``data/<expt>/all_genomes-<scale>.dist``.
    The truth matrix for a given scale is loaded once and reused.

    Parameters
    ----------
    expt : str
        Name of the experiment directory under ``data/``.

    Returns
    -------
    list of dict
        One record per distance file with the keys ``"coverage"``,
        ``"scale"`` and ``"metric"`` (strings parsed from the file name) and
        ``"rho"`` (whatever ``spearmans_rho_distmats`` returns).

    Raises
    ------
    ValueError
        If a ``.dist`` file name does not follow the
        ``<coverage>x-<scale>-<wip|ip>.dist`` naming scheme.
    """
    expt_results = []

    # The "." before "dist" is escaped and the pattern is anchored at the end
    # so that only the real file extension is accepted.
    dist_name_re = re.compile(r'kwip/(\d\.?\d*)x-(0\.\d+)-(wip|ip)\.dist$')

    def extract_info(filename):
        """Return the (coverage, scale, metric) strings encoded in ``filename``."""
        match = dist_name_re.search(filename)
        if match is None:
            # re.search yields None on a miss; report the offending path
            # instead of failing with "'NoneType' has no attribute 'groups'".
            raise ValueError(
                "Unexpected kwip distance file name: {!r}".format(filename)
            )
        return match.groups()

    # dict of scale: distance matrix, populated as we go
    truths = {}

    for distfile in glob("data/{}/kwip/*.dist".format(expt)):
        cov, scale, metric = extract_info(distfile)
        if scale not in truths:
            # scale stays a string here so the path matches the file name exactly
            genome_dist_path = 'data/{ex}/all_genomes-{sc}.dist'.format(ex=expt, sc=scale)
            truths[scale] = load_sample_matrix_to_runs(genome_dist_path)
        exptmat = DistanceMatrix.read(distfile)
        rho = spearmans_rho_distmats(exptmat, truths[scale])
        expt_results.append({
            "coverage": cov,
            "scale": scale,
            "metric": metric,
            "rho": rho,
        })
    return expt_results

# process_expt('3662')

In [None]:
# Flatten the per-experiment record lists into a single table with one row
# per distance file.
results = pd.DataFrame(
    [record for expt in expts for record in process_expt(expt)]
)

In [None]:
%%R -i results
# Bring the pandas `results` frame into the R session (-i) and attach the
# summarising and plotting libraries.
library(plyr)
library(ggplot2)

# Convert coverage and scale to numbers. Going through as.character() first
# means a factor column is converted by its labels, not its level codes.
results[c("coverage", "scale")] <- lapply(
    results[c("coverage", "scale")],
    function(column) as.numeric(as.character(column))
)

In [None]:
%%R
# Box plot of Spearman's rho per coverage level (coverage <= 30, scale 0.001),
# with one box per metric at each level.
dat = subset(results, scale==0.001 & coverage<=30, select=c(rho, metric, coverage))
# Treat coverage as categorical so every level gets its own group of boxes.
dat$coverage = as.factor(dat$coverage)

# fill=metric is inherited from the top-level aes(), so geom_boxplot() needs no
# mapping of its own; axis labels match the other two plots in this notebook.
ggplot(dat, aes(x=coverage, y=rho, fill=metric)) +
    geom_boxplot() +
    xlab('Genome Coverage') +
    ylab(expression(paste("Spearman's ", rho)))
    

In [None]:
%%R
# Mean Spearman's rho (with a +/- one standard deviation band) against
# coverage, for coverage <= 50 at scale 0.001, drawn separately per metric.
cover <- subset(results, coverage <= 50 & scale == 0.001, select = -scale)

# One row per (coverage, metric) pair with the mean and sd of rho.
summ <- ddply(cover, c("coverage", "metric"), summarise,
              rho_av = mean(rho),
              rho_sd = sd(rho))

ggplot(summ, aes(x = coverage, y = rho_av, group = metric)) +
    geom_line(aes(linetype = metric)) +
    geom_ribbon(aes(ymin = rho_av - rho_sd, ymax = rho_av + rho_sd, fill = metric),
                alpha = 0.2) +
    labs(x = 'Genome Coverage', y = expression(paste("Spearman's ", rho))) +
    # scale_x_log10() +
    theme_bw()

In [None]:
%%R
# Mean Spearman's rho (with a +/- one standard deviation band) against the
# scale value, restricted to runs at coverage 30, drawn separately per metric.
variation <- subset(results, coverage == 30, select = -coverage)

# One row per (scale, metric) pair with the mean and sd of rho.
summ <- ddply(variation, c("scale", "metric"), summarise,
              rho_av = mean(rho),
              rho_sd = sd(rho))

ggplot(summ, aes(x = scale, y = rho_av, group = metric)) +
    geom_line(aes(linetype = metric)) +
    geom_ribbon(aes(ymin = rho_av - rho_sd, ymax = rho_av + rho_sd, fill = metric),
                alpha = 0.2) +
    labs(x = 'Mean pairwise variation', y = expression(paste("Spearman's ", rho))) +
    scale_x_log10() +
    theme_bw()
