In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as spstats
from skbio import DistanceMatrix
import json
from glob import glob
import rpy2
import pandas as pd
from collections import defaultdict


%matplotlib inline
%load_ext rpy2.ipython
#%config InlineBackend.figure_format = 'svg'
%config InlineBackend.rc = {'font.size': 10, 'figure.figsize': (8.0, 8.0), 'figure.facecolor': 'white', 'savefig.dpi': 72, 'figure.subplot.bottom': 0.125, 'figure.edgecolor': 'white'}

# Parse sample-run.json into `samples` (contents are not inspected in this
# cell; presumably a sample -> run mapping, judging by the file name).
with open("sample-run.json") as sample_fh:
    samples = json.load(sample_fh)

In [None]:
def make_trumat(n_groups=2, samples_per_group=8, runs_per_sample=6):
    """Build the expected ("truth") distance matrix for a nested design.

    Indices are assumed to be laid out contiguously: each block of
    ``runs_per_sample`` consecutive indices forms one sample (presumably its
    replicate runs), and each block of ``samples_per_group * runs_per_sample``
    consecutive indices forms one group.

    Entries of the returned matrix:

    * 0 on the diagonal,
    * 1 between distinct indices of the same sample,
    * 2 between indices of different samples in the same group,
    * 4 between indices of different groups.

    Parameters
    ----------
    n_groups : int
        Number of top-level groups (default 2).
    samples_per_group : int
        Number of samples in each group (default 8).
    runs_per_sample : int
        Number of consecutive indices per sample (default 6).

    Returns
    -------
    numpy.ndarray
        Square, symmetric float64 array with
        ``n_groups * samples_per_group * runs_per_sample`` rows. The defaults
        give the original 96 x 96 matrix (2 groups x 8 samples x 6 runs).
    """
    group_size = samples_per_group * runs_per_sample
    idx = np.arange(n_groups * group_size)
    group_of = idx // group_size
    sample_of = idx // runs_per_sample

    # Start from the group-level distances, then overwrite the finer levels
    # (same sample, then the diagonal) so the most specific rule wins.
    same_group = group_of[:, None] == group_of[None, :]
    arr = np.where(same_group, 2.0, 4.0)
    arr[sample_of[:, None] == sample_of[None, :]] = 1.0
    np.fill_diagonal(arr, 0.0)
    return arr

In [None]:
# Build the expected matrix once, show it as a heat map, and keep its
# condensed form in `truth` for the correlation tests further down.
trumat = make_trumat()
plt.imshow(trumat, interpolation='none')
truth = DistanceMatrix(trumat).condensed_form()

In [None]:
def reorder_matrix(mat, metad):
    """Return ``mat`` filtered to a canonical run order.

    ``metad`` maps group name -> {sample name -> iterable of run ids}. The
    resulting order is every 'Indica' run (sorted) followed by every
    'Japonica' run (sorted). An AssertionError is raised when that set of
    runs differs from the set of ids in ``mat``.
    """
    runs_by_group = defaultdict(list)
    for group, group_samples in metad.items():
        for runs in group_samples.values():
            runs_by_group[group].extend(runs)

    ordered = sorted(runs_by_group['Indica']) + sorted(runs_by_group['Japonica'])
    assert set(ordered) == set(mat.ids)
    return mat.filter(ordered)

In [None]:
# One (WIP Pearson, IP Pearson, WIP Spearman, IP Spearman) tuple is appended
# for every replicate set whose files could be loaded.
scores = []

for i in range(2, 102):
    wipf = "kwip/3krice_set_{:03d}_wip.dist".format(i)
    ipf = "kwip/3krice_set_{:03d}_ip.dist".format(i)
    mdf = "metadata/3krice_set_{:03d}.txt.json".format(i)
    try:
        wip = DistanceMatrix.read(wipf)
        ip = DistanceMatrix.read(ipf)
        with open(mdf) as fh:
            metad = json.load(fh)
    except Exception as e:
        # Best effort: report the problem and move on to the next set.
        print(str(e))
        print("skipping *{:03d}*".format(i))
        continue

    # Put both matrices into the same id order before comparing with `truth`.
    wip = reorder_matrix(wip, metad)
    ip = reorder_matrix(ip, metad)
    if i <= 10:
        # Only the first few sets are plotted.
        wip.plot()

    wip_dists = wip.condensed_form()
    ip_dists = ip.condensed_form()
    # Element 0 of each result is the correlation coefficient.
    wipr = spstats.pearsonr(truth, wip_dists)[0]
    ipr = spstats.pearsonr(truth, ip_dists)[0]
    wips = spstats.spearmanr(truth, wip_dists)[0]
    ips = spstats.spearmanr(truth, ip_dists)[0]
    scores.append((wipr, ipr, wips, ips))

In [None]:
# Column order mirrors the tuples appended to `scores`: the two Pearson
# coefficients first, then the Spearman coefficients under the plain
# "WIP" / "IP" names.
score_columns = ["WIPpearson", "IPpearson", "WIP", "IP"]
scoremat = pd.DataFrame(np.array(scores), columns=score_columns)

In [None]:
%%R -i scoremat
# dplyr supplies %>% and select(); reshape2 supplies melt().
library(tidyr)
library(dplyr)
library(ggplot2)
library(reshape2)

# Only the value of the last expression in an %%R cell is auto-printed, so
# intermediate summaries have to be printed explicitly to be shown.
print(summary(scoremat))

# Keep only the WIP and IP columns for the comparison.
scoremat = scoremat %>%
           select(WIP, IP)

# Long format: one row per (Metric, r) pair.
sm.melt = melt(scoremat, value.name="r", variable.name='Metric')
print(summary(sm.melt))

# Paired test: row i of both columns comes from the same replicate set.
# TRUE is spelled out because T is an ordinary, reassignable variable.
t.test(scoremat$WIP, scoremat$IP, paired=TRUE)

In [None]:
%%R

# Violin plot of the correlation values in sm.melt, one violin per Metric,
# with the y axis limited to [0, 1].
p <- ggplot(sm.melt, aes(x=Metric, y=r, fill=Metric)) +
    geom_violin() +
    theme_bw() +
    ylim(0, 1) +
    ylab("Spearman's rho")

# Draw once on the current device, then once more into a PDF file.
print(p)
pdf("replicate-correlation.pdf", width=3, height=4)
print(p)
dev.off()