In [1]:
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from tqdm import tqdm
import pdb
import limix
import pickle as pkl
import numpy as np
import seaborn as sns
from numpy import asarray
import xarray as xr
from copy import deepcopy
import os
from os.path import join
import simplejson as json
import hashlib
import pandas as pd
import pdb
from glob import glob
from joblib import Parallel, delayed
import h5py
import re
from plot import plot_poisson_ordered, plot_distplots, plot_distplot
from util import get_amelie_selection
import h5py
import dask
import dask.dataframe
import dask.array
from urllib.request import urlopen
import gzip
# IPFS support is optional: first probe for the client library ...
try:
    import ipfsapi
except ModuleNotFoundError:
    has_ipfs = False
else:
    has_ipfs = True

# ... then for a daemon on localhost; any connection failure also
# switches IPFS support off.
if has_ipfs:
    try:
        ipfs = ipfsapi.connect('127.0.0.1', 5001)
    except Exception:
        has_ipfs = False

only_hash = True

# Load data (Amelie's and Hannah's)

In [2]:
# Load the pickled traits/kinship data, preferring the local copy and
# falling back to the public IPFS gateway when it is missing.
# NOTE(review): unpickling executes arbitrary code and the fallback payload
# is fetched over plain HTTP -- only run this against a trusted source.
local_pickle = "data/traits-kinship-hannah-amelie.pkl"
if os.path.exists(local_pickle):
    # Context manager so the file handle is closed after loading.
    with open(local_pickle, "rb") as fh:
        data = pkl.load(fh)
else:
    url = "http://ipfs.io/ipfs/QmXtFuoA3JTkhhpUpqh4dyu3FJLUG7DWAmXwUk1N2TMbAh"
    with urlopen(url) as response:
        data = pkl.load(response)

# Output folder layout: <dst_folder>/phenotype and <dst_folder>/kinship.
# makedirs(exist_ok=True) also creates dst_folder itself and is safe to
# re-run (no check-then-create race).
dst_folder = "3"
for subfolder in ('phenotype', 'kinship'):
    os.makedirs(join(dst_folder, subfolder), exist_ok=True)

In [3]:
def isbernoulli(x):
    """Return True when ``x`` holds exactly two distinct values.

    ``x`` is converted to a float array and its distinct values are counted
    with ``np.unique``.
    """
    distinct = np.unique(np.asarray(x, float))
    return len(distinct) == 2

def get_bernoulli(x):
    """Recode a two-valued trait as 0.0 / 1.0.

    The smallest distinct value of ``x`` is mapped to 0.0 and the second
    smallest to 1.0; entries equal to neither are left unchanged.

    Parameters
    ----------
    x : array-like
        Values convertible to float.

    Returns
    -------
    numpy.ndarray
        A new float array; the input is never modified.

    Raises
    ------
    IndexError
        If ``x`` has fewer than two distinct values.
    """
    # np.array (not np.asarray) forces a copy: asarray hands back the caller's
    # own buffer when it is already float64, and the assignments below would
    # then overwrite the caller's data.
    x = np.array(x, dtype=float)
    u = np.unique(x)
    # Both masks are computed before any write so that recoding the first
    # value cannot affect the match for the second.
    i0 = x == u[0]
    i1 = x == u[1]
    x[i0] = 0.0
    x[i1] = 1.0
    return x

def isdiscrete(x):
    """Return True when every finite entry of ``x`` is integer-valued.

    Non-finite entries (NaN, +/-inf) are ignored; an input without any
    finite entry therefore counts as discrete.
    """
    values = np.asarray(x, float)
    finite = values[np.isfinite(values)]
    # A value is integer-valued when casting it to int leaves it unchanged.
    return bool(np.all(finite == finite.astype(int)))

def get_poisson(x):
    """Shift ``x`` so that its smallest finite value is not negative.

    When the minimum over the finite entries is below zero, that minimum is
    subtracted from every entry; otherwise the values are returned as they
    are. Non-finite entries (NaN, +/-inf) are ignored when looking for the
    minimum, mirroring how ``isdiscrete`` ignores them.

    Parameters
    ----------
    x : array-like
        Values convertible to float.

    Returns
    -------
    numpy.ndarray
        A new float array; the input is never modified.
    """
    # Force a copy: np.asarray would alias a float64 input and the in-place
    # shift below would then overwrite the caller's data.
    x = np.array(x, dtype=float)
    finite = np.isfinite(x)
    if not finite.any():
        # Nothing to measure the shift against (empty or all-missing input).
        return x
    # Builtin min() is order-dependent when NaN is present (a leading NaN
    # wins every comparison), so take the minimum over finite entries only.
    mi = x[finite].min()
    if mi < 0:
        x -= mi
    return x

def isnumber(x):
    """Return True when ``x`` can be converted to a float array.

    Parameters
    ----------
    x : array-like
        Candidate values.

    Returns
    -------
    bool
        False when ``np.asarray(x, float)`` rejects the input, True otherwise.
    """
    try:
        np.asarray(x, float)
    except (TypeError, ValueError):
        # ValueError: unparsable strings; TypeError: objects that have no
        # float conversion at all. Either way the data is not numeric.
        return False
    return True

In [4]:
# Read the identifiers stored under row_header/rat of the imputed genotypes
# and decode them from bytes to str.
# The context manager closes the file even if the read fails.
with h5py.File("arrayexpress/HS.hdf5", "r") as f:
    # ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole
    # dataset and works on older releases as well.
    samples = np.asarray(f["/imputed_genotypes/row_header/rat"][()])
samples = [i.decode() for i in samples]

In [5]:
# Work on a deep copy so the loaded ``data`` stays untouched. Columns of the
# 'measures' table that are numeric and integer-valued are passed through
# get_poisson(); every other column is dropped.
data0 = deepcopy(data)
measures = data0['measures']
remove = []
for name in measures.columns.values:
    column = measures.loc[:, name]
    if not (isnumber(column) and isdiscrete(column)):
        remove.append(name)
        continue
    measures.loc[:, name] = get_poisson(column)
# Deletion happens after the scan so the column index is not modified while
# it is being iterated.
for name in remove:
    del measures[name]

In [6]:
# Collect the col_header/pos dataset of chr1 .. chr21 together with a
# chromosome label for every position.
# The context manager closes the file even if a read fails.
with h5py.File("arrayexpress/HS.hdf5", "r") as f:
    pos = []
    chrs = []
    for i in tqdm(range(1, 22)):
        # ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole
        # dataset and works on older releases as well.
        p = f["/imputed_genotypes/chr{}/col_header/pos".format(i)][()]
        pos.append(p)
        chrs.append([i] * len(p))

# Flatten the per-chromosome pieces into two parallel 1-D arrays.
pos = np.concatenate(pos).astype(float)
chrs = np.concatenate(chrs).astype(int)

100%|██████████| 21/21 [00:03<00:00,  6.82it/s]


In [7]:
# Selection returned by the project-local ``util.get_amelie_selection``.
# It is handed to plot_this(), which only tests ``trait in amelie_selection``
# to decide whether a plot title is drawn in red.
# NOTE(review): presumably a collection of trait names -- confirm in util.py.
amelie_selection = get_amelie_selection()

In [None]:
# Full list of result-file patterns, kept for reference; only the subset
# assigned to ``patts`` further down is currently scanned.
# patts = ["null_measures_normal_(.*)_quantile_gaussianize.json.pkl",
#          "null_measures_normal_(.*)_mean_standardize.json.pkl",
#          "null_ipheno_normal_(.*)_mean_standardize.json.pkl",
#          "null_ipheno_normal_(.*)_quantile_gaussianize.json.pkl",
#          "null_measures_poisson_(.*).json.pkl"]

# Regular expressions applied to result file names with re.match; the single
# capture group is the trait name.
patts = ["null_measures_normal_(.*)_quantile_gaussianize.json.pkl",
         "null_measures_poisson_(.*).json.pkl"]

# Short model label for every known pattern. The label becomes part of the
# figure title and of the output folder path in plot_this().
patt2name = {'null_ipheno_normal_(.*)_mean_standardize.json.pkl':'INormalStd',
             'null_ipheno_normal_(.*)_quantile_gaussianize.json.pkl':'INormalGau',
             'null_measures_poisson_(.*).json.pkl':'MPoisson',
             'null_measures_normal_(.*)_mean_standardize.json.pkl':'MNormalStd',
             'null_measures_normal_(.*)_quantile_gaussianize.json.pkl':'MNormalGau'}

def plot_this(d):
    """Save one QQ-plot PNG per model for the null p-values of one trait.

    Parameters
    ----------
    d : list of dict
        One entry per model. Each entry provides the keys ``"trait"``,
        ``"model"`` and ``"path"`` (a gzip-compressed pickle holding a mapping
        with a ``"pv"`` entry). The first entry additionally supplies
        ``"pos"`` and ``"amelie_selection"`` for the whole list.

    Returns
    -------
    None
        Figures are written to ``3/fig/null/<model>/<trait>/qqplot.png``.
        Processing stops at the first entry whose p-value vector does not
        have the expected length.
    """
    if not d:
        return None

    # The p-value vector must be exactly five times as long as the position
    # vector. NOTE(review): presumably five null replicates are concatenated
    # per trait -- confirm against the code that writes the pickles.
    n_repeats = 5
    expected_len = n_repeats * len(d[0]["pos"])
    amelie_selection = d[0]["amelie_selection"]

    for di in d:
        trait = di["trait"]
        model = di["model"]

        # Load one result at a time so that only a single p-value vector is
        # held in memory.
        # NOTE(review): pickle.load runs arbitrary code; only read trusted files.
        with gzip.open(di["path"], 'rb') as f:
            pv = pkl.load(f)["pv"]

        if len(pv) != expected_len:
            # NOTE(review): this abandons the remaining models of the trait
            # as well; switch to ``continue`` if only this model should be
            # skipped.
            return None

        # qqplot keeps receiving the p-values as a pandas Series named "pv".
        pv_series = pd.DataFrame(data={"pv": pv})["pv"]

        plt.figure(figsize=(6, 6))
        try:
            limix.plot.qqplot(pv_series)
            ax = plt.gca()
            # Traits that are part of the selection get a red title.
            if trait in amelie_selection:
                ax.set_title(f"{trait} - {model}", color='red')
            else:
                ax.set_title(f"{trait} - {model}")
            folder = f"3/fig/null/{model}/{trait}"
            # exist_ok tolerates re-runs and parallel workers creating the
            # same folder.
            os.makedirs(folder, exist_ok=True)
            plt.savefig(folder + "/qqplot.png", bbox_inches='tight')
        finally:
            # Release the figure even when plotting or saving fails, so that
            # long-running workers do not accumulate open figures.
            plt.close()

# Group the gzipped result pickles found in ``dst_folder`` by trait. Every
# file name is matched against the active patterns; each hit adds one entry
# carrying everything plot_this() needs.
args = []
# NOTE(review): this rebinds ``data`` (the loaded traits/kinship object);
# cells above that read it need a fresh load before they can be re-run.
data = dict()
for path in glob(join(dst_folder, "*.json.pkl.gz")):
    # basename() instead of split("/")[1]: independent of the path separator
    # and of how many directory levels dst_folder contains.
    filename = os.path.basename(path)
    for patt in patts:
        match = re.match(patt, filename)
        if match:
            trait = match.group(1)
            model = patt2name[patt]
            data.setdefault(trait, []).append(
                {"trait": trait, "model": model, "path": path, "pos": pos,
                 "chrs":chrs, "amelie_selection":amelie_selection})

# Alternative sharded execution for command-line use, kept for reference:
# N = int(sys.argv[1])
# seed = int(sys.argv[2])

# for (i, a) in enumerate(list(data.values())):
#     if i % N == seed:
#         plot_this(a)

# One task per trait, spread over 10 worker processes.
_ = Parallel(n_jobs=10, verbose=50, backend="multiprocessing")(delayed(plot_this)(a) for a in list(data.values()))

[Parallel(n_jobs=10)]: Using backend MultiprocessingBackend with 10 concurrent workers.
