In [1]:
from tqdm import tqdm
import pdb
import limix
import pickle as pkl
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from numpy import asarray
import xarray as xr
from copy import deepcopy
import os
from os.path import join
import simplejson as json
import hashlib
import pandas as pd
import pdb
from glob import glob
import h5py
import re
from plot import plot_poisson_ordered, plot_distplots, plot_distplot
from util import get_amelie_selection
import h5py
import dask
import dask.dataframe
import dask.array
from urllib.request import urlopen
# Optional IPFS support: `has_ipfs` ends up True only when the `ipfsapi`
# package is importable AND a client connection to the local daemon succeeds.
try:
    import ipfsapi
except ModuleNotFoundError:
    has_ipfs = False
else:
    has_ipfs = True
    try:
        ipfs = ipfsapi.connect('127.0.0.1', 5001)
    except Exception:
        # Best effort: any connection problem simply disables IPFS usage.
        has_ipfs = False

only_hash = True

  return f(*args, **kwds)


# Load data (Amelie's and Hannah's)

In [2]:
# Load the traits/kinship bundle: use the local pickle when it exists,
# otherwise download it from the public IPFS gateway.
# NOTE(review): unpickling executes arbitrary code and the fallback URL is
# plain HTTP -- only run this against a source you trust.
local_path = "data/traits-kinship-hannah-amelie.pkl"
if os.path.exists(local_path):
    with open(local_path, "rb") as fh:
        data = pkl.load(fh)
else:
    url = "http://ipfs.io/ipfs/QmXtFuoA3JTkhhpUpqh4dyu3FJLUG7DWAmXwUk1N2TMbAh"
    with urlopen(url) as response:
        data = pkl.load(response)

# Output layout: <dst_folder>/phenotype and <dst_folder>/kinship.
# makedirs(exist_ok=True) also creates dst_folder itself and does not fail
# when the directories are already there.
dst_folder = "3"
for subfolder in ("phenotype", "kinship"):
    os.makedirs(join(dst_folder, subfolder), exist_ok=True)

In [3]:
def isbernoulli(x):
    """Return True when ``x`` (converted to float) has exactly two distinct values."""
    distinct = np.unique(np.asarray(x, float))
    return len(distinct) == 2

def get_bernoulli(x):
    """Recode a two-valued sample as 0.0 / 1.0.

    The smallest distinct value of ``x`` is mapped to 0.0 and the second
    smallest to 1.0. Entries equal to neither of those two values are left
    as they are.

    Parameters
    ----------
    x : array-like convertible to float

    Returns
    -------
    numpy.ndarray
        A new float array; the input is never modified.

    Raises
    ------
    IndexError
        If ``x`` has fewer than two distinct values.
    """
    # np.array (not np.asarray) forces a copy: with asarray a float ndarray
    # input would be aliased and overwritten by the assignments below.
    x = np.array(x, dtype=float)
    u = np.unique(x)
    # Build both masks before writing so the recoding of one value cannot
    # affect the selection of the other.
    i0 = x == u[0]
    i1 = x == u[1]
    x[i0] = 0.0
    x[i1] = 1.0
    return x

def isdiscrete(x):
    """Return True when every finite entry of ``x`` is integer-valued.

    Non-finite entries (NaN, +/-inf) are ignored.
    """
    values = np.asarray(x, float)
    finite = values[np.isfinite(values)]
    return all(finite == finite.astype(int))

def get_poisson(x):
    """Shift a sample so that its smallest finite value is non-negative.

    If the minimum over the finite entries of ``x`` is negative, that minimum
    is subtracted from every entry; otherwise the values are returned
    unchanged.

    Parameters
    ----------
    x : array-like convertible to float

    Returns
    -------
    numpy.ndarray
        A new float array; the input is never modified. Non-finite entries
        do not take part in finding the minimum.
    """
    # Copy so the in-place shift below cannot leak into the caller's array.
    x = np.array(x, dtype=float)
    # The builtin min() is order-dependent in the presence of NaN (a leading
    # NaN makes every comparison False), so take the minimum over finite
    # entries only.
    finite = x[np.isfinite(x)]
    if finite.size > 0:
        lowest = finite.min()
        if lowest < 0:
            x -= lowest
    return x

def isnumber(x):
    """Return True when ``x`` can be converted to a float numpy array.

    Both failure modes of the conversion are treated as "not numeric":
    ``ValueError`` (e.g. non-numeric strings) and ``TypeError`` (e.g.
    arbitrary Python objects).
    """
    try:
        np.asarray(x, float)
    except (ValueError, TypeError):
        return False
    return True

In [4]:
def _align_samples(pheno, kinship, sample_ids):
    """Return copies of ``pheno`` and ``kinship`` restricted to ``sample_ids``, in that order."""
    pheno = pheno.reindex(sample_ids).copy()
    kinship = kinship.reindex(index=sample_ids, columns=sample_ids).copy()

    assert all(kinship.index.values == kinship.columns.values)
    assert all(kinship.index.values == pheno.index.values)
    return pheno, kinship


def _store_by_hash(obj, subfolder, suffix):
    """Pickle ``obj`` into ``dst_folder/subfolder`` named by its SHA-1 digest; return the digest."""
    payload = pkl.dumps(obj)
    digest = hashlib.sha1(payload).hexdigest()
    with open(join(dst_folder, subfolder, digest + suffix), "wb") as f:
        f.write(payload)
    return digest


def analysis_QTL(G, pheno, kinship, pheno_name, lik, pheno_norm):
    """Run a limix QTL scan for one phenotype and return a summary dict.

    Steps:
      1. keep only samples present in both ``kinship`` and ``pheno``;
      2. drop samples whose ``pheno_name`` value is not finite;
      3. normalise the phenotype with ``pheno_norm``;
      4. pickle the normalised phenotype and the aligned kinship into the
         module-level ``dst_folder`` under content-hash file names;
      5. call ``limix.qtl.scan(G, y, lik, kinship, verbose=True)``.

    Parameters
    ----------
    G : genotype container handed unchanged to ``limix.qtl.scan``.
    pheno : pandas.DataFrame indexed by sample, containing column ``pheno_name``.
    kinship : pandas.DataFrame whose index and columns are both sample ids.
    pheno_name : name of the phenotype column to analyse.
    lik : likelihood specification handed unchanged to ``limix.qtl.scan``.
    pheno_norm : callable applied to the phenotype column before the scan.

    Returns
    -------
    dict or None
        A dict with the p-values, effect sizes, the likelihood, the call
        string and the phenotype/kinship digests; ``None`` when the scan
        raises ``ValueError`` (the error and phenotype name are printed).

    Raises
    ------
    ValueError
        If non-finite phenotype values remain after filtering.
    """
    common_samples = set(kinship.index.values).intersection(pheno.index.values)
    # Sort: set iteration order is not stable across interpreter runs, and the
    # sample order feeds into the pickled bytes (hence the digests) below.
    common_samples = np.asarray(sorted(common_samples))
    pheno, kinship = _align_samples(pheno, kinship, common_samples)

    # Second pass: discard samples with a missing / non-finite phenotype.
    ok = np.asarray(np.isfinite(pheno.loc[:, pheno_name]), dtype=bool)
    common_samples = common_samples[ok]
    pheno, kinship = _align_samples(pheno, kinship, common_samples)

    if not all(np.isfinite(pheno[pheno_name])):
        raise ValueError("not all finite: {}".format(pheno_name))

    y = pheno_norm(pheno.loc[:, pheno_name])
    y = pd.Series(y, index=pheno.loc[:, pheno_name].index)

    phenotype_hex = _store_by_hash(y, "phenotype", ".series.pkl")
    kinship_hex = _store_by_hash(kinship, "kinship", ".dataframe.pkl")

    # Textual record of the call, stored next to the results.
    call = "limix.qtl.scan(G, y, lik, kinship, verbose=True)"
    try:
        model = limix.qtl.scan(G, y, lik, kinship, verbose=True)
    except ValueError as e:
        print(e)
        print("Pheno name: {}".format(pheno_name))
        return None

    return dict(
        pv=list(asarray(model.variant_pvalues)),
        lik=lik,
        call=call,
        phenotype=phenotype_hex,
        kinship=kinship_hex,
        null_covariate_effsizes=list(asarray(model.null_covariate_effsizes)),
        variant_effsizes=list(asarray(model.variant_effsizes)),
        variant_effsizes_se=list(asarray(model.variant_effsizes_se)),
    )

In [5]:
# Read the sample identifiers stored alongside the imputed genotypes.
# `dataset[()]` reads the whole dataset; it replaces the deprecated
# `.value` attribute. The context manager closes the file even if the
# read fails.
with h5py.File("arrayexpress/HS.hdf5", "r") as f:
    samples = np.asarray(f["/imputed_genotypes/row_header/rat"][()])
# The identifiers are stored as bytes; decode them to str.
samples = [i.decode() for i in samples]

In [6]:
# Work on a deep copy so `data` stays untouched. Keep only the measures that
# are numeric and integer-valued, shifted via get_poisson; drop all others.
data0 = deepcopy(data)
measures = data0['measures']
remove = []
for name in measures.columns.values:
    column = measures.loc[:, name]
    if not (isnumber(column) and isdiscrete(column)):
        remove.append(name)
        continue
    measures.loc[:, name] = get_poisson(column)
for name in remove:
    del measures[name]

In [7]:
# Lazily open the genotype array of each chromosome group (chr1 .. chr21)
# and give its two dimensions uniform names: first -> "snps", second -> "samples".
Gs = []
for chrom in tqdm(range(1, 22)):
    group = "/imputed_genotypes/chr{}".format(chrom)
    G = xr.open_dataarray("arrayexpress/HS.hdf5", group, chunks=1000)
    first_dim, second_dim = G.dims[0], G.dims[1]
    G = G.rename({first_dim: "snps", second_dim: "samples"})
    Gs.append(G)

100%|██████████| 21/21 [00:00<00:00, 53.35it/s]


In [8]:
# Stack all chromosomes along "snps", then transpose the result.
G = xr.concat(Gs, dim="snps").transpose()
# Label the sample axis with the decoded sample names and number the
# entries along the second axis consecutively.
G['samples'] = samples
n_snps = G.shape[1]
G['snps'] = range(n_snps)

In [None]:
# Run the normal-likelihood scan (mean-standardised phenotype) for every
# retained measure. A measure is skipped when its result file or its
# ".failed" marker already exists, so the loop can be resumed.
for name in data0['measures'].columns.values:
    print("Processing: {}".format(name))
    dst_file = join(dst_folder, "scan_measures_normal_" + name + "_mean_standardize.json")
    failed_marker = dst_file + ".failed"
    if os.path.exists(dst_file) or os.path.exists(failed_marker):
        continue
    r = analysis_QTL(G, data0['measures'], data0['kinship'], name, 'normal', limix.qc.mean_standardize)
    # Use context managers so the files are flushed and closed right away.
    if r is None:
        # Empty marker file: remember that this measure failed.
        with open(failed_marker, "w") as marker:
            marker.write("")
    else:
        with open(dst_file, "w") as out:
            json.dump(r, out)

Processing: Boli_tot
Processing: Distance0_30
Processing: Distance10_15
Processing: Distance15_20
Processing: Distance20_25


Normalising input... 

  return self.array[key]


done (6.15 seconds).


42it [00:00, 347.18it/s]

Scale: 1.38772
Delta: 0.5
Beta: [-0.]
Scale: 1.38766
Delta: 0.5
Beta: [-0.41410821]
Scale: 1.38766
Delta: 0.5
Beta: [-0.41410817]
Scale: 1.38766
Delta: 0.500001
Beta: [-0.41410807]
Scale: 1.38766
Delta: 0.500001
Beta: [-0.41410789]
Scale: 1.38766
Delta: 0.500002
Beta: [-0.41410751]
Scale: 1.38765
Delta: 0.500005
Beta: [-0.41410677]
Scale: 1.38765
Delta: 0.500009
Beta: [-0.41410527]
Scale: 1.38763
Delta: 0.500019
Beta: [-0.41410229]
Scale: 1.3876
Delta: 0.500038
Beta: [-0.41409631]
Scale: 1.38754
Delta: 0.500076
Beta: [-0.41408437]
Scale: 1.38741
Delta: 0.500153
Beta: [-0.41406047]
Scale: 1.38716
Delta: 0.500305
Beta: [-0.41401267]
Scale: 1.38666
Delta: 0.50061
Beta: [-0.41391705]
Scale: 1.38565
Delta: 0.501221
Beta: [-0.4137257]
Scale: 1.38365
Delta: 0.502441
Beta: [-0.41334254]
Scale: 1.37968
Delta: 0.504883
Beta: [-0.41257444]
Scale: 1.37182
Delta: 0.509765
Beta: [-0.41103125]
Scale: 1.35649
Delta: 0.519522
Beta: [-0.40791776]
Scale: 1.3273
Delta: 0.538985
Beta: [-0.40158921]
Scale: 


Scanning: 100%|██████████| 30/30 [36:31<00:00, 73.06s/it]


Processing: Distance25


Normalising input... 

  return self.array[key]


done (13.86 seconds).


9it [00:00, 64.21it/s]

Scale: 1.34649
Delta: 0.5
Beta: [-0.]
Scale: 1.34621
Delta: 0.5
Beta: [-0.86950692]
Scale: 1.34621
Delta: 0.5
Beta: [-0.86950683]
Scale: 1.34621
Delta: 0.500001
Beta: [-0.86950666]
Scale: 1.34621
Delta: 0.500001
Beta: [-0.86950631]
Scale: 1.34621
Delta: 0.500002
Beta: [-0.86950561]
Scale: 1.3462
Delta: 0.500005
Beta: [-0.86950422]
Scale: 1.3462
Delta: 0.500009
Beta: [-0.86950143]
Scale: 1.34618
Delta: 0.500019
Beta: [-0.86949585]
Scale: 1.34615
Delta: 0.500038
Beta: [-0.8694847]
Scale: 1.34609
Delta: 0.500076
Beta: [-0.8694624]
Scale: 1.34597
Delta: 0.500153
Beta: [-0.86941778]
Scale: 1.34574
Delta: 0.500305
Beta: [-0.86932855]
Scale: 1.34526
Delta: 0.50061
Beta: [-0.86915004]
Scale: 1.34431
Delta: 0.501221
Beta: [-0.86879286]
Scale: 1.34242
Delta: 0.502441
Beta: [-0.86807788]
Scale: 1.33866
Delta: 0.504883
Beta: [-0.86664541]
Scale: 1.33124
Delta: 0.509765
Beta: [-0.86377055]
Scale: 1.31675
Delta: 0.519522
Beta: [-0.85798216]
Scale: 1.28921
Delta: 0.538985
Beta: [-0.8462582]
Scale: 1.

37it [00:00, 181.38it/s]
Scanning: 100%|██████████| 30/30 [45:29<00:00, 90.99s/it]


Processing: Distance25_30


Normalising input... done (8.91 seconds).


48it [00:00, 338.18it/s]

Scale: 1.40729
Delta: 0.5
Beta: [-0.]
Scale: 1.40712
Delta: 0.5
Beta: [-0.66470104]
Scale: 1.40712
Delta: 0.5
Beta: [-0.66470096]
Scale: 1.40712
Delta: 0.500001
Beta: [-0.6647008]
Scale: 1.40712
Delta: 0.500001
Beta: [-0.66470049]
Scale: 1.40712
Delta: 0.500002
Beta: [-0.66469986]
Scale: 1.40712
Delta: 0.500005
Beta: [-0.66469859]
Scale: 1.40711
Delta: 0.500009
Beta: [-0.66469607]
Scale: 1.40709
Delta: 0.500019
Beta: [-0.66469101]
Scale: 1.40706
Delta: 0.500038
Beta: [-0.66468091]
Scale: 1.407
Delta: 0.500076
Beta: [-0.66466069]
Scale: 1.40687
Delta: 0.500153
Beta: [-0.66462026]
Scale: 1.40661
Delta: 0.500305
Beta: [-0.66453939]
Scale: 1.4061
Delta: 0.50061
Beta: [-0.66437761]
Scale: 1.40507
Delta: 0.501221
Beta: [-0.66405391]
Scale: 1.40303
Delta: 0.502441
Beta: [-0.66340591]
Scale: 1.39897
Delta: 0.504883
Beta: [-0.66210754]
Scale: 1.39095
Delta: 0.509765
Beta: [-0.65950147]
Scale: 1.37529
Delta: 0.519522
Beta: [-0.65425302]
Scale: 1.34548
Delta: 0.538985
Beta: [-0.64361881]
Scale: 1


Scanning: 100%|██████████| 30/30 [34:08<00:00, 68.27s/it]


Processing: Distance5


Normalising input... 

  return self.array[key]


done (6.95 seconds).


38it [00:00, 488.25it/s]

Scale: 1.27776
Delta: 0.5
Beta: [-0.]
Scale: 1.27723
Delta: 0.5
Beta: [-1.19937666]
Scale: 1.27723
Delta: 0.5
Beta: [-1.19937649]
Scale: 1.27723
Delta: 0.500001
Beta: [-1.19937616]
Scale: 1.27723
Delta: 0.500001
Beta: [-1.19937548]
Scale: 1.27723
Delta: 0.500002
Beta: [-1.19937413]
Scale: 1.27722
Delta: 0.500005
Beta: [-1.19937143]
Scale: 1.27721
Delta: 0.500009
Beta: [-1.19936603]
Scale: 1.2772
Delta: 0.500019
Beta: [-1.19935524]
Scale: 1.27717
Delta: 0.500038
Beta: [-1.19933364]
Scale: 1.27712
Delta: 0.500076
Beta: [-1.19929046]
Scale: 1.27701
Delta: 0.500153
Beta: [-1.19920408]
Scale: 1.27679
Delta: 0.500305
Beta: [-1.19903134]
Scale: 1.27635
Delta: 0.50061
Beta: [-1.19868586]
Scale: 1.27547
Delta: 0.501221
Beta: [-1.19799494]
Scale: 1.27373
Delta: 0.502441
Beta: [-1.19661325]
Scale: 1.27025
Delta: 0.504883
Beta: [-1.1938505]
Scale: 1.26339
Delta: 0.509765
Beta: [-1.18832761]
Scale: 1.25002
Delta: 0.519522
Beta: [-1.17729307]
Scale: 1.2246
Delta: 0.538985
Beta: [-1.15527515]
Scale: 


Scanning: 100%|██████████| 30/30 [33:33<00:00, 67.13s/it]


Processing: Distance5_10
Processing: Rearing0_30
Processing: Rearing10_15
Processing: Rearing15_20
Processing: Rearing20_25
Processing: Rearing25
Processing: Rearing5
Processing: Rearing5_10
Processing: ALP


Normalising input... 

  return self.array[key]


done (8.08 seconds).


33it [00:00, 367.96it/s]

Scale: 1.1105
Delta: 0.5
Beta: [-0.]
Scale: 1.1105
Delta: 0.5
Beta: [-0.01661783]
Scale: 1.1105
Delta: 0.5
Beta: [-0.01661782]
Scale: 1.1105
Delta: 0.500001
Beta: [-0.0166178]
Scale: 1.1105
Delta: 0.500001
Beta: [-0.01661777]
Scale: 1.1105
Delta: 0.500002
Beta: [-0.01661772]
Scale: 1.11049
Delta: 0.500005
Beta: [-0.0166176]
Scale: 1.11049
Delta: 0.500009
Beta: [-0.01661737]
Scale: 1.11048
Delta: 0.500019
Beta: [-0.01661691]
Scale: 1.11046
Delta: 0.500038
Beta: [-0.01661598]
Scale: 1.11042
Delta: 0.500076
Beta: [-0.01661413]
Scale: 1.11033
Delta: 0.500153
Beta: [-0.01661043]
Scale: 1.11017
Delta: 0.500305
Beta: [-0.01660302]
Scale: 1.10984
Delta: 0.50061
Beta: [-0.01658818]
Scale: 1.10918
Delta: 0.501221
Beta: [-0.01655846]
Scale: 1.10787
Delta: 0.502441
Beta: [-0.01649876]
Scale: 1.10526
Delta: 0.504883
Beta: [-0.0163784]
Scale: 1.1001
Delta: 0.509765
Beta: [-0.01613385]
Scale: 1.09007
Delta: 0.519522
Beta: [-0.0156294]
Scale: 1.07105
Delta: 0.538985
Beta: [-0.01455944]
Scale: 1.03701



Scanning: 100%|██████████| 30/30 [39:19<00:00, 78.66s/it]


Processing: Chloride
Processing: Sodium
Processing: BW_week9
Processing: Has1kidney
Processing: PLT
Processing: AA_IL_nb
Processing: AA_IL_score
Processing: AA_nb
Processing: AA_score
Processing: IL_nb
Processing: IL_score
Processing: Avoidances1_10
Processing: Avoidances1_20
Processing: Avoidances1_40


Normalising input... done (10.75 seconds).


0it [00:00, ?it/s]

Scale: 1.22413
Delta: 0.5
Beta: [-0.]
Scale: 1.2241
Delta: 0.5
Beta: [-0.30915243]
Scale: 1.2241
Delta: 0.5
Beta: [-0.30915232]
Scale: 1.2241
Delta: 0.500001
Beta: [-0.30915209]
Scale: 1.2241
Delta: 0.500001
Beta: [-0.30915163]
Scale: 1.2241
Delta: 0.500002
Beta: [-0.30915072]
Scale: 1.22409
Delta: 0.500005
Beta: [-0.30914889]
Scale: 1.22409
Delta: 0.500009
Beta: [-0.30914523]
Scale: 1.22407
Delta: 0.500019
Beta: [-0.30913792]
Scale: 1.22404
Delta: 0.500038
Beta: [-0.30912329]


33it [00:00, 417.02it/s]

Scale: 1.22399
Delta: 0.500076
Beta: [-0.30909403]
Scale: 1.22388
Delta: 0.500153
Beta: [-0.30903552]
Scale: 1.22367
Delta: 0.500305
Beta: [-0.30891853]
Scale: 1.22324
Delta: 0.50061
Beta: [-0.30868462]
Scale: 1.22238
Delta: 0.501221
Beta: [-0.30821716]
Scale: 1.22067
Delta: 0.502441
Beta: [-0.30728366]
Scale: 1.21726
Delta: 0.504883
Beta: [-0.30542232]
Scale: 1.21053
Delta: 0.509765
Beta: [-0.30172214]
Scale: 1.19741
Delta: 0.519522
Beta: [-0.29441127]
Scale: 1.17245
Delta: 0.538985
Beta: [-0.28014247]
Scale: 1.12741
Delta: 0.5775
Beta: [-0.25297726]
Scale: 1.05477
Delta: 0.651363
Beta: [-0.20382433]
Scale: 0.964133
Delta: 0.777312
Beta: [-0.12383683]
Scale: 0.910208
Delta: 0.924152
Beta: [-0.02047865]
Scale: 0.929212
Delta: 0.849099
Beta: [0.05604233]
Scale: 0.917638
Delta: 0.883155
Beta: [0.02312381]
Scale: 0.931343
Delta: 0.84379
Beta: [0.03992823]
Scale: 0.932111
Delta: 0.841922
Beta: [0.02026897]
Scale: 0.932011
Delta: 0.842165
Beta: [0.0192498]
Scale: 0.932008
Delta: 0.842171
Be


Scanning: 100%|██████████| 30/30 [43:45<00:00, 87.51s/it]


In [None]:
# for name in data0['measures'].columns.values:
#     print("Processing: {}".format(name))
#     dst_file = join(dst_folder, "scan_measures_normal_" + name + "_quantile_gaussianize.json")
#     if os.path.exists(dst_file) or os.path.exists(dst_file + ".failed"):
#         continue
#     r = analysis_QTL(G, data0['measures'], data0['kinship'], name, 'normal', limix.qc.quantile_gaussianize)
#     if r is None:
#         open(dst_file + ".failed", "w").write("")
#     else:
#         json.dump(r, open(dst_file, "w"))

In [None]:
# Close the module-level handle `f` (bound to the HDF5 file that the sample
# names are read from).
# NOTE(review): that handle is already closed right after the sample names
# are read, so this call looks redundant -- confirm before removing.
f.close()