In [1]:
from tqdm import tqdm
import pdb
import limix
import pickle as pkl
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from numpy import asarray
import xarray as xr
from copy import deepcopy
import os
from os.path import join
import simplejson as json
import hashlib
import pandas as pd
import pdb
from glob import glob
import h5py
import re
from plot import plot_poisson_ordered, plot_distplots, plot_distplot
from util import get_amelie_selection
import h5py
import dask
import dask.dataframe
import dask.array
from urllib.request import urlopen
# ipfsapi is an optional dependency: record whether it can be imported at all.
try:
    import ipfsapi
except ModuleNotFoundError:
    has_ipfs = False
else:
    has_ipfs = True

# Being importable is not enough: also try to connect to 127.0.0.1:5001.
# Any failure while connecting simply switches IPFS support off.
if has_ipfs:
    try:
        ipfs = ipfsapi.connect('127.0.0.1', 5001)
    except Exception:
        has_ipfs = False

# Module-level flag; none of the cells visible here read it.
only_hash = True

  return f(*args, **kwds)


# Load data (Amelie's and Hannah's)

In [2]:
# Load the pickled traits/kinship bundle, preferring the local copy over the
# public IPFS gateway. Handles are closed deterministically via `with`.
if os.path.exists("data/traits-kinship-hannah-amelie.pkl"):
    with open("data/traits-kinship-hannah-amelie.pkl", "rb") as pkl_file:
        data = pkl.load(pkl_file)
else:
    # NOTE(review): unpickling executes arbitrary code and this payload is
    # fetched over plain HTTP from a public gateway -- only run this branch if
    # the source is trusted (or download and verify the file manually first).
    url = "http://ipfs.io/ipfs/QmXtFuoA3JTkhhpUpqh4dyu3FJLUG7DWAmXwUk1N2TMbAh"
    with urlopen(url) as response:
        data = pkl.load(response)

# Output layout: <dst_folder>/phenotype and <dst_folder>/kinship.
# makedirs(exist_ok=True) also creates dst_folder itself and does not fail
# when the folders already exist.
dst_folder = "3"
for subfolder in ("phenotype", "kinship"):
    os.makedirs(join(dst_folder, subfolder), exist_ok=True)

In [3]:
def isbernoulli(x):
    """Return True when `x` takes exactly two distinct values.

    `x` is cast to float first. NaN entries are treated as missing and are
    ignored, so a two-valued trait with missing observations still qualifies
    (this mirrors `isdiscrete`, which also skips entries it cannot judge).

    Parameters
    ----------
    x : array-like convertible to float.

    Returns
    -------
    bool
    """
    x = np.asarray(x, float)
    observed = x[~np.isnan(x)]
    return len(np.unique(observed)) == 2

def get_bernoulli(x):
    """Recode a two-valued array as 0.0 / 1.0.

    The smaller distinct value becomes 0.0 and the next one becomes 1.0; any
    other entries are left unchanged. The input is never modified: the result
    is a fresh float array.

    Parameters
    ----------
    x : array-like convertible to float, with at least two distinct values.

    Returns
    -------
    numpy.ndarray of float

    Raises
    ------
    IndexError
        If `x` has fewer than two distinct values.
    """
    # np.array always copies, so the caller's array is not overwritten below
    # (np.asarray would hand back the very same object for float input).
    x = np.array(x, dtype=float)
    u = np.unique(x)
    # Both masks are computed before any write so the recoding cannot
    # interfere with itself (e.g. when the original values are 1 and 0).
    is_low = x == u[0]
    is_high = x == u[1]
    x[is_low] = 0.0
    x[is_high] = 1.0
    return x

def isdiscrete(x):
    """Tell whether every finite entry of `x` is integer-valued.

    `x` is cast to float; NaN and +/-inf entries are skipped before the
    comparison. An input with no finite entries yields True.
    """
    values = np.asarray(x, float)
    finite = values[np.isfinite(values)]
    truncated = finite.astype(int)
    return all(finite == truncated)

def get_poisson(x):
    """Shift `x` so that its smallest finite value is not negative.

    If the minimum over the finite entries is negative, that minimum is
    subtracted from every entry; otherwise the values are returned unchanged.
    Non-finite entries (NaN, +/-inf) do not take part in finding the minimum.
    The input is never modified: the result is a fresh float array.

    Parameters
    ----------
    x : array-like convertible to float.

    Returns
    -------
    numpy.ndarray of float
    """
    # Copy first: an in-place shift on np.asarray(x) would silently alter the
    # caller's data whenever x is already a float array.
    x = np.array(x, dtype=float)
    finite = x[np.isfinite(x)]
    # The builtin min() gives order-dependent answers when NaN is present
    # (every comparison with NaN is False), so only finite values are reduced.
    if finite.size > 0:
        lowest = finite.min()
        if lowest < 0:
            x -= lowest
    return x

def isnumber(x):
    """Return True when `x` can be converted to a float array, else False.

    Both failure modes of the conversion are treated as "not a number":
    ValueError (e.g. non-numeric strings) and TypeError (e.g. arbitrary
    objects that float() does not accept).
    """
    try:
        np.asarray(x, float)
    except (TypeError, ValueError):
        return False
    return True

In [4]:
def _pickle_by_sha1(obj, subfolder, suffix):
    """Pickle `obj` to dst_folder/<subfolder>/<sha1><suffix>; return the SHA-1 hex digest."""
    payload = pkl.dumps(obj)
    digest = hashlib.sha1(payload).hexdigest()
    with open(join(dst_folder, subfolder, digest + suffix), "wb") as f:
        f.write(payload)
    return digest


def analysis_QTL(G, pheno, kinship, pheno_name, lik, pheno_norm):
    """Run a limix QTL scan for one phenotype and archive the scan inputs.

    The phenotype table and the kinship matrix are restricted to the samples
    they share and for which `pheno_name` is finite. The normalised phenotype
    and the aligned kinship matrix are pickled under the module-level
    `dst_folder`, in files named after the SHA-1 of their pickled bytes.

    Parameters
    ----------
    G : genotype array passed straight to `limix.qtl.scan`.
    pheno : pandas.DataFrame indexed by sample, one column per phenotype.
    kinship : pandas.DataFrame with samples on both index and columns.
    pheno_name : name of the column of `pheno` to analyse.
    lik : likelihood specification forwarded to `limix.qtl.scan`.
    pheno_norm : callable applied to the phenotype column before scanning.

    Returns
    -------
    dict or None
        None when `limix.qtl.scan` raises ValueError (the error is printed).
        Otherwise a dict holding the p-values, effect sizes and their standard
        errors, the likelihood, the call string and the two SHA-1 digests.

    Raises
    ------
    ValueError
        If the selected phenotype values are not all finite after filtering.
    """
    # Sort the shared samples: set iteration order is not guaranteed (and can
    # differ between interpreter runs for strings), which would change the
    # pickled bytes and therefore the content hashes from run to run.
    common_samples = np.asarray(
        sorted(set(kinship.index.values).intersection(pheno.index.values)))

    # Keep only samples whose phenotype value is finite. The mask is turned
    # into a plain boolean array so it indexes `common_samples` by position.
    finite = np.asarray(np.isfinite(pheno.reindex(common_samples).loc[:, pheno_name]), bool)
    common_samples = common_samples[finite]

    pheno = pheno.reindex(common_samples).copy()
    kinship = kinship.reindex(index=common_samples, columns=common_samples).copy()

    # Sanity checks: everything must now be aligned on the same sample order.
    assert(all(kinship.index.values == kinship.columns.values))
    assert(all(kinship.index.values == pheno.index.values))

    if not all(np.isfinite(pheno[pheno_name])):
        raise ValueError("not all finite: {}".format(pheno_name))

    y = pheno_norm(pheno.loc[:, pheno_name])
    y = pd.Series(y, index=pheno.loc[:, pheno_name].index)

    # Archive exactly what is fed to the scan, addressed by content hash.
    phenotype_hex = _pickle_by_sha1(y, "phenotype", ".series.pkl")
    kinship_hex = _pickle_by_sha1(kinship, "kinship", ".dataframe.pkl")

    # Stored with the results as a record of how the scan was invoked.
    call = "limix.qtl.scan(G, y, lik, kinship, verbose=True)"
    try:
        model = limix.qtl.scan(G, y, lik, kinship, verbose=True)
    except ValueError as e:
        # Best effort: report the failure and let the caller mark it as failed.
        print(e)
        print("Pheno name: {}".format(pheno_name))
        return None

    return dict(pv=list(asarray(model.variant_pvalues)), lik=lik, call=call,
                phenotype=phenotype_hex, kinship=kinship_hex,
                null_covariate_effsizes=list(asarray(model.null_covariate_effsizes)),
                variant_effsizes=list(asarray(model.variant_effsizes)),
                variant_effsizes_se=list(asarray(model.variant_effsizes_se)))

In [5]:
# Read the identifiers stored under /imputed_genotypes/row_header/rat.
# The `with` block closes the file even if the read fails; `f` stays bound
# (to the closed file) afterwards.
with h5py.File("arrayexpress/HS.hdf5", "r") as f:
    # `Dataset.value` was deprecated and then removed in h5py 3.0;
    # indexing with `[()]` reads the whole dataset on old and new versions.
    samples = np.asarray(f["/imputed_genotypes/row_header/rat"][()])
# The identifiers are decoded from bytes to str; entries that are already
# text are passed through unchanged.
samples = [i.decode() if isinstance(i, bytes) else str(i) for i in samples]

In [6]:
# Work on a deep copy so the loaded `data` stays untouched.
data0 = deepcopy(data)

# Keep only the measures that are numeric and integer-valued; those are
# passed through get_poisson. Everything else is queued for removal.
remove = []
for name in data0['measures'].columns.values:
    column = data0['measures'].loc[:, name]
    if not isnumber(column) or not isdiscrete(column):
        remove.append(name)
        continue
    data0['measures'].loc[:, name] = get_poisson(column)

# Drop the rejected measures only after the scan over the columns is done.
for name in remove:
    del data0['measures'][name]

In [7]:
# Open one lazily-loaded array per HDF5 group chr1 .. chr21.
Gs = []
for i in tqdm(range(1, 22)):
    # `group` is passed by keyword: newer xarray releases accept only the
    # file name positionally, while the keyword form works on old and new.
    G = xr.open_dataarray("arrayexpress/HS.hdf5",
                          group="/imputed_genotypes/chr{}".format(i),
                          chunks=1000)
    # Give the first two dimensions stable names so the arrays can later be
    # concatenated along "snps".
    G = G.rename({G.dims[0]: "snps", G.dims[1]: "samples"})
    Gs.append(G)

100%|██████████| 21/21 [00:01<00:00, 20.69it/s]


In [8]:
# Join the per-chromosome arrays along "snps"; `.T` then reverses the
# dimension order (presumably giving samples x snps, since each input was
# renamed to ("snps", "samples") -- TODO confirm the arrays are 2-D).
G = xr.concat(Gs, dim="snps").T
# Label the "samples" coordinate with the names held in `samples`.
G['samples'] = samples
# Label the "snps" coordinate with a running integer 0..n-1, where n is the
# length of the second axis of the transposed array.
G['snps'] = range(G.shape[1])

In [9]:
# for name in data0['measures'].columns.values:
#     print("Processing: {}".format(name))
#     dst_file = join(dst_folder, "scan_measures_normal_" + name + "_mean_standardize.json")
#     if os.path.exists(dst_file) or os.path.exists(dst_file + ".failed"):
#         continue
#     r = analysis_QTL(G, data0['measures'], data0['kinship'], name, 'normal', limix.qc.mean_standardize)
#     if r is None:
#         open(dst_file + ".failed", "w").write("")
#     else:
#         json.dump(r, open(dst_file, "w"))

In [None]:
# Scan every remaining measure with a normal likelihood on the
# quantile-gaussianized phenotype. Results are cached on disk, so a re-run
# skips measures that already have a result file or a ".failed" marker.
for name in data0['measures'].columns.values:
    print("Processing: {}".format(name))
    dst_file = join(dst_folder, "scan_measures_normal_" + name + "_quantile_gaussianize.json")
    if os.path.exists(dst_file) or os.path.exists(dst_file + ".failed"):
        continue
    r = analysis_QTL(G, data0['measures'], data0['kinship'], name, 'normal', limix.qc.quantile_gaussianize)
    if r is None:
        # Empty marker file: remember the failure so it is not retried.
        with open(dst_file + ".failed", "w"):
            pass
    else:
        # Write to a temporary name and rename afterwards, so an interrupted
        # dump can never leave a truncated file that later runs would mistake
        # for a finished result.
        tmp_file = dst_file + ".tmp"
        with open(tmp_file, "w") as out:
            json.dump(r, out)
        os.replace(tmp_file, dst_file)

Processing: Boli_tot


Normalising input... 

  return self.array[key]


done (5.73 seconds).


38it [00:00, 434.22it/s]

Scale: 1.14028
Delta: 0.5
Beta: [-0.]
Scale: 1.13957
Delta: 0.5
Beta: [-1.21886377]
Scale: 1.13957
Delta: 0.5
Beta: [-1.21886368]
Scale: 1.13957
Delta: 0.500001
Beta: [-1.21886348]
Scale: 1.13957
Delta: 0.500001
Beta: [-1.21886309]
Scale: 1.13957
Delta: 0.500002
Beta: [-1.2188623]
Scale: 1.13957
Delta: 0.500005
Beta: [-1.21886072]
Scale: 1.13956
Delta: 0.500009
Beta: [-1.21885758]
Scale: 1.13955
Delta: 0.500019
Beta: [-1.21885128]
Scale: 1.13953
Delta: 0.500038
Beta: [-1.21883869]
Scale: 1.13948
Delta: 0.500076
Beta: [-1.21881351]
Scale: 1.13939
Delta: 0.500153
Beta: [-1.21876314]
Scale: 1.1392
Delta: 0.500305
Beta: [-1.21866239]
Scale: 1.13883
Delta: 0.50061
Beta: [-1.21846088]
Scale: 1.1381
Delta: 0.501221
Beta: [-1.21805775]
Scale: 1.13663
Delta: 0.502441
Beta: [-1.21725111]
Scale: 1.13371
Delta: 0.504883
Beta: [-1.21563627]
Scale: 1.12794
Delta: 0.509765
Beta: [-1.21240039]
Scale: 1.11668
Delta: 0.519522
Beta: [-1.20590387]
Scale: 1.09526
Delta: 0.538985
Beta: [-1.19281264]
Scale: 


Scanning: 100%|██████████| 30/30 [18:22<00:00, 36.76s/it]


Processing: Distance0_30


Normalising input... 

  return self.array[key]


done (14.41 seconds).


4it [00:00, 32.00it/s]

Scale: 1.21987
Delta: 0.5
Beta: [-0.]
Scale: 1.21886
Delta: 0.5
Beta: [-1.64691082]
Scale: 1.21886
Delta: 0.5
Beta: [-1.64691063]
Scale: 1.21886
Delta: 0.500001
Beta: [-1.64691027]
Scale: 1.21886
Delta: 0.500001
Beta: [-1.64690954]
Scale: 1.21886
Delta: 0.500002
Beta: [-1.64690809]
Scale: 1.21886
Delta: 0.500005
Beta: [-1.64690517]
Scale: 1.21885
Delta: 0.500009
Beta: [-1.64689935]
Scale: 1.21884
Delta: 0.500019
Beta: [-1.6468877]
Scale: 1.21881
Delta: 0.500038
Beta: [-1.64686441]
Scale: 1.21876
Delta: 0.500076
Beta: [-1.64681782]
Scale: 1.21866
Delta: 0.500153
Beta: [-1.64672465]
Scale: 1.21846
Delta: 0.500305
Beta: [-1.6465383]
Scale: 1.21805
Delta: 0.50061
Beta: [-1.6461656]
Scale: 1.21724
Delta: 0.501221
Beta: [-1.6454202]
Scale: 1.21563
Delta: 0.502441
Beta: [-1.6439294]
Scale: 1.21243
Delta: 0.504883
Beta: [-1.64094777]
Scale: 1.20609
Delta: 0.509765
Beta: [-1.63498455]
Scale: 1.19375
Delta: 0.519522
Beta: [-1.62305903]
Scale: 1.17029
Delta: 0.538985
Beta: [-1.59921772]
Scale: 1.

36it [00:00, 165.81it/s]

Scale: 0.984795
Delta: 0.764658
Beta: [-0.97985761]
Scale: 0.984795
Delta: 0.764658
Beta: [-0.97985762]
Scale: 0.984795
Delta: 0.764658
Beta: [-0.97985727]
Scale: 0.984795
Delta: 0.764658
Beta: [-0.97985748]
Scale: 0.984795
Delta: 0.764658
Beta: [-0.97985757]



Scanning: 100%|██████████| 30/30 [44:43<00:00, 89.45s/it]


Processing: Distance10_15


Normalising input... done (17.16 seconds).


0it [00:00, ?it/s]

Scale: 1.25973
Delta: 0.5
Beta: [-0.]
Scale: 1.25917
Delta: 0.5
Beta: [-1.23497001]
Scale: 1.25917
Delta: 0.5
Beta: [-1.23496991]
Scale: 1.25917
Delta: 0.500001
Beta: [-1.23496972]
Scale: 1.25917
Delta: 0.500001
Beta: [-1.23496935]
Scale: 1.25917
Delta: 0.500002
Beta: [-1.23496859]
Scale: 1.25916
Delta: 0.500005
Beta: [-1.23496709]
Scale: 1.25916
Delta: 0.500009
Beta: [-1.23496407]
Scale: 1.25914
Delta: 0.500019
Beta: [-1.23495804]
Scale: 1.25912
Delta: 0.500038
Beta: [-1.23494597]
Scale: 1.25906
Delta: 0.500076
Beta: [-1.23492184]


46it [00:00, 145.91it/s]

Scale: 1.25896
Delta: 0.500153
Beta: [-1.23487357]
Scale: 1.25875
Delta: 0.500305
Beta: [-1.23477703]
Scale: 1.25832
Delta: 0.50061
Beta: [-1.23458388]
Scale: 1.25748
Delta: 0.501221
Beta: [-1.2341974]
Scale: 1.2558
Delta: 0.502441
Beta: [-1.23342365]
Scale: 1.25245
Delta: 0.504883
Beta: [-1.231873]
Scale: 1.24585
Delta: 0.509765
Beta: [-1.22875927]
Scale: 1.23297
Delta: 0.519522
Beta: [-1.22248304]
Scale: 1.2085
Delta: 0.538985
Beta: [-1.20974337]
Scale: 1.16445
Delta: 0.5775
Beta: [-1.18357758]
Scale: 1.0938
Delta: 0.651363
Beta: [-1.12898869]
Scale: 1.00672
Delta: 0.777312
Beta: [-1.01442816]
Scale: 0.953772
Delta: 0.924152
Beta: [-0.78695983]
Scale: 0.973857
Delta: 0.849099
Beta: [-0.42708505]
Scale: 1.03312
Delta: 0.73328
Beta: [-0.63140358]
Scale: 0.997234
Delta: 0.795538
Beta: [-0.87174629]
Scale: 0.999404
Delta: 0.791416
Beta: [-0.74979463]
Scale: 0.999123
Delta: 0.791954
Beta: [-0.7583197]
Scale: 0.999147
Delta: 0.791908
Beta: [-0.75721155]
Scale: 0.999148
Delta: 0.791906
Beta


Scanning: 100%|██████████| 30/30 [39:54<00:00, 79.81s/it]


Processing: Distance15_20


Normalising input... done (8.8 seconds).


13it [00:00, 86.63it/s]

Scale: 1.27544
Delta: 0.5
Beta: [-0.]
Scale: 1.27509
Delta: 0.5
Beta: [-0.97206538]
Scale: 1.27509
Delta: 0.5
Beta: [-0.97206524]
Scale: 1.27509
Delta: 0.500001
Beta: [-0.97206496]
Scale: 1.27509
Delta: 0.500001
Beta: [-0.9720644]
Scale: 1.27509
Delta: 0.500002
Beta: [-0.97206329]
Scale: 1.27508
Delta: 0.500005
Beta: [-0.97206107]
Scale: 1.27508
Delta: 0.500009
Beta: [-0.97205662]
Scale: 1.27506
Delta: 0.500019
Beta: [-0.97204773]
Scale: 1.27504
Delta: 0.500038
Beta: [-0.97202994]
Scale: 1.27498
Delta: 0.500076
Beta: [-0.97199436]
Scale: 1.27487
Delta: 0.500153
Beta: [-0.9719232]
Scale: 1.27465
Delta: 0.500305
Beta: [-0.9717809]
Scale: 1.2742
Delta: 0.50061
Beta: [-0.97149633]
Scale: 1.27332
Delta: 0.501221
Beta: [-0.97092735]
Scale: 1.27156
Delta: 0.502441
Beta: [-0.96978996]
Scale: 1.26805
Delta: 0.504883
Beta: [-0.96751754]
Scale: 1.26113
Delta: 0.509765
Beta: [-0.96298214]
Scale: 1.24762
Delta: 0.519522
Beta: [-0.95394948]
Scale: 1.22192
Delta: 0.538985
Beta: [-0.9360394]
Scale: 1.

39it [00:00, 147.49it/s]

Beta: [-0.51034246]
Scale: 0.971589
Delta: 0.849099
Beta: [-0.27146319]
Scale: 0.959022
Delta: 0.883155
Beta: [-0.39808125]
Scale: 0.973719
Delta: 0.843991
Beta: [-0.34295142]
Scale: 0.975154
Delta: 0.840625
Beta: [-0.40618481]
Scale: 0.9749
Delta: 0.841202
Beta: [-0.41150775]
Scale: 0.974892
Delta: 0.84122
Beta: [-0.41059695]
Scale: 0.974893
Delta: 0.841219
Beta: [-0.41056776]
Scale: 0.974893
Delta: 0.841219
Beta: [-0.41057054]
Scale: 0.974893
Delta: 0.841219
Beta: [-0.41057052]
Scale: 0.974892
Delta: 0.841219
Beta: [-0.41057051]
Scale: 0.974893
Delta: 0.841219
Beta: [-0.41056946]
Scale: 0.974893
Delta: 0.841219
Beta: [-0.41057011]
Scale: 0.974893
Delta: 0.841219
Beta: [-0.41057036]
Scale: 0.974893
Delta: 0.841219
Beta: [-0.41057046]
Scale: 0.974893
Delta: 0.841219
Beta: [-0.41057049]



Scanning: 100%|██████████| 30/30 [36:34<00:00, 73.16s/it]


Processing: Distance20_25
Processing: Distance25
Processing: Distance25_30
Processing: Distance5


Normalising input... done (11.87 seconds).


0it [00:00, ?it/s]

Scale: 1.22761
Delta: 0.5
Beta: [-0.]
Scale: 1.22696
Delta: 0.5
Beta: [-1.32357339]


12it [00:00, 84.99it/s]

Scale: 1.22696
Delta: 0.5
Beta: [-1.32357319]
Scale: 1.22696
Delta: 0.500001
Beta: [-1.32357278]
Scale: 1.22696
Delta: 0.500001
Beta: [-1.32357197]
Scale: 1.22695
Delta: 0.500002
Beta: [-1.32357034]
Scale: 1.22695
Delta: 0.500005
Beta: [-1.32356708]
Scale: 1.22695
Delta: 0.500009
Beta: [-1.32356057]
Scale: 1.22693
Delta: 0.500019
Beta: [-1.32354755]
Scale: 1.22691
Delta: 0.500038
Beta: [-1.3235215]
Scale: 1.22686
Delta: 0.500076
Beta: [-1.32346941]
Scale: 1.22675
Delta: 0.500153
Beta: [-1.32336522]
Scale: 1.22655
Delta: 0.500305
Beta: [-1.32315686]
Scale: 1.22614
Delta: 0.50061
Beta: [-1.32274018]
Scale: 1.22531
Delta: 0.501221
Beta: [-1.32190698]
Scale: 1.22368
Delta: 0.502441
Beta: [-1.32024122]
Scale: 1.22042
Delta: 0.504883
Beta: [-1.31691221]
Scale: 1.214
Delta: 0.509765
Beta: [-1.31026443]
Scale: 1.20147
Delta: 0.519522
Beta: [-1.29701033]
Scale: 1.17769
Delta: 0.538985
Beta: [-1.27067259]
Scale: 1.1349
Delta: 0.5775
Beta: [-1.21871488]
Scale: 1.06644
Delta: 0.651363
Beta: [-1.11

36it [00:00, 174.77it/s]

Scale: 0.987048
Delta: 0.769676
Beta: [-0.64223275]



Scanning: 100%|██████████| 30/30 [43:22<00:00, 86.76s/it]


Processing: Distance5_10


Normalising input... done (13.69 seconds).


35it [00:00, 196.73it/s]

Scale: 1.29232
Delta: 0.5
Beta: [-0.]
Scale: 1.2918
Delta: 0.5
Beta: [-1.18181643]
Scale: 1.2918
Delta: 0.5
Beta: [-1.18181635]
Scale: 1.2918
Delta: 0.500001
Beta: [-1.18181618]
Scale: 1.2918
Delta: 0.500001
Beta: [-1.18181586]
Scale: 1.2918
Delta: 0.500002
Beta: [-1.18181522]
Scale: 1.2918
Delta: 0.500005
Beta: [-1.18181393]
Scale: 1.29179
Delta: 0.500009
Beta: [-1.18181135]
Scale: 1.29178
Delta: 0.500019
Beta: [-1.18180619]
Scale: 1.29175
Delta: 0.500038
Beta: [-1.18179587]
Scale: 1.29169
Delta: 0.500076
Beta: [-1.18177524]
Scale: 1.29158
Delta: 0.500153
Beta: [-1.18173397]
Scale: 1.29136
Delta: 0.500305
Beta: [-1.18165141]
Scale: 1.29092
Delta: 0.50061
Beta: [-1.18148627]
Scale: 1.29004
Delta: 0.501221
Beta: [-1.18115583]
Scale: 1.28829
Delta: 0.502441
Beta: [-1.18049434]
Scale: 1.2848
Delta: 0.504883
Beta: [-1.17916894]
Scale: 1.27791
Delta: 0.509765
Beta: [-1.17650853]
Scale: 1.26446
Delta: 0.519522
Beta: [-1.17114967]
Scale: 1.23888
Delta: 0.538985
Beta: [-1.16028327]
Scale: 1.19


Scanning: 100%|██████████| 30/30 [46:57<00:00, 93.92s/it]


Processing: Rearing0_30


Normalising input... done (10.29 seconds).


10it [00:00, 61.88it/s]

Scale: 1.19793
Delta: 0.5
Beta: [-0.]
Scale: 1.19755
Delta: 0.5
Beta: [-1.01495304]
Scale: 1.19755
Delta: 0.5
Beta: [-1.01495285]
Scale: 1.19755
Delta: 0.500001
Beta: [-1.01495246]
Scale: 1.19755
Delta: 0.500001
Beta: [-1.01495169]
Scale: 1.19755
Delta: 0.500002
Beta: [-1.01495014]
Scale: 1.19755
Delta: 0.500005
Beta: [-1.01494704]
Scale: 1.19754
Delta: 0.500009
Beta: [-1.01494083]
Scale: 1.19753
Delta: 0.500019
Beta: [-1.01492843]
Scale: 1.1975
Delta: 0.500038
Beta: [-1.01490363]
Scale: 1.19745
Delta: 0.500076
Beta: [-1.01485402]
Scale: 1.19735
Delta: 0.500153
Beta: [-1.01475482]
Scale: 1.19715
Delta: 0.500305
Beta: [-1.01455644]
Scale: 1.19674
Delta: 0.50061
Beta: [-1.01415978]
Scale: 1.19593
Delta: 0.501221
Beta: [-1.01336691]
Scale: 1.19431
Delta: 0.502441
Beta: [-1.01178292]
Scale: 1.1911
Delta: 0.504883
Beta: [-1.00862192]
Scale: 1.18475
Delta: 0.509765
Beta: [-1.00232786]
Scale: 1.17238
Delta: 0.519522
Beta: [-0.98985132]
Scale: 1.14886
Delta: 0.538985
Beta: [-0.96534333]
Scale:

37it [00:00, 170.83it/s]


Scale: 0.949207
Delta: 0.789542
Beta: [-0.45713023]
Scale: 0.949207
Delta: 0.789542
Beta: [-0.45713084]
Scale: 0.949207
Delta: 0.789541
Beta: [-0.45713084]
Scale: 0.949207
Delta: 0.789541
Beta: [-0.45713096]
Scale: 0.949207
Delta: 0.789542
Beta: [-0.45713089]
Scale: 0.949207
Delta: 0.789542
Beta: [-0.45713086]
Scale: 0.949207
Delta: 0.789542
Beta: [-0.45713087]



Scanning:   7%|▋         | 2/30 [02:52<40:20, 86.44s/it]

In [None]:
# `f` is the HDF5 file handle bound in the cell that reads the sample names,
# where it is already closed; presumably a leftover safety close --
# TODO confirm it is still needed.
f.close()