# Read/reshape the data 

In [1]:
import numpy as np
import numpy.random as nr

In [2]:
def read_geno_numpy(fname):
    '''Loads a geno file as a masked uint8 numpy matrix.

    fname: source handed straight to np.genfromtxt (path or file-like)

    Every single character of a line is parsed as its own field, so a
    line of k characters becomes a row of k values.  Fields equal to 9
    are flagged as missing and come back masked.'''
    parse_options = dict(
        dtype='uint8',       # one byte per value
        delimiter=1,         # fixed-width fields, one character each
        missing_values=9,    # 9 is the missing-data code
        usemask=True,        # hand back a numpy.ma.MaskedArray
    )
    return np.genfromtxt(fname, **parse_options)

In [3]:
X = read_geno_numpy('../../xpop-analysis/data/merge.22.geno')

In [4]:
M = X.shape[0]
print(M)

3728


In [5]:
# Number of samples in each cohort; N_CARe is later used as the column
# offset that separates the two cohorts in the genotype matrix.
N_CARe, N_WTCCC = 8367, 16179

Ns = [N_CARe, N_WTCCC]
N = N_CARe + N_WTCCC   # total sample count across both cohorts

print(Ns)

[8367, 16179]


In [6]:
# Split the columns by cohort: the first N_CARe columns, then all remaining
# columns.  NOTE: this rebinds X from a single matrix to a 2-tuple, so the
# cell must be run exactly once after loading.
X = (X[:,:N_CARe], X[:,N_CARe:])

In [7]:
# Per-row mean (masked entries excluded) divided by 2, one vector per cohort.
# Presumably the allele frequency, assuming genotypes are coded as 0/1/2
# allele counts -- TODO confirm the coding.
P = [x.mean(axis=1).filled()/2 for x in X]

# Stability of $g$ or $\gamma$ estimates

* Sample $M_{causal}$ SNPs
* Generate $b$ or $\beta$
* Sample two sets of $N$ samples
* Generate phenotypes
* Calculate $\hat{g}$ or $\hat{\gamma}$
* Calculate $\rho \left(\hat{g}_1,\hat{g}_2\right)$

In [257]:
def BLUE(x, y):
    '''Least-squares effect estimates with an intercept that is fitted
    and then discarded.

    x: 2-D predictor matrix, one row per predictor and one column per
       sample (plain ndarray or masked array)
    y: response with one entry per sample (plain ndarray or masked array)

    Masked entries of x and y are replaced by 0 before fitting, which is
    the value numpy.ma.dot substitutes for masked entries; this keeps the
    raw stored value of a masked cell out of the fit.

    Returns a plain ndarray holding one coefficient per row of x.
    Raises numpy.linalg.LinAlgError when the normal equations are singular.'''
    # Work on plain float arrays so the function accepts both masked and
    # unmasked input (np.ma.filled returns a plain ndarray unchanged).
    x = np.asarray(np.ma.filled(x, 0), dtype=float)
    y = np.asarray(np.ma.filled(y, 0), dtype=float)

    # Design matrix: a row of ones for the intercept on top of the predictors.
    z = np.vstack([np.ones(x.shape[1]), x])

    # Solve (z z^T) coef = z y directly instead of forming the inverse.
    coef = np.linalg.solve(z.dot(z.T), z.dot(y))

    # Drop the intercept.
    return coef[1:]

In [251]:
M_causal = 30
M_tag    = 2000
N_sample = 8000

In [252]:
x = X[1]
p = P[1]

In [272]:
# Draw M_causal + M_tag distinct row indices of x in one go, then use the
# first M_causal as causal rows and the remainder as tag rows, so the two
# sets cannot overlap.
M_indices = nr.choice(x.shape[0], M_causal + M_tag, False)
M_causal_indices = M_indices[:M_causal]
M_tag_indices    = M_indices[M_causal:]

In [283]:
# Draw 2*N_sample distinct column indices of x and split them into two
# non-overlapping halves of N_sample indices each.
N_indices = nr.choice(x.shape[1], 2*N_sample, False)
N_indices = (N_indices[:N_sample], N_indices[N_sample:])

In [292]:
b = nr.normal(size=M_causal)

In [293]:
# One value per column of x: the causal rows of x, shifted by the matching
# entries of p, weighted by the effects b and summed.  No noise term is added.
y = (x[M_causal_indices,:] - p[M_causal_indices,np.newaxis]).T.dot(b).filled()

In [294]:
# Fit BLUE on the tag rows once per sample set in N_indices, giving one
# coefficient vector per set.
ghats = [BLUE(x[M_tag_indices][:,i], y[i]) for i in N_indices]

In [295]:
np.corrcoef(ghats)[1,0]

0.241885237325727

# Phenotypes

In [None]:
def simulate_phenotypes(X, M_causal = 30, rho = 1, h2g = 1):
    '''Simulates phenotypes
    X: sequence of genotype matrices, one per population; each has one
       row per SNP and one column per sample, with the same SNP rows
    M_causal: number of causal SNPs [30]
    rho: per-allele effect size correlation between populations [1]
    h2g: h2g in each population; a scalar (int or float) applied to all
         populations, or one value per population, each in (0, 1] [1]

    Returns (Y, B, indices):
    Y: list with one phenotype vector per population
    B: (M, populations) per-allele effects, zero at non-causal SNPs
    indices: length-M int8 vector, 1 at causal SNPs and 0 elsewhere

    Raises ValueError if h2g has the wrong length or lies outside (0, 1].'''
    populations = len(X)

    # Broadcast any scalar heritability, not just ints: h2g = 0.5 is the
    # typical call and must not fall through as a bare float.
    if np.isscalar(h2g):
        h2g = [h2g]*populations
    else:
        h2g = list(h2g)

    if len(h2g) != populations:
        raise ValueError('h2g needs one value per population')
    if not all(0 < h <= 1 for h in h2g):
        raise ValueError('each h2g must be in (0, 1]')

    M = X[0].shape[0]

    causal = nr.choice(M, M_causal, False)

    indices = np.zeros(M, dtype='b')
    indices[causal] = 1

    # Effect covariance: unit variance in every population and correlation
    # rho between every pair.  For two populations this is exactly
    # ((1, rho), (rho, 1)).  With more than two populations rho must be
    # large enough for the matrix to be positive semi-definite.
    cov = np.full((populations, populations), float(rho))
    np.fill_diagonal(cov, 1.0)

    B = np.zeros((M, populations))
    B[causal] = nr.multivariate_normal(np.zeros(populations), cov, M_causal)

    # Genetic value of every sample, population by population.
    XTB = [x.T.dot(b) for x, b in zip(X, B.T)]

    # Noise variance that makes var(genetic) / var(total) equal h2g.
    S2e = [(1-h)/h*np.var(xtb) for h, xtb in zip(h2g, XTB)]

    # No noise is drawn when the noise variance is zero (e.g. h2g == 1).
    Y = [xtb + (nr.normal(0, np.sqrt(s2e), xtb.shape) if s2e else 0) for xtb, s2e in zip(XTB, S2e)]

    return (Y, B, indices)

In [None]:
(Y, B, indices) = simulate_phenotypes(X)

In [None]:
np.corrcoef(B.T)

# BLUE

In [None]:
# Per-population BLUE fit restricted to the rows where indices == 0, i.e.
# the SNPs that simulate_phenotypes did not mark as causal.
Bhat = [BLUE(x[indices==0], y) for x, y in zip(X, Y)]

In [None]:
np.corrcoef(Bhat)

In [None]:
Bhat2 = [BLUE(x, y) for x, y in zip(X, Y)]

In [None]:
np.corrcoef(Bhat2)

In [None]:
def subsample_fit(N):
    '''Refits BLUE on a random subset of N samples.

    N: number of distinct sample indices to draw from range(Ns[1])

    Reads the notebook globals x, y, Ns and bhat_16k.
    NOTE(review): bhat_16k is not defined in the visible part of the
    notebook -- confirm it is set before this is called.

    Returns (bhat, r): the subsample estimate and the 2x2 correlation
    matrix between bhat_16k and that estimate.'''
    chosen = nr.choice(Ns[1], N, replace=False)
    estimate = BLUE(x[:, chosen], y[chosen])
    return estimate, np.corrcoef(bhat_16k, estimate)

In [None]:
# 100 independent subsample fits with 8000 samples each.  The size follows
# the series name, as it does for k12 (12000) and k14 (14000); the previous
# value of 14000 duplicated the k14 series.
k8 = [subsample_fit(8000) for _ in range(100)]

In [None]:
# 100 independent subsample fits with 10000 samples each.  The size follows
# the series name, as it does for k12 (12000) and k14 (14000); the previous
# value of 14000 duplicated the k14 series.
k10 = [subsample_fit(10000) for _ in range(100)]

In [None]:
k12 = [subsample_fit(12000) for i in range(100)]

In [None]:
k14 = [subsample_fit(14000) for i in range(100)]

In [None]:
def rstats(k):
    '''Summarises the correlations of a series of subsample fits.

    k: iterable of (bhat, r) pairs, where r is a 2x2 correlation matrix

    Returns (mean, variance) of the off-diagonal entry r[1,0] over k.'''
    correlations = np.array([corr[1, 0] for _, corr in k])
    return correlations.mean(), np.var(correlations)

In [None]:
rstats(k8)

In [None]:
rstats(k10)

In [None]:
rstats(k12)

In [None]:
rstats(k14)

In [None]:
X[1].shape

In [None]:
from sklearn.decomposition import RandomizedPCA

In [None]:
# NOTE(review): RandomizedPCA appears to have been removed from scikit-learn
# (0.20); PCA(n_components=10, svd_solver='randomized') is the documented
# replacement -- confirm against the installed version.
rpca = RandomizedPCA(10)

In [None]:
rpca.fit(X[1])

In [None]:
XT = rpca.transform(X[1])

In [None]:
XT.shape

In [None]:
rpca.get_params()

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
tsvd = TruncatedSVD(10)

In [None]:
tsvd.fit(X[1].filled(0))

In [None]:
tsvd.components_.shape

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.scatter(tsvd.components_[0], tsvd.components_[1])

# LD Matrix

In [None]:
# The genotypes are stored as uint8, so x.dot(x.T) would also accumulate in
# uint8 and wrap around modulo 256.  Promote to float64 before multiplying;
# the products are small integers, so the result is exact.  This temporarily
# needs 8 bytes per genotype for the promoted copy.
LD = [g.dot(g.T) for g in (x.astype(np.float64) for x in X)]

In [None]:
import sklearn.linear_model

In [None]:
lm = sklearn.linear_model.LinearRegression()

In [None]:
lm.fit(X[0].T, Y[0])

In [None]:
lm.get_params()

In [None]:
bhat = BLUE(X[0], Y[0])

In [None]:
lm.coef_

In [None]:
bhat

In [None]:
%time BLUE(X[0], Y[0])

In [None]:
%time lm.fit(X[0].T, Y[0])

In [None]:
import sklearn.gaussian_process

In [None]:
# NOTE(review): sklearn.gaussian_process.GaussianProcess appears to have
# been removed from scikit-learn (0.20) in favour of GaussianProcessRegressor
# -- confirm against the installed version.
gp = sklearn.gaussian_process.GaussianProcess()

In [None]:
# NOTE(review): `sample` is not defined anywhere in the visible notebook;
# presumably a pair of sample-index arrays -- define it before running.
gp.fit(X[0].T[sample[0]], Y[0][sample[0]])

In [None]:
sklearn.gaussian_process.l1_cross_distances()