In [41]:
!pip install plinkio
##Load data:
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
import time
#from scipy.linalg.blas import dsyrk 
    #--can't find a way to get this working. Perhaps blas routines are missing.
    
data_path = '/home/jovyan/work/LEAP/leap/regression/dataset1'
os.chdir(data_path)    



In [3]:
"""
author: gene burinskiy

Goal: 
Finding a set of individuals who are related to other individuals in the study. 
LEAP employs a greedy algorithm to find a small subset of such individuals, 
such that after their exclusion, there are no related individuals in the study. 
These individuals are excluded from the analysis in stages 3 and 4 below, 
but after fitting a model in stage 4, their liabilities are estimated along with 
other indviduals. All individuals are considered in the GWAS stage (stage 5).

source: 
https://github.com/omerwe/LEAP/blob/master/leap/regression/Leap_example.ipynb
"""

'\nauthor: gene burinskiy\n\nGoal: \nFinding a set of individuals who are related to other individuals in the study. \nLEAP employs a greedy algorithm to find a small subset of such individuals, \nsuch that after their exclusion, there are no related individuals in the study. \nThese individuals are excluded from the analysis in stages 3 and 4 below, \nbut after fitting a model in stage 4, their liabilities are estimated along with \nother indviduals. All individuals are considered in the GWAS stage (stage 5).\n\nsource: \nhttps://github.com/omerwe/LEAP/blob/master/leap/regression/Leap_example.ipynb\n'

In [44]:
##Load data:
bed = plinkfile.open("dataset1")

loci = bed.get_loci()
print("Length of locuses", len(loci))
chromosomes = np.unique([x.chromosome for x in loci])
print("# of chromosomes in data:",chromosomes)

samples = bed.get_samples()
print("Number of individuals in data:", len(samples))

Length of locuses 10499
# of chromosomes in data: [ 1  2  3  4  5  6  7  8  9 10]
Number of individuals in data: 1000


In [45]:
##Place data into a dataframe:
mat = np.zeros((len(loci),len(samples)), dtype='int16') #1/4 of the taken up space by using int16

##don't know a faster method of extracting the data from the bed file.
i=0
for row in bed:
    mat[i,:] = np.array([snp for snp in row])
    i+=1
    
#this matrix is equivalent to transposed bed.val
print("Data type:", mat.dtype)
print("Size of bed matrix: %4.0fmb\n" %(mat.nbytes/(1024**2)))

#create a multi-indexed column space
tuples = [(x.chromosome,x.name) for x in loci]
ml_index = pd.MultiIndex.from_tuples(tuples, names = ['chromosome', 'snp'])

df = pd.DataFrame(mat.transpose(), columns=ml_index, index = [x.iid for x in bed.get_samples()]) 
df.info()
df.iloc[:5,:5]

Data type: int16
Size of bed matrix:   20mb

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, person1 to person1000
Columns: 10499 entries, (1, csnp18) to (10, snp10483)
dtypes: int16(10499)
memory usage: 20.0+ MB


chromosome,1,1,1,1,1
snp,csnp18,csnp35,csnp59,csnp78,csnp85
person1,2,1,1,2,1
person2,1,0,2,2,2
person3,0,2,2,2,2
person4,2,1,2,2,1
person5,0,1,2,1,2


In [46]:
##compute covariance matrix between individuals, remove those who are too close to each other.
#they LEAP code uses dsyrk which halves the computational time. Alas, we can't use it y

df = df.astype('float32')-df.astype('float32').mean() 
df.info() #roughly doubled memory usage though still not the 80mb it was earlier

cov = np.dot(df, df.transpose())/df.shape[1] #having difficulties with scipy's linalg module
#note that the above takes more than half the time of np.cov
print("\nCovariance shape:" , cov.shape)
print("Covariance memory usage in mb:", cov.nbytes/(1024**2))
cov[:5,:5]

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, person1 to person1000
Columns: 10499 entries, (1, csnp18) to (10, snp10483)
dtypes: float32(10499)
memory usage: 40.1+ MB

Covariance shape: (1000, 1000)
Covariance memory usage in mb: 3.814697265625


array([[ 0.36813205,  0.00128837, -0.00865506, -0.00119463,  0.00389233],
       [ 0.00128837,  0.35822785,  0.00339447,  0.00228265,  0.00136904],
       [-0.00865506,  0.00339447,  0.36281952,  0.00443562, -0.00057362],
       [-0.00119463,  0.00228265,  0.00443562,  0.3630724 ,  0.00183871],
       [ 0.00389233,  0.00136904, -0.00057362,  0.00183871,  0.37096033]], dtype=float32)

In [47]:
cutoff = .05
bool_arr =  np.tril(cov, k=-1)>cutoff
x_idx,y_idx = np.where(bool_arr)
print("shape of x:", x_idx.shape)
print("shape of y:", y_idx.shape)
print("\nremoving %d individuals" %x_idx.shape[0])

indxToExclude = np.array([x_idx, y_idx]).transpose() #note, they marked 54 so we marked more peeps.

shape of x: (56,)
shape of y: (56,)

removing 56 individuals


In [48]:
#with multi-index, we index by using the number of the chromosome. 
#This avoids copying of data -> we use views on the data. Immeasurably more efficient
for chrom in chromosomes:
    print("Working on chromosome: %s" %chrom)
    
    exclude_chrom = set(chromosomes)
    exclude_chrom.remove(chrom) #set all chromosomes except current
    exclude_chrom = list(exclude_chrom)
    
    t0 = time.time()
    #Note that the original code puts cov, w, V into a dictionary called "eigen"
    cov = np.dot(df[exclude_chrom], df[exclude_chrom].transpose())/df[exclude_chrom].shape[1]
    
    w,V = np.linalg.eigh(cov, 'L') #would use scipy except -again- can't get it to load.
    
    print("Took %.2f seconds" %(time.time()-t0))
print("Note that LEAP's original code runs 2-3 times slower for this step")    

Working on chromosome: 1
Took 0.26 seconds
Working on chromosome: 2
Took 0.16 seconds
Working on chromosome: 3
Took 0.15 seconds
Working on chromosome: 4
Took 0.17 seconds
Working on chromosome: 5
Took 0.16 seconds
Working on chromosome: 6
Took 0.30 seconds
Working on chromosome: 7
Took 0.19 seconds
Working on chromosome: 8
Took 0.16 seconds
Working on chromosome: 9
Took 0.14 seconds
Working on chromosome: 10
Took 0.14 seconds
Note that LEAP's original code runs 2-3 times slower for this step


In [40]:
##Our calc_h2 function for Step 3
#uses the calc_h2.calc_h2 functions

#read in phenofile:
phenos = pd.read_csv("dataset1.phe", sep=' ', header=None, engine='c')
phenos.columns = ['fam', 'person', 'pheno']
phenos.set_index(keys = 'person', inplace=True)
phenos.iloc[:5,:]

prevalence = .001

cov 

indxToExclude





array([ 20,   6,  37,   8,  47,  15,  62,  16,  66,  14,  76,  22,  82,
        58,  89,  15,  89,  47, 104,  91, 112, 101, 128,   0, 142, 102,
       151,  83, 156, 141, 160, 102, 160, 142, 161,  43, 162,  54, 163,
       135, 164, 106, 165,   8, 165,  37, 172,  35, 173, 140, 187, 186,
       189,  54, 189, 162, 194,  88, 196,  25, 197,  70, 199, 150, 205,
        16, 205,  62, 209,  25, 209, 196, 210,   5, 211, 146, 213,  73,
       214,  90, 217,  95, 221, 108, 222,  51, 232, 122, 233, 178, 234,
        39, 241, 141, 241, 156, 242,  75, 244, 147, 246,  83, 246, 151,
       355, 152, 439,  61, 499,  47, 499,  89])

In [33]:
%%timeit
np.dot(df, df.transpose())/df.shape[1]

1 loop, best of 3: 154 ms per loop


In [32]:
%%timeit
np.cov(df)

1 loop, best of 3: 354 ms per loop
