In [None]:
##Author: Gene Burinskiy

!pip install plinkio
#!pip install h5py    # disabled: h5py fails to install in this environment (cause unknown)
#!pip install tables  # disabled: tables cannot be installed because h5py cannot

In [1]:
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
os.getcwd()

'/home/jovyan/work/STA-663-Final-Project/data'

In [2]:
# Working with the original dataset. The directory defaults to the location
# used when this notebook was written; set the LEAP_DATASET_DIR environment
# variable to point at a different checkout instead of editing this cell.
data_path = os.environ.get('LEAP_DATASET_DIR',
                           '/home/jovyan/work/LEAP/leap/regression/dataset1')
# Change directory so the relative "dataset1" prefix opened later resolves.
os.chdir(data_path)
os.listdir()

['extracts',
 'dataset1-Copy1.bed',
 'dataset1.bed',
 'dataset1.cov',
 '.pversion',
 'dataset1.bim',
 'dataset1.fam',
 'dataset1.phe.liab',
 'dataset1.phe']

In [3]:
bed = plinkfile.open("dataset1")

In [4]:
loci = bed.get_loci()
len(loci)

10499

In [5]:
locus = loci[0]

In [7]:
locus.name

'csnp18'

In [8]:
locus.chromosome

1

In [9]:
np.unique([x.chromosome for x in loci])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [7]:
samples = bed.get_samples()
print("Object of type: ",type(samples), "and length:" ,len(samples))

Object of type:  <class 'list'> and length: 1000


In [8]:
sample = samples[500]

In [9]:
print(sample.fid, sample.father_iid, sample.iid, sample.phenotype, sample.sex)

FAM1 0 person501 3.591670036315918 1


In [10]:
# Only the first row is needed here, so take it straight from the iterator
# rather than loading every row into a list just to index element 0.
h = next(iter(bed))

In [11]:
len([x for x in h])

1000

In [12]:
## Each row yielded by bed has length 1,000 (one value per sample) and there
## are 10,499 rows (one per locus), so the matrix is loci x samples.
## The shape is taken from loci/samples rather than hard-coded.
## int16 uses 1/4 of the space of a float64 array (2 bytes vs 8 per value).
mat = np.zeros((len(loci), len(samples)), dtype='int16')

# enumerate keeps the row index in step with the iteration automatically.
for i, row in enumerate(bed):
    mat[i, :] = list(row)

In [13]:
# Sanity check on the matrix built from the bed rows: dtype, the top-left
# corner, and the in-memory size in MiB.
# (original note: this matrix is equivalent to transposed bed.val)
size_in_mb = mat.nbytes / (1024 ** 2)
print("Data type:", mat.dtype)
print(mat[:2, :5])
print("Size of bed matrix: %4.0fmb" % size_in_mb)

Data type: int16
[[2 1 0 2 0]
 [1 0 2 1 1]]
Size of bed matrix:   20mb


In [14]:
# Transpose so each row is a sample and each column a locus, labelling both
# axes at construction time.
# (original note: normally, it reads them in as floats which is a huge waste of space)
df = pd.DataFrame(
    mat.transpose(),
    index=[smp.iid for smp in bed.get_samples()],  # could also double index on chromosomes
    columns=[loc.name for loc in loci],
)
df.iloc[:5, :5]

Unnamed: 0,csnp18,csnp35,csnp59,csnp78,csnp85
person1,2,1,1,2,1
person2,1,0,2,2,2
person3,0,2,2,2,2
person4,2,1,2,2,1
person5,0,1,2,1,2


In [23]:
np.unique(df.dtypes)

array([dtype('float64')], dtype=object)

In [39]:
# Centre every column on its mean. The frame is cast to float32 once and the
# cast result is reused for both the values and the column means, instead of
# converting the whole frame twice.
# (original note: this gets us pretty close to their normalization stuff)
df = df.astype('float32').pipe(lambda frame: frame - frame.mean())
df.iloc[:5, :5]

Unnamed: 0,csnp18,csnp35,csnp59,csnp78,csnp85
person1,0.68,-0.191,-0.561,0.675,-0.734
person2,-0.32,-1.191,0.439,0.675,0.266
person3,-1.32,0.809,0.439,0.675,0.266
person4,0.68,-0.191,0.439,0.675,-0.734
person5,-1.32,-0.191,0.439,-0.325,0.266


In [None]:
##Save the file to sql db - not feasible for this data
#from sqlalchemy import create_engine

#engine = create_engine('sqlite:///dataset1.db', echo=False)
#df.transpose().to_sql(name='dataset1', con=engine, if_exists = 'replace', index=True)


In [46]:
%%timeit
np.cov(df)

1 loop, best of 3: 351 ms per loop


In [48]:
# np.cov treats each row as a variable and each column as an observation, so
# the result here is square with one row/column per row of df (the printed
# shape confirms this).
cov = np.cov(df)
print("Shape of covariance matrix:", cov.shape)
cov[:5, :5]

Shave of covariance matrix: (1000, 1000)


array([[ 0.36811042,  0.00135157, -0.00864516, -0.00117112,  0.00388046],
       [ 0.00135157,  0.35819185,  0.00338288,  0.0022566 ,  0.00138279],
       [-0.00864516,  0.00338288,  0.3628521 ,  0.00443158, -0.00057136],
       [-0.00117112,  0.0022566 ,  0.00443158,  0.36309709,  0.00184398],
       [ 0.00388046,  0.00138279, -0.00057136,  0.00184398,  0.37099303]])

In [None]:
%%timeit


In [20]:
"""
Dependencies don't quite exist. 
from pandas import HDFStore

hdf = HDFStore('dataset1.h5')
# put the dataset in the storage
hdf.put('dataset1', df, format='table', data_columns=True)

hdf.append('d1', DataFrame(np.random.rand(5,3), 
           columns=('A','B','C')), 
           format='table', data_columns=True)
hdf.close() # closes the file

hdf = read_hdf('storage.h5', 'd1',
               where=['A>.5'], columns=['A','B'])
"""              

"\nDependencies don't quite exist. \nfrom pandas import HDFStore\n\nhdf = HDFStore('dataset1.h5')\n# put the dataset in the storage\nhdf.put('dataset1', df, format='table', data_columns=True)\n\nhdf.append('d1', DataFrame(np.random.rand(5,3), \n           columns=('A','B','C')), \n           format='table', data_columns=True)\nhdf.close() # closes the file\n\nhdf = read_hdf('storage.h5', 'd1',\n               where=['A>.5'], columns=['A','B'])\n"