# Notebook to test whether D. Reich's big Eigenstrat file can be loaded

In [1]:
import numpy as np
import pandas as pd
import os as os
import sys as sys
import multiprocessing as mp
import socket

### Pick the right path (whether on cluster or at home)
# The hostname selects one of the hard-coded project roots below; on any other
# machine the cell aborts by raising RuntimeWarning.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
# All relative paths used further down ("./Data/...", "./Python3/") are resolved
# against this project root.
os.chdir(path)  # Set the right Path (in line with Atom default)

sys.path.append("./Python3/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root set by os.chdir above
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


### The base path (without the .snp/.ind/.geno extension)

In [51]:
folder = "./Data/ReichLabEigenstrat/Olalde2019/Olalde_et_al_genotypes"

### Load the SNP File

In [56]:
%%time
# Read the whole .snp table as an array of strings.
# On Harald's machine: About 7.14s
snp_path = folder + ".snp"
rsnp = np.loadtxt(snp_path, dtype=str)
print(f"Loaded {len(rsnp)} SNPs")

Loaded 1233013 SNPs
CPU times: user 6.72 s, sys: 537 ms, total: 7.26 s
Wall time: 7.28 s


## Load the Geno File

In [57]:
%%time
# Read the first 20 bytes of the packed .geno file; they hold the individual and
# SNP counts as whitespace-separated fields 1 and 2.
# The context manager closes the handle right after the read (the original
# left it open and issued a seek() that had no effect, because np.fromfile
# below re-reads the file from its path).
with open(folder + ".geno", "rb") as geno_file:
    header = geno_file.read(20)         # Ignoring hashes
# NOTE: the file is opened in binary mode, so header fields are bytes:
#if header.split()[0] != b"GENO":
#    raise Exception("This does not look like a packedancestrymap file")
nind, nsnp = [int(x) for x in header.split()[1:3]]
# Bytes per SNP record: 2 bits per individual, but never fewer than 48 bytes.
rlen = max(48, int(np.ceil(nind * 2 / 8)))    # assuming sizeof(char)=1 here

# Load all bytes and drop the first rlen bytes (the header record).
geno = np.fromfile(folder + ".geno", dtype='uint8')[rlen:]  # without header
geno.shape = (nsnp, rlen)                                   # one row per SNP
geno = np.unpackbits(geno, axis=1)[:, :(2 * nind)]          # drop padding bits
geno = 2 * geno[:, ::2] + geno[:, 1::2]                     # two bits -> value 0..3
geno[geno == 3] = 9                                         # set missing values
geno = geno[:, 0]                                           # keep only the first individual

print(nind)
print(nsnp)

278
1233013
CPU times: user 2.15 s, sys: 483 ms, total: 2.64 s
Wall time: 2.64 s


## Run it piece by piece
(for debugging)

In [58]:
%%time
# Read only the header of the packed .geno file. The context manager makes sure
# the handle is closed again (the original never closed it).
with open(folder + ".geno", "rb") as geno_file:
    header = geno_file.read(20)         # Ignoring hashes
# Fields 1 and 2 of the header are the number of individuals and of SNPs.
nind, nsnp = [int(x) for x in header.split()[1:3]]
# Bytes per SNP record: 2 bits per individual, but never fewer than 48 bytes.
rlen = max(48, int(np.ceil(nind * 2 / 8)))    # assuming sizeof(char)=1 here

CPU times: user 1.18 ms, sys: 0 ns, total: 1.18 ms
Wall time: 1.21 ms


In [59]:
nind

278

In [60]:
rlen

70

In [61]:
%%time
# Read every byte of the .geno file, skip the first rlen bytes (header) and
# view the remainder as one row of rlen bytes per SNP.
raw_bytes = np.fromfile(folder + ".geno", dtype=np.uint8)
geno = raw_bytes[rlen:].reshape(nsnp, rlen)
print(geno.shape)

(1233013, 70)
CPU times: user 0 ns, sys: 148 ms, total: 148 ms
Wall time: 151 ms


### Unpack only the first couple of individuals
(set by `n_sub`)

In [62]:
n_sub = 8  # Number of leading individuals to unpack
# Each individual takes 2 bits, so round the bit count up to whole bytes.
rlen_sub = int(np.ceil(2 * n_sub / 8))
print(rlen_sub)

2


In [63]:
# Restrict to the leading bytes that hold the first n_sub individuals,
# expand them to bits and combine each bit pair into one genotype value.
geno_sub = np.unpackbits(geno[:, :rlen_sub], axis=1)
geno_sub = geno_sub[:, :(2 * n_sub)]
high_bits = geno_sub[:, ::2]
low_bits = geno_sub[:, 1::2]
geno_sub = 2 * high_bits + low_bits
geno_sub[geno_sub == 3] = 9   # set missing values

print(geno_sub.shape)

(1233013, 8)


In [64]:
np.sum(geno_sub[:,6]!=9)

1051699

# And now write our own class to extract a specific individual

In [39]:
class EigenstratLoad(object):
    """Load genotypes of single individuals from a packed Eigenstrat .geno file.

    The file is read as raw bytes: the first ``rlen`` bytes are the header
    record, followed by one record of ``rlen`` bytes per SNP. Within a record
    every individual takes 2 bits (4 individuals per byte); the value 3 marks
    a missing genotype.

    Attributes:
        base_path: Path of the dataset without the ".geno" extension.
        nind: Number of individuals, parsed from the header.
        nsnp: Number of SNPs, parsed from the header.
        rlen: Length of one record in bytes.
    """
    base_path = "./Data/ReichLabEigenstrat/Raw/v37.2.1240K"  # Default dataset
    nsnp = 0
    nind = 0
    rlen = 0

    def __init__(self, base_path=""):
        """Read the header of ``base_path + ".geno"``.

        Args:
            base_path: Dataset path without extension. An empty string
                selects the class-level default.
        """
        if len(base_path) == 0:
            base_path = self.base_path  # Go to Default
        # Remember the path on the instance so give_bit_file() reads the same
        # file whose header was parsed here (it used to read the class default).
        self.base_path = base_path
        with open(base_path + ".geno", "rb") as geno_file:
            header = geno_file.read(20)         # Ignoring hashes
        self.nind, self.nsnp = [int(x) for x in header.split()[1:3]]
        # Must use self.nind: a bare `nind` is undefined here (or, in a
        # notebook, a stale global belonging to a different file).
        self.rlen = max(48, int(np.ceil(self.nind * 2 / 8)))    # assuming sizeof(char)=1 here
        print(f"File with {self.nind} Individuals and {self.nsnp} SNPs")

    def get_geno_i(self, i, missing_val=3):
        """Return the genotypes of individual ``i`` across all SNPs.

        Args:
            i: Zero-based index of the individual.
            missing_val: Value written where the file stores 3 (missing).

        Returns:
            1-D uint8 array of length ``nsnp``.

        Raises:
            IndexError: If ``i`` is not in ``[0, nind)``. Without this check an
                out-of-range index would silently decode padding bits.
        """
        if not 0 <= i < self.nind:
            raise IndexError(f"Individual index {i} out of range [0, {self.nind})")
        batch, eff = self.get_enc_index(i)
        geno = self.give_bit_file()  # Load the whole bit file

        geno_sub = geno[:, [batch]]  # Byte holding individual i, kept 2-D
        geno_sub = np.unpackbits(geno_sub, axis=1)[:, 2 * eff:2 * eff + 2]
        geno_sub = 2 * geno_sub[:, 0] + geno_sub[:, 1]
        geno_sub[geno_sub == 3] = missing_val  # set missing values
        return geno_sub

    def get_geno_indvidiaul_iid(self, iid):
        """Return Genotypes of Individual iid"""
        i = self.get_i_from_iid(iid)
        g = self.get_geno_i(i)
        return g

    def get_geno_individual_iid(self, iid):
        """Correctly spelled alias of ``get_geno_indvidiaul_iid``."""
        return self.get_geno_indvidiaul_iid(iid)

    def get_i_from_iid(self, iid):
        """Get the Index of Individual iid"""
        raise NotImplementedError("Implement this")

    def give_bit_file(self):
        """Return the genotype bytes as a uint8 array of shape (nsnp, rlen)."""
        # Skip the first rlen bytes, i.e. the header record.
        geno = np.fromfile(self.base_path + ".geno", dtype='uint8')[self.rlen:]
        geno.shape = (self.nsnp, self.rlen)
        return geno

    def get_enc_index(self, i):
        """Get the Index in the Encoding and the modulo 4 value
        (position in batch)"""
        # Four individuals share one byte (2 bits each).
        rlen_sub = i // 4
        mod_i = i % 4  # Position of the individual within that byte
        return rlen_sub, mod_i

### Test the Code

In [63]:
%%time
# Build the loader for the Raw dataset and fetch the genotypes of individual 0.
raw_base_path = "./Data/ReichLabEigenstrat/Raw/v37.2.1240K"
es = EigenstratLoad(base_path=raw_base_path)
g = es.get_geno_i(0)

File with 5081 Individuals and 1233013 SNPs
CPU times: user 31.7 ms, sys: 848 ms, total: 880 ms
Wall time: 887 ms


In [80]:
%%time
# Report the coverage of the first ten individuals; a genotype of 3 means missing.
es = EigenstratLoad("./Data/ReichLabEigenstrat/Raw/v37.2.1240K")
for g in map(es.get_geno_i, range(10)):
    print(f"Avg. Coverage: {1-np.mean(g==3):.6f}")
    print(f"SNps covered: {np.sum(g!=3)}")

File with 5081 Individuals and 1233013 SNPs
Avg. Coverage: 0.048705
SNps covered: 60054
Avg. Coverage: 0.105648
SNps covered: 130265
Avg. Coverage: 0.112596
SNps covered: 138832
Avg. Coverage: 0.056045
SNps covered: 69104
Avg. Coverage: 0.435407
SNps covered: 536862
Avg. Coverage: 0.055553
SNps covered: 68498
Avg. Coverage: 0.631162
SNps covered: 778231
Avg. Coverage: 0.026451
SNps covered: 32614
Avg. Coverage: 0.091817
SNps covered: 113211
Avg. Coverage: 0.055737
SNps covered: 68724
CPU times: user 364 ms, sys: 7.19 s, total: 7.55 s
Wall time: 7.55 s


# Test the Python file

In [2]:
sys.path.append("./PackagesSupport/loadEigenstrat/")  # Since now we are in the Root Directory
from loadEigenstrat import EigenstratLoad

In [3]:
es = EigenstratLoad(base_path = "./Data/ReichLabEigenstrat/Raw/v37.2.1240K")

File with 5081 Individuals and 1233013 SNPs


In [4]:
es.df_snp

array([['rs3094315', '1', '0.020130', '752566', 'G', 'A'],
       ['rs12124819', '1', '0.020242', '776546', 'A', 'G'],
       ['rs28765502', '1', '0.022137', '832918', 'T', 'C'],
       ...,
       ['rs28628009', '24', '0', '59033110', 'A', 'T'],
       ['rs55686319', '24', '0', '59033139', 'T', 'C'],
       ['rs75089321', '24', '0', '59033249', 'G', 'T']], dtype='<U15')

In [42]:
# Coverage of the first ten individuals (genotype 3 = missing).
for g in map(es.get_geno_i, range(10)):
    print(f"Avg. Coverage: {1-np.mean(g==3):.6f}")
    print(f"SNps covered: {np.sum(g!=3)}")

Avg. Coverage: 0.048705
SNps covered: 60054
Avg. Coverage: 0.105648
SNps covered: 130265
Avg. Coverage: 0.112596
SNps covered: 138832
Avg. Coverage: 0.056045
SNps covered: 69104
Avg. Coverage: 0.435407
SNps covered: 536862
Avg. Coverage: 0.055553
SNps covered: 68498
Avg. Coverage: 0.631162
SNps covered: 778231
Avg. Coverage: 0.026451
SNps covered: 32614
Avg. Coverage: 0.091817
SNps covered: 113211
Avg. Coverage: 0.055737
SNps covered: 68724


In [28]:
# Coverage of a single individual further down the file (genotype 3 = missing).
ind_index = 953
g = es.get_geno_i(ind_index)
print(f"Avg. Coverage: {1-np.mean(g==3):.6f}")
print(f"SNps covered: {np.sum(g!=3)}")

Avg. Coverage: 0.921500
SNps covered: 1136222


### And now try out the Olalde Paper

In [5]:
es = EigenstratLoad(base_path = "./Data/ReichLabEigenstrat/Olalde2019/Olalde_et_al_genotypes")

File with 278 Individuals and 1233013 SNPs


In [6]:
# Coverage of the first ten individuals of the currently loaded dataset
# (genotype 3 = missing).
for g in map(es.get_geno_i, range(10)):
    print(f"Avg. Coverage: {1-np.mean(g==3):.6f}")
    print(f"SNps covered: {np.sum(g!=3)}")

Avg. Coverage: 0.122928
SNps covered: 151572
Avg. Coverage: 0.436765
SNps covered: 538537
Avg. Coverage: 0.047256
SNps covered: 58267
Avg. Coverage: 0.576944
SNps covered: 711380
Avg. Coverage: 0.120054
SNps covered: 148028
Avg. Coverage: 0.347864
SNps covered: 428921
Avg. Coverage: 0.852950
SNps covered: 1051699
Avg. Coverage: 0.098032
SNps covered: 120875
Avg. Coverage: 0.517977
SNps covered: 638672
Avg. Coverage: 0.203932
SNps covered: 251451


# Area 51

In [49]:
%%time
np.loadtxt("./Data/ReichLabEigenstrat/Olalde2019/Olalde_et_al_genotypes.snp", dtype="str")

CPU times: user 6.68 s, sys: 180 ms, total: 6.86 s
Wall time: 6.93 s


array([['rs3094315', '1', '0.020130', '752566', 'G', 'A'],
       ['rs12124819', '1', '0.020242', '776546', 'A', 'G'],
       ['rs28765502', '1', '0.022137', '832918', 'T', 'C'],
       ...,
       ['rs28628009', '24', '0', '59033110', 'A', 'T'],
       ['rs55686319', '24', '0', '59033139', 'T', 'C'],
       ['rs75089321', '24', '0', '59033249', 'G', 'T']], dtype='<U15')

In [14]:
%%time
# Parse the whitespace-separated .snp table. The default C parser handles
# sep=r"\s+" natively, so forcing the much slower engine="python" is not
# needed for this ~1.2 million row file.
df_snp = pd.read_csv("./Data/ReichLabEigenstrat/Olalde2019/Olalde_et_al_genotypes.snp", header=None,
                     sep=r"\s+")
df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"] # Set the Columns

CPU times: user 9.65 s, sys: 131 ms, total: 9.78 s
Wall time: 9.79 s


In [20]:
ch = 1  # Chromosome to display
on_chromosome = df_snp["chr"] == ch
df_snp[on_chromosome]

Unnamed: 0,SNP,chr,map,pos,ref,alt
0,rs3094315,1,0.020130,752566,G,A
1,rs12124819,1,0.020242,776546,A,G
2,rs28765502,1,0.022137,832918,T,C
3,rs7419119,1,0.022518,842013,T,G
4,rs950122,1,0.022720,846864,G,C
5,rs113171913,1,0.023436,869303,C,T
6,rs13302957,1,0.024116,891021,G,A
7,rs59986066,1,0.024183,893462,C,T
8,rs112905931,1,0.024260,896271,C,T
9,rs6696609,1,0.024457,903426,C,T


In [41]:
df_t = df_snp.loc[df_snp["chr"]==1, ["ref", "alt"]]

In [42]:
ref, alt = df_t["ref"].values, df_t["alt"].values

In [44]:
alt

array(['A', 'G', 'C', ..., 'A', 'T', 'A'], dtype=object)

In [48]:
# Positions of all SNPs on chromosome ch, shown as a plain array.
chr_mask = df_snp["chr"] == ch
pos = df_snp.loc[chr_mask, "pos"]
pos.values

array([   752566,    776546,    832918, ..., 249198164, 249202567,
       249210707])

In [51]:
%%time
# Parse the whitespace-separated .ind table (one row per individual).
ind_path = "./Data/ReichLabEigenstrat/Olalde2019/Olalde_et_al_genotypes.ind"
df_ind = pd.read_csv(ind_path, sep=r"\s+", header=None, engine="python")
df_ind.columns = ["iid", "sex", "cls"]  # Set the Columns

CPU times: user 2.38 ms, sys: 3.95 ms, total: 6.33 ms
Wall time: 5.9 ms


In [53]:
len(df_ind)

278

In [55]:
df_ind.head(5)

Unnamed: 0,iid,sex,cls
0,I3807,M,SE_Iberia_c.10-16CE
1,I3981,M,SE_Iberia_c.5-8CE
2,I4055,M,SE_Iberia_c.3-4CE
3,I4246,M,C_Iberia_CA_Afr
4,I2472,M,N_Iberia_BA


In [61]:
np.where(df_ind["iid"]=='I4055')[0][0]

2