# Notebook to test whether D. Reich's big Eigenstrat file can be loaded

In [1]:
import numpy as np
import pandas as pd
import os as os
import sys as sys
import multiprocessing as mp
import socket

### Select the project root for this machine (cluster or local workstation)
socket_name = socket.gethostname()
print(socket_name)

on_local = (socket_name == "VioletQueen")
on_midway = socket_name.startswith("midway2")

# Bail out first on unknown hosts, so what remains is a plain two-way choice
if not (on_local or on_midway):
    raise RuntimeWarning("Not compatible machine. Check!!")

if on_midway:
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # Root on the Midway cluster
else:
    path = "/home/harald/git/HAPSBURG/"   # Root on Harald's machine

os.chdir(path)  # Relative paths below are resolved against this root

# The import has to follow the sys.path change: "./Python3/" is relative to the new cwd
sys.path.append("./Python3/")
from hmm_inference import HMM_Analyze
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd())  # Working directory after the chdir above (the repository root)
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


# Direct Code Testing Playground
(For Python package test see below)

In [2]:
### Base Path
# Common file prefix (no extension); the cells below append ".snp" / ".geno" to it.
folder = "./Data/Olalde19/Olalde_et_al_genotypes"
#folder = "./Data/Freilich20/AncCroatia1240KallSNPs"

### Load the SNP File

In [7]:
%%time
# Reference timing on Harald's machine: about 7.14s
snp_path = folder + ".snp"
rsnp = np.loadtxt(snp_path, dtype="str")  # every field is kept as a string
n_loaded = len(rsnp)
print(f"Loaded {n_loaded} SNPs")

Loaded 1233013 SNPs
CPU times: user 5.6 s, sys: 320 ms, total: 5.92 s
Wall time: 5.93 s


## Load the Geno File

In [57]:
%%time
geno_path = folder + ".geno"

# Read only the start of the header record. The `with` block closes the handle
# again; the genotype bytes are read separately through np.fromfile below.
with open(geno_path, "rb") as geno_file:
    header = geno_file.read(20)         # Ignoring hashes

# The file is opened in binary mode, so the magic token is bytes, not str.
if not header.startswith(b"GENO"):
    raise ValueError("This does not look like a packedancestrymap file")
nind, nsnp = [int(x) for x in header.split()[1:3]]

# Bytes per record: two bits per individual, but never fewer than 48 bytes
rlen = max(48, int(np.ceil(nind*2/8)))    # assuming sizeof(char)=1 here

# Skip the first rlen bytes (header record); what remains is one rlen-byte row per SNP
geno = np.fromfile(geno_path, dtype='uint8')[rlen:]  # without header
geno = geno.reshape(nsnp, rlen)
geno = np.unpackbits(geno, axis=1)[:, :(2*nind)]     # drop the padding bits of each row
geno = 2*geno[:, ::2] + geno[:, 1::2]                # combine each bit pair into a code 0..3
geno[geno == 3] = 9                                  # set missing values
geno = geno[:, 0]                                    # keep only the first individual

print(nind)
print(nsnp)

278
1233013
CPU times: user 2.15 s, sys: 483 ms, total: 2.64 s
Wall time: 2.64 s


## Run it piece by piece
(for debugging)

In [58]:
%%time
# Parse the individual and SNP counts from the start of the packed .geno file.
# The `with` block guarantees the handle is closed once the header bytes are read.
with open(folder + ".geno", "rb") as geno_file:
    header = geno_file.read(20)         # Ignoring hashes
nind, nsnp = [int(x) for x in header.split()[1:3]]

# Bytes per record: two bits per individual, but never fewer than 48 bytes
rlen = max(48, int(np.ceil(nind*2/8)))    # assuming sizeof(char)=1 here

CPU times: user 1.18 ms, sys: 0 ns, total: 1.18 ms
Wall time: 1.21 ms


In [59]:
# Individual count parsed from the .geno header
nind

278

In [60]:
# Record length in bytes: max(48, ceil(2 * nind / 8))
rlen

70

In [61]:
%%time
# Read the whole file as raw bytes, cut off the header record and lay the rest
# out as one row of rlen bytes per SNP.
packed_bytes = np.fromfile(folder + ".geno", dtype='uint8')
geno = packed_bytes[rlen:].reshape(nsnp, rlen)  # without header
print(geno.shape)

(1233013, 70)
CPU times: user 0 ns, sys: 148 ms, total: 148 ms
Wall time: 151 ms


### Unpack only the first couple of individuals
(the number of individuals is set by `n_sub`)

In [62]:
n_sub = 8                                   # number of leading individuals to unpack
bits_needed = 2 * n_sub                     # two bits per individual
rlen_sub = int(np.ceil(bits_needed / 8))    # whole bytes needed to cover those bits
print(rlen_sub)

2


In [63]:
# Decode only the leading bytes that hold the first n_sub individuals.
packed_sub = geno[:, :rlen_sub]
bits_sub = np.unpackbits(packed_sub, axis=1)[:, :(2 * n_sub)]

# Each genotype is a bit pair: first bit weighs 2, second bit weighs 1
first_bits, second_bits = bits_sub[:, ::2], bits_sub[:, 1::2]
geno_sub = 2 * first_bits + second_bits
geno_sub[geno_sub == 3] = 9                       # set missing values
# (geno_sub[:, 0] would select only the first individual)

print(np.shape(geno_sub))

(1233013, 8)


In [64]:
# Number of SNPs with a non-missing code (!= 9) in column index 6 of geno_sub
np.sum(geno_sub[:,6]!=9)

1051699

# Test the Python Support Package

In [2]:
sys.path.append("./PackagesSupport/loadEigenstrat/")  # Since now we are in the Root Directory
from loadEigenstrat import EigenstratLoad, load_eigenstrat

In [3]:
%%time
### Packed
# (disabled alternative: class-based loader on the Olalde19 files)
#basepath = "./Data/Olalde19/Olalde_et_al_genotypes"
#es = EigenstratLoad(base_path = basepath, sep="\t")

### Unpacked
# File prefix without extension, handed to the project loader below.
basepath = "./Data/Freilich20/AncCroatia1240KallSNPs"
# NOTE(review): `sep` is a raw string here (backslash + t), unlike the "\t" used
# in the disabled call above — presumably it is treated as a regex separator; confirm
# in loadEigenstrat.
es = load_eigenstrat(basepath, output=True, 
                     sep=r"\t", packed=False)

3 Eigenstrat Files with 28 Individuals and 1233013 SNPs
CPU times: user 5.67 s, sys: 245 ms, total: 5.92 s
Wall time: 5.92 s


In [4]:
# Preview the first three rows of the loader's SNP table
es.df_snp.head(3)

Unnamed: 0,SNP,chr,map,pos,ref,alt
0,1_752566,1,0,752566,G,A
1,1_776546,1,0,776546,A,G
2,1_832918,1,0,832918,T,C


In [5]:
# Preview the first three rows of the loader's individual table
es.df_ind.head(3)

Unnamed: 0,iid,sex,cls
0,JAG34,M,Croatia_Jagodnjak_MBA
1,JAG58,M,Croatia_Jagodnjak_MBA
2,JAG06,M,Croatia_Jagodnjak_MBA


In [5]:
# Report the non-missing fraction and count for the first three individuals.
for i in range(3):
    g = es.get_geno_i(i)
    missing = (g == 3)   # the code 3 is treated as "missing" in this report
    print(f"Avg. Coverage: {1 - np.mean(missing):.6f}")
    print(f"SNps covered: {np.sum(~missing)}")

Avg. Coverage: 0.745962
SNps covered: 919781
Avg. Coverage: 0.618166
SNps covered: 762207
Avg. Coverage: 0.686809
SNps covered: 846845


In [6]:
# Same report as for the first three individuals, now for index 5.
g = es.get_geno_i(5)
missing = (g == 3)   # the code 3 is treated as "missing" in this report
print(f"Avg. Coverage: {1 - np.mean(missing):.6f}")
print(f"SNps covered: {np.sum(~missing)}")

Avg. Coverage: 0.596029
SNps covered: 734912


In [6]:
# Returns a pair of allele arrays (see output).
# NOTE(review): the argument is presumably a chromosome number, as in give_positions(ch=3) — confirm in loadEigenstrat
es.give_ref_alt(3)

(array(['A', 'G', 'T', ..., 'C', 'A', 'T'], dtype=object),
 array(['C', 'A', 'C', ..., 'T', 'G', 'G'], dtype=object))

In [7]:
# NOTE(review): presumably individual index 0 and SNP indices 0..9 — confirm the argument order in loadEigenstrat
gt = es.extract_snps(0, range(10))

In [19]:
# Positions for ch=3 (presumably chromosome 3 — verify against loadEigenstrat)
ps = es.give_positions(ch=3)

# Area 51

### Test Loading Suzanne's data

In [None]:
# File prefix (no extension); ".snp", ".ind" and ".geno" are appended in the cells below.
basepath = "./Data/Freilich20/AncCroatia1240KallSNPs"

In [14]:
%%time
# Read the tab-separated .snp table (the file has no header row) and name its six columns.
snp_columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
path_snp = f"{basepath}.snp"
df_snp = pd.read_csv(path_snp, sep="\t", header=None, engine="python")
df_snp.columns = snp_columns  # Set the Columns

CPU times: user 7.89 s, sys: 143 ms, total: 8.04 s
Wall time: 8.05 s


In [16]:
# Preview the first five rows of the SNP table
df_snp.head(5)

Unnamed: 0,SNP,chr,map,pos,ref,alt
0,1_752566,1,0,752566,G,A
1,1_776546,1,0,776546,A,G
2,1_832918,1,0,832918,T,C
3,1_842013,1,0,842013,T,G
4,1_846864,1,0,846864,G,C


In [17]:
%%time
# Read the whitespace-separated .ind table (the file has no header row) and name its three columns.
ind_columns = ["iid", "sex", "cls"]
path_ind = f"{basepath}.ind"
df_ind = pd.read_csv(path_ind, sep=r"\s+", header=None, engine="python")
df_ind.columns = ind_columns  # Set the Columns

CPU times: user 5.7 ms, sys: 105 µs, total: 5.8 ms
Wall time: 12.3 ms


In [52]:
%%time
basepath = "./Data/Freilich20/AncCroatia1240KallSNPs"
geno_path = basepath + ".geno"
# An integer delimiter makes genfromtxt split each line into fixed-width fields,
# here one character per field.
geno = np.genfromtxt(geno_path, dtype='i1', delimiter=1)

### A file with a single individual comes back 1-D; restore the second dimension.
if geno.ndim == 1:
    geno = geno.reshape(len(geno), 1)

CPU times: user 29.3 s, sys: 383 ms, total: 29.7 s
Wall time: 29.8 s


In [53]:
# Dimensions of the genotype matrix loaded from the .geno file
np.shape(geno)

(1233013, 28)

In [54]:
# Dimensions of the individual table (three named columns)
np.shape(df_ind)

(28, 3)

In [55]:
# Dimensions of the SNP table (six named columns)
np.shape(df_snp)

(1233013, 6)

In [58]:
# Second row (index 1) of the genotype matrix
geno[1]

array([9, 2, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 0, 9, 2, 2, 9, 9, 2, 9,
       2, 2, 2, 9, 2, 2], dtype=int8)

### Load Particular Individual

In [64]:
%%time
basepath = "./Data/Freilich20/AncCroatia1240KallSNPs"
ind_col = 27   # 0-based index of the single column to read
geno = np.genfromtxt(basepath + ".geno", dtype='i1', delimiter=1, usecols=ind_col)

### The reshape to (n, 1) for a single individual is disabled in this cell:
#if len(geno.shape)==1: 
#    geno.shape=(geno.shape[0],1)

CPU times: user 13.4 s, sys: 104 ms, total: 13.5 s
Wall time: 13.5 s
