# Notebook to test whether D. Reichs big Eigenstrat File can be loaded

In [2]:
import numpy as np
import pandas as pd
import os as os
import sys as sys
import multiprocessing as mp
import socket

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
elif socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

#from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

compute-e-16-236.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer
CPU Count: 28


# Direct Code Testing Playgroud
(For Python package test see below)

In [3]:
### Base Path
folder = "./Data/Lars20/1240k_ForHarald"
#folder = "./Data/Freilich20/AncCroatia1240KallSNPs"

### Load the SNP File

In [4]:
%%time
# On Harald's machine: About 7.14s
rsnp = np.loadtxt(folder + ".snp", dtype="str") 
print(f"Loaded {len(rsnp)} SNPs")

Loaded 1233013 SNPs
CPU times: user 9.53 s, sys: 359 ms, total: 9.89 s
Wall time: 9.81 s


In [5]:
rsnp

array([['rs3094315', '1', '0.007526', '752566', 'G', 'A'],
       ['rs12124819', '1', '0.007765', '776546', 'A', 'G'],
       ['rs28765502', '1', '0.008329', '832918', 'T', 'C'],
       ...,
       ['rs28628009', '24', '0.590331', '59033110', 'A', 'T'],
       ['rs55686319', '24', '0.590331', '59033139', 'T', 'C'],
       ['rs75089321', '24', '0.590332', '59033249', 'G', 'T']],
      dtype='<U15')

## Load the Geno File

In [57]:
%%time
geno_file = open(folder+".geno", "rb")
header=geno_file.read(20)         #Ignoring hashes
#if header.split()[0]!="GENO":
#    raise Exception("This does not look like a packedancestrymap file")
nind,nsnp=[int(x) for x in header.split()[1:3]]        
nind=nind
nsnp=nsnp
rlen=max(48,int(np.ceil(nind*2/8)))    #assuming sizeof(char)=1 here
geno_file.seek(rlen)         #set pointer to start of genotypes

geno=np.fromfile(folder+".geno", dtype='uint8')[rlen:] #without header
geno.shape=(nsnp, rlen)
geno=np.unpackbits(geno, axis=1)[:,:(2*nind)]
geno=2*geno[:,::2]+geno[:,1::2]
#geno=geno[:,:]
geno[geno==3]=9                       #set missing values
geno = geno[:,0]

print(nind)
print(nsnp)

278
1233013
CPU times: user 2.15 s, sys: 483 ms, total: 2.64 s
Wall time: 2.64 s


## Run it piece by piece
(for debugging)

In [58]:
%%time
geno_file = open(folder+".geno", "rb")
header=geno_file.read(20)         #Ignoring hashes
nind,nsnp=[int(x) for x in header.split()[1:3]]        
nind=nind
nsnp=nsnp
rlen=max(48,int(np.ceil(nind*2/8)))    #assuming sizeof(char)=1 here

CPU times: user 1.18 ms, sys: 0 ns, total: 1.18 ms
Wall time: 1.21 ms


In [59]:
nind

278

In [60]:
rlen

70

In [61]:
%%time
geno=np.fromfile(folder+".geno", dtype='uint8')[rlen:] #without header
geno.shape=(nsnp, rlen)
print(geno.shape)

(1233013, 70)
CPU times: user 0 ns, sys: 148 ms, total: 148 ms
Wall time: 151 ms


### Unpack only the first couple of individuals
(set by n unload)

In [62]:
n_sub =8
rlen_sub = int(np.ceil(n_sub*2/8))
print(rlen_sub)

2


In [63]:
geno_sub = geno[:,:rlen_sub]
geno_sub=np.unpackbits(geno_sub, axis=1)[:,:(2*n_sub)]
geno_sub=2*geno_sub[:,::2]+geno_sub[:,1::2]
#geno=geno[:,:]
geno_sub[geno_sub==3]=9                       #set missing values
#geno_sub = geno_sub[:,0]

print(np.shape(geno_sub))

(1233013, 8)


In [64]:
np.sum(geno_sub[:,6]!=9)

1051699

# Test the Python Support Package

In [3]:
sys.path.append("./PackagesSupport/loadEigenstrat/")  # Since now we are in the Root Directory
from loadEigenstrat import EigenstratLoad, load_eigenstrat

In [4]:
%%time
### Packed
#basepath = "./Data/Olalde19/Olalde_et_al_genotypes"
#es = EigenstratLoad(base_path = basepath, sep="\t")

### Unpacked
basepath = "./Data/Freilich20/AncCroatia1240KallSNPs"
es = load_eigenstrat(basepath, output=True, 
                     sep=r"\t", packed=False)

3 Eigenstrat Files with 28 Individuals and 1233013 SNPs
CPU times: user 6.58 s, sys: 437 ms, total: 7.02 s
Wall time: 7.58 s


In [5]:
es.df_snp.head(3)

Unnamed: 0,SNP,chr,map,pos,ref,alt
0,1_752566,1,0,752566,G,A
1,1_776546,1,0,776546,A,G
2,1_832918,1,0,832918,T,C


In [6]:
es.df_ind.head(3)

Unnamed: 0,iid,sex,cls
0,JAG34,M,Croatia_Jagodnjak_MBA
1,JAG58,M,Croatia_Jagodnjak_MBA
2,JAG06,M,Croatia_Jagodnjak_MBA


In [7]:
for i in range(3):
    g = es.get_geno_i(i)
    print(f"Avg. Coverage: {1 - np.mean(g==3):.6f}")
    print(f"SNps covered: {np.sum(g!=3)}")

Avg. Coverage: 0.745962
SNps covered: 919781
Avg. Coverage: 0.618166
SNps covered: 762207
Avg. Coverage: 0.686809
SNps covered: 846845


In [6]:
g = es.get_geno_i(5)
print(f"Avg. Coverage: {1-np.mean(g==3):.6f}")
print(f"SNps covered: {np.sum(g!=3)}")

Avg. Coverage: 0.596029
SNps covered: 734912


In [8]:
es.give_ref_alt(3)

(array(['A', 'G', 'T', ..., 'C', 'A', 'T'], dtype=object),
 array(['C', 'A', 'C', ..., 'T', 'G', 'G'], dtype=object))

In [9]:
gt = es.extract_snps(0, range(10))

In [10]:
ps = es.give_positions(ch=3)

In [11]:
gt

array([[ 1, -1,  0,  0,  1, -1,  1, -1, -1,  0],
       [ 1, -1,  0,  0,  1, -1,  1, -1, -1,  0]], dtype=int8)

In [6]:
def load_ind_df(base_path, sep=r"\s+"):
    """Load the Individual dataframe.
    Uses self.base_path
    sep: What separator to use when loading the File"""
    path_ind = base_path + ".ind"
    df_ind = pd.read_csv(path_ind, header=None,
                         sep=r"\s+", engine="python")
    df_ind.columns = ["iid", "sex", "cls"]  # Set the Columns
    return df_ind

def load_snp_df(base_path, sep=r"\s+"):
    """Load the SNP dataframe.
    Uses self.base_path
    sep: What separator to use when loading the File"""
    path_snp = base_path + ".snp"
    df_snp = pd.read_csv(path_snp, header=None,
                         sep=sep, engine="python")
    df_snp.columns = ["SNP", "chr", "map",
                      "pos", "ref", "alt"]  # Set the Columns
    return df_snp

In [9]:
folder = "./Data/Lars20/1240k_ForHarald"

df = load_snp_df(folder, sep="\t")

ValueError: Length mismatch: Expected axis has 1 elements, new values have 6 elements

In [8]:
df

Unnamed: 0,SNP,chr,map,pos,ref,alt
0,rs3094315,1,0.007526,752566,G,A
1,rs12124819,1,0.007765,776546,A,G
2,rs28765502,1,0.008329,832918,T,C
3,rs7419119,1,0.008420,842013,T,G
4,rs950122,1,0.008469,846864,G,C
...,...,...,...,...,...,...
1233008,rs60847530,24,0.590306,59030572,T,A
1233009,snp_24_59033099,24,0.590331,59033099,T,C
1233010,rs28628009,24,0.590331,59033110,A,T
1233011,rs55686319,24,0.590331,59033139,T,C


# Area 51

### Test Loading Suzanne's data

In [55]:
np.shape(df_snp)

(1233013, 6)

### Load Particular Individual