# Modify the Sardinia HDF5 (in particular, the HO samples)
Add updated IIDs

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket
import h5py

### Pick the right path (whether on cluster or at home)
# The project folder is chosen from the host name; every relative path used
# further down ("./package/", "./Data/...") is resolved against it.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    # Unknown host: stop with a real error. RuntimeWarning is a warning
    # category meant for warnings.warn(), not for signalling a hard failure.
    raise RuntimeError("Not compatible machine. Check!!")

os.chdir(path)  # Make the HAPSBURG folder the working directory
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG folder chosen above
print(f"CPU Count: {mp.cpu_count()}")


# Must run after os.chdir: the package folder is given relative to the working directory
sys.path.insert(0,"./package/")  # hack to get local package first in path
from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
#from hapsburg.PackagesSupport.parallel_runs.helper_functions import prepare_path, multi_run, combine_individual_data
from hapsburg.PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh
from hapsburg.PackagesSupport.h5_python.h5_functions import save_data_h5

#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

### Load the Meta File
meta_path = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"
meta_df = pd.read_csv(meta_path)
# Keep the rows from position 1098 onwards.
# NOTE(review): what row 1098 marks is not visible in this notebook - confirm.
mod_df = meta_df[1098:]

midway2-0406.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [2]:
def load_h5(path="./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"):
    """Open the HDF5 file at `path` read-only and return the open handle.

    Prints the size of the first axis of "calldata/GT" as the number of
    variants and the size of the second axis as the number of individuals,
    followed by the path that was loaded.

    The file is not closed here: the caller receives the open h5py.File
    and has to close it.
    """
    h5_file = h5py.File(path, "r")
    gt_dataset = h5_file["calldata/GT"]
    print("\nLoaded %i variants" % np.shape(gt_dataset)[0])
    print("Loaded %i individuals" % np.shape(gt_dataset)[1])
    print(f"HDF5 loaded from {path}")
    return h5_file

def load_iid_df(meta_path_targets = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"):
    """Read the metadata CSV and build an IID translation table.

    Returns a tuple (df, dct): the metadata table as read by pandas, and a
    dictionary mapping every value of the "iid" column to the "full_iid"
    value of the same row (for repeated "iid" values the last row wins).
    """
    meta = pd.read_csv(meta_path_targets)
    iid_to_full = {iid: full_iid for iid, full_iid in zip(meta["iid"], meta["full_iid"])}
    return meta, iid_to_full

### Loading f for checking
#f = load_h5(path="./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5")
#len(set(f["samples"]))

### First copy over the original
(runs a shell command)

In [15]:
path_h5 = "./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"
path_h5_mod = "./Data/Marcus2019_1240k/sardinia_hapsburg.h5"
!cp $path_h5 $path_h5_mod

### Now fill in new sample names

In [24]:
### Load the Metadata
df, dct_iid = load_iid_df()

### Sanity Check
# The samples of the original HDF5 have to equal the "full_iid" column
# element by element before anything is overwritten. The context manager
# closes the handle even when the assertion fails.
with load_h5(path=path_h5) as f:
    assert (f["samples"][:] == df["full_iid"]).all(), \
        "Samples in the original HDF5 do not match the full_iid column of the meta file"

### Now write it over
# Open the copy in read/write mode and replace the whole samples dataset
# with the "iid" column; the file is closed on leaving the block, also on error.
with h5py.File(path_h5_mod, 'r+') as f1:
    data = f1['samples']                  # handle to the samples dataset
    data[...] = df["iid"]                 # assign new values to data

print(f"Successfully written to {path_h5_mod}")


Loaded 1145647 variants
Loaded 4616 individuals
HDF5 loaded from ./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5
Successfully written to ./Data/Marcus2019_1240k/sardinia_hapsburg.h5


# Delete modern Sardinians

In [3]:
path = "./Data/Marcus2019_1240k/sardinia_hapsburg.h5"

# Open the file read-only, print an overview of its layout and pull the
# sample IDs into memory before the file is closed again.
with h5py.File(path, "r") as f:
    print("Loaded HDF5")
    ad_shape = f["calldata/AD"].shape
    print("Loaded %i variants" % ad_shape[0])
    print("Loaded %i individuals" % ad_shape[1])
    print(list(f))
    print(list(f["calldata"]))
    print(list(f["variants"]))
    print(f["calldata/GT"].shape)
    samples = f["samples"][:]

# The meta table must have exactly one row per sample in the HDF5
df = pd.read_csv("./Data/Marcus2019_1240k/meta_rev_final.csv")
assert len(samples) == len(df)

Loaded HDF5
Loaded 1145647 variants
Loaded 4616 individuals
['calldata', 'samples', 'variants']
['AD', 'GT']
['AA', 'AF', 'AFR_AF', 'ALT', 'AMR_AF', 'CHROM', 'EAS_AF', 'EUR_AF', 'ID', 'POS', 'REF', 'SAS_AF']
(1145647, 4616, 2)


In [4]:
# Flag the rows of the "Chiang et al. 2016" study and keep a copy of all other rows
idx = df["study"].eq("Chiang et al. 2016")
df1 = df.loc[~idx].copy()
print(f"Filtered to {len(df1)}/{len(df)} non modern Sardinians")

Filtered to 3039/4616 non modern Sardinians


### Save downsampled Data

In [23]:
%%time
# Export a copy of the HDF5 without the samples of the "Chiang et al. 2016"
# study. The recorded run of this cell took about 16 minutes.

df = pd.read_csv("./Data/Marcus2019_1240k/meta_rev_final.csv")

with h5py.File("./Data/Marcus2019_1240k/sardinia_hapsburg.h5", "r") as f: # Opened read-only; the source file is not modified
    print("Loaded HDF5")
    print("Loaded %i variants" % np.shape(f["calldata/AD"])[0])
    print("Loaded %i individuals" % np.shape(f["calldata/AD"])[1])
    print(list(f))
    print(list(f["calldata"].keys()))
    print(list(f["variants"].keys()))
    samples = f["samples"][:]
    # Only the lengths are compared here.
    # NOTE(review): the mask below relies on the meta rows being in the same
    # order as the samples of the HDF5 - confirm.
    assert(len(df)==len(samples))
    ### Downsample to no modern Sardinians
    # idx is True for every meta row whose study is not "Chiang et al. 2016"
    idx = ~(df["study"]=="Chiang et al. 2016")
    print(f"Saving {np.sum(idx)}/{len(idx)} Samples")

    # `[:]` reads each dataset completely into memory; the sample axis
    # (second axis) is subset afterwards with the boolean mask.
    # NOTE(review): AD is cast to int8 - confirm that no value exceeds 127.
    # NOTE(review): rec and gp are passed as empty lists; how save_data_h5
    # treats them is not visible in this notebook.
    save_data_h5(gt=f["calldata/GT"][:][:,idx,:].astype("int8"),
                 ad=f["calldata/AD"][:][:,idx,:].astype("int8"),
                 ref=f["variants/REF"][:],
                 alt=f["variants/ALT"][:],
                 pos=f["variants/POS"][:],
                 rec=[],
                 samples=f["samples"][:][idx],
                 path="./Data/Marcus2019_1240k/sardinia_hapsburg_nomodsards.h5",
                 gp=[],
                 af=f["variants/AF"][:],
                 compression='gzip',
                 ad_group=True,
                 gt_type='int8')

Loaded HDF5
Loaded 1145647 variants
Loaded 4616 individuals
['calldata', 'samples', 'variants']
['AD', 'GT']
['AA', 'AF', 'AFR_AF', 'ALT', 'AMR_AF', 'CHROM', 'EAS_AF', 'EUR_AF', 'ID', 'POS', 'REF', 'SAS_AF']
Saving 3039/4616 Samples
Successfully saved 3039 individuals to: ./Data/Marcus2019_1240k/sardinia_hapsburg_nomodsards.h5
CPU times: user 14min 59s, sys: 1min 18s, total: 16min 17s
Wall time: 16min 20s


In [20]:
# Snapshot of the system memory statistics; the result is displayed as the cell output.
# psutil is imported in this cell only, not in the import cell at the top.
import psutil
psutil.virtual_memory()

svmem(total=67099480064, available=57126854656, percent=14.9, used=6342938624, free=57400274944, active=4076462080, inactive=434561024, buffers=0, cached=3356266496, shared=3220881408, slab=285446144)

In [19]:
!free | awk 'FNR == 3 {print $3/($3+$4)*100}'

awk: cmd. line:1: (FILENAME=- FNR=3) fatal: division by zero attempted


# Area 51

In [27]:
# Read the sample IDs of the modified HDF5 into memory.
# load_h5 returns an open h5py.File; the context manager closes it
# even if reading the samples fails.
with load_h5(path=path_h5_mod) as f:
    samples = f["samples"][:]


Loaded 1145647 variants
Loaded 4616 individuals
HDF5 loaded from ./Data/Marcus2019_1240k/sardinia_hapsburg.h5


In [29]:
# Show every sample ID that contains the substring "Hadza"
list(filter(lambda sample_id: "Hadza" in sample_id, samples))

['Hadza_0', 'Hadza_1', 'Hadza_2', 'Hadza_3', 'Hadza_4']

## Test no modern Sardinian hdf5

In [26]:
path = "./Data/Marcus2019_1240k/sardinia_hapsburg_nomodsards.h5"

# Open the exported file read-only, print an overview of its layout and
# keep the sample IDs plus the first 10 rows of GT and AD for inspection.
with h5py.File(path, "r") as f:
    print("Loaded HDF5")
    print("Loaded %i variants" % np.shape(f["calldata/AD"])[0])
    print("Loaded %i individuals" % np.shape(f["calldata/AD"])[1])
    print(list(f))
    print(list(f["calldata"].keys()))
    print(list(f["variants"].keys()))
    print(np.shape(f["calldata/GT"]))
    samples = f["samples"][:]
    gt = f["calldata/GT"][:10]
    ad = f["calldata/AD"][:10]

# The meta table must have exactly one row per sample in the HDF5
df = pd.read_csv("./Data/Marcus2019_1240k/meta_rev_final_nomod.tsv", sep="\t")
assert(len(df)==len(samples))

# A matching length alone does not show that the IDs were stored: an array of
# empty IDs has the right length too. Fail loudly if any sample ID is empty.
n_empty = sum(len(s) == 0 for s in samples)
assert n_empty == 0, f"{n_empty}/{len(samples)} sample IDs in {path} are empty"

Loaded HDF5
Loaded 1145647 variants
Loaded 3039 individuals
['calldata', 'samples', 'variants']
['AD', 'GT']
['AF_ALL', 'ALT', 'POS', 'REF']
(1145647, 3039, 2)


In [9]:
# Display the sample IDs currently held in `samples`.
# NOTE(review): the recorded output consists only of empty byte strings (b'') -
# confirm that the IIDs were actually written to the HDF5 these were read from.
samples

array([b'', b'', b'', ..., b'', b'', b''], dtype=object)