# Modify the Sardinia HDF5 (in particular, the HO samples)
Replace the sample names stored in the HDF5 with the updated IIDs from the metadata table.

In [1]:
import multiprocessing as mp
import os as os
import socket
import subprocess
import sys as sys

import h5py
import numpy as np
import pandas as pd

### Pick the right path (whether on cluster or at home)
# The hostname decides which checkout of the repository is used as the root.
# Every relative path further down is resolved against that root.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: stop here rather than continue with `path` undefined
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Make the repository root the working directory (in line with Atom default)
print(os.getcwd()) # Show the current working directory. Should be the repository root chosen above
print(f"CPU Count: {mp.cpu_count()}")


# Must come after os.chdir: "./package/" is relative to the repository root
sys.path.append("./package/")  # Since now we are in the Root Directory
from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
#from hapsburg.PackagesSupport.parallel_runs.helper_functions import prepare_path, multi_run, combine_individual_data
from hapsburg.PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

### Load the Meta File
meta_path = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"
meta_df = pd.read_csv(meta_path)
# Keep the rows from positional index 1098 onward.
# NOTE(review): the meaning of 1098 is not documented here and mod_df is not
# used in the cells visible in this notebook -- confirm it is still needed.
mod_df = meta_df[1098:]

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [5]:
def load_h5(path="./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"):
    """Open the HDF5 file at `path` read-only, print a short summary, return the handle.

    The summary reports the first two dimensions of the "calldata/GT" dataset
    (printed as the number of variants and of individuals) and the path.

    Parameters
    ----------
    path : str
        Location of the HDF5 file to open.

    Returns
    -------
    h5py.File
        The open, read-only file handle. The caller is responsible for closing
        it (or for using it as a context manager).

    Raises
    ------
    Whatever `h5py.File` or the dataset lookup raises (for example KeyError if
    "calldata/GT" is absent). In that case the file is closed before the
    exception propagates, so no handle is leaked.
    """
    f = h5py.File(path, "r")
    try:
        gt_shape = np.shape(f["calldata/GT"])  # Look the dataset up once, reuse for both lines
        print("\nLoaded %i variants" % gt_shape[0])
        print("Loaded %i individuals" % gt_shape[1])
        print(f"HDF5 loaded from {path}")
    except Exception:
        # The caller never receives the handle on failure, so close it here
        f.close()
        raise
    return f

def load_iid_df(meta_path_targets = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"):
    """Read the metadata CSV and build an `iid` -> `full_iid` lookup.

    Returns a tuple (metadata DataFrame, dict keyed by the "iid" column with
    the matching "full_iid" entries as values).
    """
    meta = pd.read_csv(meta_path_targets)
    # Pair the two columns row by row; a repeated iid keeps its last full_iid
    iid_to_full = {iid: full_iid for iid, full_iid in zip(meta["iid"], meta["full_iid"])}
    return meta, iid_to_full

### Loading f for checking
#f = load_h5(path="./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5")
#len(set(f["samples"]))

### First, copy over the original
(runs shell command)

In [15]:
path_h5 = "./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"  # original, only read
path_h5_mod = "./Data/Marcus2019_1240k/sardinia_hapsburg.h5"  # working copy that gets modified
# Run `cp` with an argument list, so the paths are not word-split by a shell.
# check=True raises CalledProcessError if the copy fails, which stops the
# notebook before a later cell can modify a stale or missing copy.
subprocess.run(["cp", path_h5, path_h5_mod], check=True);

### Now fill in new sample names

In [24]:
### Load the Metadata
df, dct_iid = load_iid_df()

### Sanity Check
# The write below is positional, so the samples of the original file must
# match the "full_iid" column entry for entry before anything is overwritten.
with load_h5(path=path_h5) as f:      # closed on exit, also when the check fails
    assert((f["samples"][:] == df["full_iid"]).all())

### Now write it over
with h5py.File(path_h5_mod, 'r+') as f1:  # open the copy for in-place editing; closed on exit, also on error
    data = f1['samples']                  # handle to the sample-name dataset
    data[...] = df["iid"]                 # assign new values to data

# Only reached when the write above succeeded
print(f"Successfully written to {path_h5_mod}")


Loaded 1145647 variants
Loaded 4616 individuals
HDF5 loaded from ./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5
Successfully written to ./Data/Marcus2019_1240k/sardinia_hapsburg.h5


# Area 51

In [27]:
# Re-open the modified copy and read all sample names into memory.
# The context manager closes the file even if the read fails.
with load_h5(path=path_h5_mod) as f:
    samples = f["samples"][:]


Loaded 1145647 variants
Loaded 4616 individuals
HDF5 loaded from ./Data/Marcus2019_1240k/sardinia_hapsburg.h5


In [29]:
# Spot check: show every sample name that contains "Hadza"
[sample_name for sample_name in samples if "Hadza" in sample_name]

['Hadza_0', 'Hadza_1', 'Hadza_2', 'Hadza_3', 'Hadza_4']