# Prepare the Sardinian Autosomes
Also scan for stretches of Homozygotes

In [1]:
import allel
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os   # For creating folders

In [2]:
### Some Helper Functions
# The same as over in prepare_Sardinian_X (or very similar)

def load_h5(path):
    """Open the HDF5 file at `path` read-only, print a summary, return the handle.

    The summary reports the first two dimensions of "calldata/GT"
    (printed as variants and individuals) and the keys of the
    "calldata" and "variants" groups.
    The returned h5py.File stays open; closing it is up to the caller."""
    handle = h5py.File(path, "r")

    # First two dimensions of the genotype dataset
    n_variants, n_individuals = np.shape(handle["calldata/GT"])[:2]
    print("\nLoaded %i variants" % n_variants)
    print("Loaded %i individuals" % n_individuals)

    # Fields available in each of the two groups
    for group in ("calldata", "variants"):
        print(list(handle[group].keys()))

    print(f"HDF5 loaded from {path}")
    return handle

def merge_2hdf(f, g, ch=1):
    """Find the variants shared by two HDF5 files. Return their Indices.

    Variants are first matched on "variants/POS". Of these, only the ones
    where both the REF allele and the ALT allele agree are kept.

    f: Sardinian HDF5. "variants/ALT" is compared as stored
       (assumes one ALT allele per variant -- TODO confirm)
    g: Reference HDF5. "variants/ALT" is indexed as [variant, 0],
       so only its first ALT allele is compared
    ch: Integer, which Chromosome to use.
        NOTE(review): currently unused. Positions are matched without looking
        at "variants/CHROM", so f and g presumably have to cover the same
        chromosome -- confirm against callers.

    Return: (i11, i22), two index arrays of equal length. i11 indexes the
    variants of f, i22 the variants of g; entry k of both refers to the
    same position."""

    pos1 = f["variants/POS"]
    pos2 = g["variants/POS"]

    ### Check if in both Datasets
    # With return_indices, i1 / i2 point at the first occurrence of every
    # shared position in pos1 / pos2
    b, i1, i2 = np.intersect1d(pos1, pos2, return_indices=True)

    print(f"\nIntersection on Positions: {len(b)}")

    ### Sanity Check if Reference is the same
    # Always read from the arguments f and g, never from notebook globals
    ref1 = np.array(f["variants/REF"])[i1]
    ref2 = np.array(g["variants/REF"])[i2]

    alt1 = np.array(f["variants/ALT"])[i1]
    alt2 = np.array(g["variants/ALT"])[i2, 0]

    ### Downsample to Site where both Ref and Alt are the same
    same = (ref1 == ref2)
    print(f"Nr of Matching Refs: {np.sum(same)} / {len(same)}")

    both_same = same & (alt1 == alt2)
    i11 = i1[both_same]
    i22 = i2[both_same]

    print(f"Full Intersection Ref/Alt Identical: {len(i11)} / {len(both_same)}")
    return i11, i22


def save_refs(gts, folder, cm_map, gt_individual=None):
    """Save reference haplotypes, the map and optionally one individual.

    Files written into `folder`:
      refs.csv: gts, integer formatted
      map.csv:  cm_map, with 8 decimals
      hap.csv:  gt_individual, integer formatted (only if given)

    gts: 2D genotype array. Dim. 0 is reported as individuals,
         Dim. 1 as markers
    folder: Into which folder to save. Created if it does not exist.
            File names are appended directly to this string, so it has
            to end with a path separator
    cm_map: Map values, saved to map.csv
    gt_individual: Genotypes Individual. If given (and not empty) it must
                   have length 2, and is saved as well"""
    print(f"Nr of Markers used: {np.shape(gts)[1]}") # Notice that 1 and 0 Dim. are shifted!
    print(f"Nr of individuals saved: {np.shape(gts)[0]}")

    # None and an empty sequence both mean "no individual given"
    has_individual = gt_individual is not None and len(gt_individual) > 0
    if has_individual:
        assert len(gt_individual) == 2, "gt_individual must have length 2"  # Sanity Check

    os.makedirs(folder, exist_ok=True)

    ### Save Reference Haplotypes
    np.savetxt(folder + "refs.csv", gts, delimiter=",", fmt='%i')

    ### Save the cmap
    np.savetxt(folder + "map.csv", cm_map, delimiter=",", fmt='%.8f')

    if has_individual:
        np.savetxt(folder + "hap.csv", gt_individual,
                   delimiter=",", fmt='%i')
    print(f"Successfully saved to {folder}")

#######################################
### Code for saving Haplo
def save_haplo(folder, ref_hdf5, obs_hdf5, ids_ref, id_obs, 
               marker_ref, marker_obs, r_map, error_rate=0, only_calls=True):
    """Save Folder with all relevant Information.

    Extracts the reference haplotypes and the genotypes of one observed
    individual at matching markers, and hands them to save_refs. The
    index of the observed individual is additionally saved to ind.csv.

    folder: Where to save to (file names are appended directly to it)
    ref_hdf5: Reference HDF5
    obs_hdf5: Observed HDF5
    ids_ref: Indices of reference Individuals to save
             (only their first haplotype is used)
    id_obs: Index of the observed Individual (both haplotypes are used)
    marker_ref: Indices of reference Markers
    marker_obs: Indices of observed Markers (same length as marker_ref)
    r_map: Map value per marker (same length as marker_ref)
    error_rate: Probability with which each called genotype of the
                observed individual is flipped (0 <-> 1). 0: no errors
    only_calls: Whether to Only Include Markers with Calls (judged on the
                first haplotype of the observed individual)"""
    assert(len(marker_ref)==len(marker_obs))  # If reference and observe dataset are the same
    assert(len(marker_ref)==len(r_map))  # If Linkage Map fits as well

    gts = ref_hdf5["calldata/GT"][:, ids_ref, 0] # Extract Individuals (first haplo)
    gts = gts[marker_ref, :].T       # Important: Swap of Dimensions!!
    print("Extraction Complete!")

    gts_ind = obs_hdf5["calldata/GT"][:, id_obs, :] # Extract Individual (both haplos)
    gts_ind = gts_ind[marker_obs, :].T 

    if only_calls:
        called = (gts_ind[0, :] > -1)  # Only Markers with calls
        print(f"Markers called {np.sum(called)} / {len(called)}")
        gts_ind = gts_ind[:, called]
        gts = gts[:, called]
        r_map = r_map[called]

    if error_rate>0:    # Introduce random genotype errors
        e_ids = np.random.binomial(1, error_rate, 
                                    size=np.shape(gts_ind)).astype("bool") # Boolean Sample Vector
        # Never flip missing calls: 1 - (-1) would give the invalid genotype 2.
        # The flip itself assumes 0/1 coded genotypes.
        e_ids &= (gts_ind >= 0)
        print(f"Introducing {np.sum(e_ids)} Random Genotype Errors")
        gts_ind[e_ids] = 1 - gts_ind[e_ids] # Do a Flip

    save_refs(gts, folder, r_map, gt_individual=gts_ind)

    np.savetxt(folder + "ind.csv", [id_obs], delimiter=",",  fmt='%i')   # Save which Individuals were used

# Code to prepare datasets

In [24]:
ch = 15        # Which Chromosome to analyze
h5_path_sard = "../../ancient-sardinia/output/h5/mod_reich_sardinia_ancients_mrg_dedup_3trm_anno.h5"
h5_path1000g = "../Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr" + str(ch) + ".hdf5"

meta_path = "../../ancient-sardinia/output/meta/meta_final.csv"

fs = load_h5(h5_path_sard)
f1000 = load_h5(h5_path1000g)
# Pass the analyzed chromosome on explicitly instead of relying on the default ch=1
i1, i2 = merge_2hdf(fs, f1000, ch=ch)   # i1: indices into fs, i2: indices into f1000

meta_df = pd.read_csv(meta_path)
# Sanity Check: one row of meta data per individual in the Sardinian HDF5
assert(len(meta_df)==np.shape(fs["calldata/GT"])[1])
#map_pos, cm_map = get_rmap1240k(f1000, snp_ids=i2)


Loaded 1149314 variants
Loaded 4558 individuals
['AD', 'GT']
['AA', 'AF', 'AFR_AF', 'ALT', 'AMR_AF', 'CHROM', 'EAS_AF', 'EUR_AF', 'ID', 'MAP', 'POS', 'REF', 'SAS_AF']
HDF5 loaded from ../../ancient-sardinia/output/h5/mod_reich_sardinia_ancients_mrg_dedup_3trm_anno.h5

Loaded 34431 variants
Loaded 503 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'MAP', 'POS', 'QUAL', 'REF']
HDF5 loaded from ../Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr15.hdf5

Intersection on Positions: 34427
Nr of Matching Refs: 34284 / 34427
Full Intersection Ref/Alt Identical: 34247 / 34427


## Modern Sardinian

In [26]:
# NOTE(review): the folder name hard-codes "Chr20", while the chromosome that is
# actually loaded is set through `ch` in the data loading cell -- confirm they agree.
folder = "../Empirical/Sard_Chr20_1000G_ROH2/"         # Which folder to save into
id_obs = 3043                 # Index of the observed individual in fs
ids_ref = np.arange(503)      # All 503 EUR Samples as Reference (save_haplo uses their first haplotype)
markers = np.arange(0, 28863) # Which Markers to Slice out (positions within i1 / i2)

### Do Downsampling if needed
#sample = np.random.binomial(1, 0.5, size=len(markers)).astype("bool") # Boolean Sample Vector
#markers = markers[sample]
markers_obs = i1[markers]     # Variant indices in the Sardinian HDF5
markers_ref = i2[markers]     # Variant indices in the Reference HDF5

# The map is stored per variant of f1000, so it has to be indexed with the
# reference variant indices (not with the positions within i1 / i2)
r_map = np.array(f1000["variants/MAP"])[markers_ref]

save_haplo(folder, f1000, fs, ids_ref, id_obs, 
               markers_ref, markers_obs, r_map, error_rate=0.0)

Extraction Complete!
Markers called 13573 / 28863
Nr of Markers used: 13573
Nr of individuals saved: 503
Successfully saved to ../Empirical/Sard_Chr20_1000G_ROH2/


## Ancient Individual

In [25]:
iid = "I0413"

folder  = "../Empirical/I0413_I0413_1000G_ROH/"  # Which folder to save into

# Row of the individual in the meta data (first match). The loading cell checks
# that meta_df has one row per individual of fs, so this is also its index in fs
id_obs = np.where(meta_df["iid"] == iid)[0][0]

ids_ref = np.arange(503)  # All 503 EUR Samples as Reference (save_haplo uses their first haplotype)
markers = np.arange(5000, len(i1)) # Which Markers to Slice out (positions within i1 / i2)

### Do Downsampling if needed
#sample = np.random.binomial(1, 0.5, size=len(markers)).astype("bool") # Boolean Sample Vector
#markers = markers[sample]
markers_obs = i1[markers]     # Variant indices in the Sardinian HDF5
markers_ref = i2[markers]     # Variant indices in the Reference HDF5

# The map is stored per variant of f1000, so it has to be indexed with the
# reference variant indices (not with the positions within i1 / i2)
r_map = np.array(f1000["variants/MAP"])[markers_ref]

save_haplo(folder, f1000, fs, ids_ref, id_obs, 
               markers_ref, markers_obs, r_map, error_rate=0.0)

Extraction Complete!
Markers called 17792 / 29247
Nr of Markers used: 17792
Nr of individuals saved: 503
Successfully saved to ../Empirical/I0413_I0413_1000G_ROH/


# Area 51

In [26]:
# Show the meta data row(s) of individual I0413
is_target = meta_df["iid"] == "I0413"
meta_df.loc[is_target]

Unnamed: 0,iid,label,lat,lon,x_contam,mt_contam,age,study,clst_alt,period_alt,include_alt,clst,mean_cov,med_cov,n_cov_snp,full_iid
1002,I0413,Europe_EN,42.5,0.5,,,7138.0,Lazaridis et al. 2016,Europe,EN,1.0,Iberia-EN,1.729239,1.0,678257.0,I0413


In [15]:
# Shape of the genotype slice: first 20 variants of individual 10, both haplotypes.
# (The closing parenthesis was missing. The output recorded below is the slice itself, not its shape.)
np.shape(fs["calldata/GT"][:20, 10, :])

array([[ 1,  1],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [-1, -1],
       [ 1,  1],
       [ 1,  1],
       [-1, -1],
       [ 0,  0],
       [ 1,  1],
       [ 0,  0],
       [ 1,  1],
       [ 0,  0],
       [ 0,  0],
       [ 1,  1],
       [ 0,  0],
       [ 0,  0],
       [ 1,  1],
       [ 0,  0],
       [ 1,  1]], dtype=int8)