# Create a Linkage Map and Append It to the Sardinia HDF5

In [1]:
import allel
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d

In [6]:
### Load Recombination Map
h5_path_sard = "../../../ancient-sardinia/output/h5/mod_reich_sardinia_ancients_mrg_dedup_3trm_anno.h5"
path_snp =     "../../Data/1000Genomes/Markers/MinMyc.snp" # All SNPs found in the 1240k Ancient Panel

### Load HDF5
# The handle is deliberately left open: the cells below read from `f` and
# close it explicitly before the file is reopened for writing.
f = h5py.File(h5_path_sard, "r")
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))

### Load the SNP table (whitespace-separated, no header line)
# The separator has to be r"\s+": r"\s*" also matches the empty string, and
# since Python 3.7 re.split() splits on empty matches, which would break
# every line into single characters with the python parsing engine.
df_snp = pd.read_csv(path_snp, header=None, sep=r"\s+", engine="python")
df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
print(f"Loaded {len(df_snp)} SNPs.")

['AD', 'GT']
['AA', 'AF', 'AFR_AF', 'ALT', 'AMR_AF', 'CHROM', 'EAS_AF', 'EUR_AF', 'ID', 'MAP', 'POS', 'REF', 'SAS_AF']
Loaded 1233013 SNPs.


In [3]:
def return_lmap(f, df_snp, chs=None):
    """Port the linkage map from df_snp onto the SNPs stored in f.

    For each requested chromosome, the positions present in both f and
    df_snp serve as anchors; the map value of every SNP of f on that
    chromosome is obtained by linear interpolation between these anchors.

    Parameters
    ----------
    f : mapping (e.g. an open h5py.File)
        Must provide "variants/POS" and "variants/CHROM", both convertible
        to integer arrays. The two are expected to be aligned per SNP.
    df_snp : pandas.DataFrame
        SNP table with (at least) the columns "chr", "pos" and "map".
    chs : sequence of int, optional
        Chromosomes to process. None or an empty sequence selects 1..22.

    Returns
    -------
    numpy.ndarray
        One float map value per SNP of f, in the order of "variants/POS".

    Raises
    ------
    AssertionError
        If any SNP is left without a map value, e.g. because it lies on a
        chromosome that was not processed.
    ValueError
        Propagated from interp1d, e.g. when a chromosome has SNPs in f but
        no anchors, or when a position lies outside the anchor range.
    """
    pos = np.array(f["variants/POS"]).astype("int")  # Load into memory once to be faster
    chroms = np.array(f["variants/CHROM"]).astype("int")
    maps = -np.ones(len(pos))  # -1 marks "not filled yet"

    if chs is None or len(chs) == 0:
        chs = np.arange(1, 23)

    for ch in chs:
        print(f"\nDoing Chromosome {ch}")
        ids_ch = np.where(chroms == ch)[0]  # Remember the original indices
        pos_t = pos[ids_ch]
        print(f"SNPs in HDF 5: {len(pos_t)}")

        if len(ids_ch) == 0:
            # Nothing to fill for this chromosome; interp1d would otherwise
            # fail on the empty anchor arrays.
            continue

        df_t = df_snp[df_snp["chr"] == ch]
        print(f"SNPs from Linkage Map: {len(df_t)}")

        ### Find intersecting positions
        # i1 indexes into pos_t, i2 into the SNP table of this chromosome.
        _, i1, i2 = np.intersect1d(pos_t, df_t["pos"].values, return_indices=True)
        print(f"Length Intersection: {len(i1)}")

        ### Interpolate between the shared positions
        itpld = interp1d(pos_t[i1], df_t["map"].values[i2], kind='linear')
        maps[ids_ch] = itpld(pos_t)  # Fill in the values at their original indices

    # Sanity check: every SNP must have received a value. Raised explicitly
    # (rather than via `assert`) so it is not stripped when running with -O.
    if not np.all(maps > -1):
        raise AssertionError("Some SNPs were not assigned a map position.")
    return maps

# Port Linkage Map to HDF5
### Be careful - the next cell modifies the HDF5 file in place

In [5]:
# Compute the map while the read-only handle `f` is still open.
maps = return_lmap(f, df_snp)
print("Finished Creating Map")

# One map value per SNP in the file is required before anything is written.
l = len(maps)
assert l == len(f["variants/POS"])

f.close()  # Release the read-only handle before reopening the file for writing.

with h5py.File(h5_path_sard, 'a') as f0:
    group = f0["variants"]
    # require_dataset() reuses an existing, compatible "MAP" dataset instead
    # of failing like create_dataset() does, so the cell can be re-run.
    dset = group.require_dataset('MAP', shape=(l,), dtype='f')
    dset[:] = maps

print("Finished Modification")


Doing Chromosome 1
SNPs in HDF 5: 93073
SNPs from Linkage Map: 93166
Length Intersection: 92953

Doing Chromosome 2
SNPs in HDF 5: 98549
SNPs from Linkage Map: 98657
Length Intersection: 98424

Doing Chromosome 3
SNPs in HDF 5: 81310
SNPs from Linkage Map: 81416
Length Intersection: 81215

Doing Chromosome 4
SNPs in HDF 5: 71540
SNPs from Linkage Map: 71634
Length Intersection: 71456

Doing Chromosome 5
SNPs in HDF 5: 73915
SNPs from Linkage Map: 74004
Length Intersection: 73829

Doing Chromosome 6
SNPs in HDF 5: 78784
SNPs from Linkage Map: 78867
Length Intersection: 78708

Doing Chromosome 7
SNPs in HDF 5: 62556
SNPs from Linkage Map: 62595
Length Intersection: 62454

Doing Chromosome 8
SNPs in HDF 5: 63829
SNPs from Linkage Map: 63916
Length Intersection: 63760

Doing Chromosome 9
SNPs in HDF 5: 52706
SNPs from Linkage Map: 52765
Length Intersection: 52649

Doing Chromosome 10
SNPs in HDF 5: 61038
SNPs from Linkage Map: 61131
Length Intersection: 60978

Doing Chromosome 11
SNPs in 

# Test
### Check whether everything worked

In [7]:
# Reopen the file read-only and list the fields of its two groups.
f = h5py.File(h5_path_sard, mode="r")

list(f.keys())  # Top-level group names; the result is discarded
for group_name in ("calldata", "variants"):
    print(list(f[group_name].keys()))

['AD', 'GT']
['AA', 'AF', 'AFR_AF', 'ALT', 'AMR_AF', 'CHROM', 'EAS_AF', 'EUR_AF', 'ID', 'MAP', 'POS', 'REF', 'SAS_AF']
