# Prepare 1240k Coverage Means
Calculate the proportionality factors (relative mean coverage per site) for the 1240k SNPs.
To do this, load all the ancient individuals of the ancient-Sardinia dataset and process them.

Save the resulting table as a CSV (with pandas), one row per marker: position, REF, ALT, chromosome and normalized mean coverage.

In [28]:
import h5py
import numpy as np
import pandas as pd
import socket
import os as os

### Select the project root for this machine (cluster vs. home).
### Only Harald's laptop is supported; any other host aborts right away.
if socket.gethostname() != "VioletQueen":
    raise RuntimeError("Run on Harald's laptop!!")

path = "/home/harald/git/HAPSBURG/"   # Project root on Harald's machine

## Step 1: Load ancient 1240k Data

In [5]:
####### Load all the necessary files
### Set the right paths
ancsard_git_path = "../../../ancient-sardinia/"  # Where to find the Ancient-Sardinia git folder

h5path = ancsard_git_path + "output/h5_rev/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"
meta_path = ancsard_git_path + "output/meta/meta_rev_final.csv"
save_snp_inds_path = ancsard_git_path + "output/snp_filter/snps_okay_ancients.npy"
save_params_path = ancsard_git_path + "output/snp_filter/snps_filter_cutoffs.csv"
param_path = ancsard_git_path + "data/meta/parameters/ancsard_params.csv"

#### Load the key data
# The HDF5 handle is kept open: the datasets are read lazily from `f` further down.
f = h5py.File(h5path, "r")
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))

meta_df = pd.read_csv(meta_path)
gt_shape = np.shape(f["calldata/GT"])
print(len(meta_df))
print(gt_shape)

# Sanity check: the meta table needs exactly one row per sample,
# where samples are the second axis of calldata/GT.
assert len(meta_df) == gt_shape[1], (
    f"Meta table has {len(meta_df)} rows but the HDF5 holds {gt_shape[1]} samples"
)

#######################################################
################################### Load the Parameters
df_prs = pd.read_csv(param_path)
# Positional access to the first row (does not depend on the index labels)
anc_sardind = df_prs["n_ancsard"].iloc[0]     # The highest ancsard index
anc_ind = df_prs["n_anc"].iloc[0]

print(f"Nr Ancient Sardinians: {anc_sardind}")
print(f"Nr Ancient Samples: {anc_ind}")

['AD', 'GT']
['AA', 'AF', 'AFR_AF', 'ALT', 'AMR_AF', 'CHROM', 'EAS_AF', 'EUR_AF', 'ID', 'POS', 'REF', 'SAS_AF']
4616
(1145647, 4616, 2)
Nr Ancient Sardinians: 85
Nr Ancient Samples: 1087


## Step 2: Calculate Mean Coverage per Locus

In [7]:
def give_avg_depths(f, anc_ind, add_ancients=30):
    """Return the mean read depth per site and a mask of the sites kept.

    f: HDF5 file (anything indexable with "calldata/AD" that yields a
       3-d array sliceable as [site, individual, allele]).
    anc_ind: Number of all ancient individuals.
    add_ancients: Number of African ancient individuals (which have
       slightly different coverage). Only the first
       `anc_ind - add_ancients` individuals enter the average
       (presumably the excluded ones are stored last among the
       ancients -- confirm against the sample order).

    Returns (depths, covered): `depths` holds the mean depth of every
    site whose mean is not negative; `covered` is the boolean mask over
    all sites that marks those entries.
    """
    allele_depths = f["calldata/AD"]    # Allele read depths (not genotypes)
    n_used = anc_ind - add_ancients

    # Average over individuals first -> one value per site and allele ...
    per_allele_mean = np.mean(allele_depths[:, :n_used, :], axis=1)
    # ... then over the last axis -> one value per site
    site_mean = np.mean(per_allele_mean, axis=1)

    ### Drop the markers without coverage (negative mean depth)
    covered = ~(site_mean < 0)
    return site_mean[covered], covered

In [8]:
%%time
# Mean depth per retained site, plus the boolean mask of retained sites
depths, depth_idcs = give_avg_depths(f, anc_ind)
n_loci = np.sum(depth_idcs)  # summing the boolean mask counts the retained sites
print(f"Loaded {n_loci} Loci Coverages")

Loaded 1144809 Loci Coverages
CPU times: user 33.7 s, sys: 1.61 s, total: 35.3 s
Wall time: 36.6 s


## Step 3: Prepare the Data Table and Save

In [11]:
# Expand the retained depths back to one value per site; masked-out sites stay 0
all_depths = np.zeros(depth_idcs.shape[0])
all_depths[depth_idcs] = depths

# Divide by the overall mean (zero entries included) to get proportionality constants
mean_depth = all_depths.mean()
norm_depths = all_depths / mean_depth

In [19]:
# Per-variant annotation datasets from the HDF5 file
ch = f["variants/CHROM"]
alts = f["variants/ALT"]
refs = f["variants/REF"]
pos = f["variants/POS"]

# One row per variant; the key order below fixes the column order of the table
df = pd.DataFrame(
    {
        "Pos": pos,
        "Alt": alts,
        "Ref": refs,
        "Lambda": norm_depths,
        "Ch": ch,
    }
)

In [31]:
### Save the Dataframe
savepath = "../../Data/1000Genomes/Coverage/mean_cov1240k_Marcus.csv"  # Where to save the df to

### Prepare the folder
savefolder = os.path.dirname(savepath)
if savefolder:  # dirname is "" for a bare file name; nothing to create in that case
    if not os.path.exists(savefolder):
        print(f"Creating new path: {savefolder}")
    # exist_ok=True avoids a failure if the folder appears between the check and the creation
    os.makedirs(savefolder, exist_ok=True)

# Write the table without the row index
df.to_csv(savepath, index=False)
print(f"Successfully saved to {savepath}")

Successfully saved to ../../Data/1000Genomes/Coverage/mean_cov1240k_Marcus.csv


# Area 51

In [26]:
# Scratch check: median of the "Lambda" column of df
np.median(df["Lambda"])

0.6685279563213835