In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import gridspec
import socket
import os as os
import sys as sys
import multiprocessing as mp
import matplotlib.colors as cls
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection

# Pick the project root from the hostname; only the two machines listed below are supported.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: abort here rather than continue with `path` undefined.
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default); the relative paths below depend on it
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root directory
print(f"CPU Count: {mp.cpu_count()}")

### To load Eigenstrat
# This relative entry only resolves because of the os.chdir(path) above.
sys.path.append("./PackagesSupport/loadEigenstrat/")  # Since now we are in the Root Directory
from loadEigenstrat import EigenstratLoad, load_eigenstrat

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [28]:
def extract_only_eigenstrat(df, path_es, packed=False, sep="\t"):
    """Keep only the rows of df whose iid appears in the Eigenstrat individual table.

    df: Dataframe with an "iid" column.
    path_es: Eigenstrat basepath, handed to load_eigenstrat as base_path.
    packed, sep: Forwarded unchanged to load_eigenstrat.
    Prints the number of matching rows against the number of Eigenstrat
    individuals and returns the matching rows of df."""
    eigenstrat = load_eigenstrat(base_path=path_es, packed=packed, sep=sep)
    ind_table = eigenstrat.df_ind  # Individual table of the Eigenstrat data
    in_eigenstrat = df["iid"].isin(ind_table["iid"])  # Boolean mask over the rows of df
    n_found, n_ind = np.sum(in_eigenstrat), len(ind_table)
    print(f"Found {n_found}/{n_ind} Individuals of Eigenstrat")
    return df[in_eigenstrat]

In [14]:
# Input: raw metadata table (CSV) that is loaded and reformatted in the next cell
path_meta_raw = "./Data/Freilich20/ancient.croatia.metadata.csv"
# Eigenstrat basepath; only individuals found in its .ind file are kept
path_es = "./Data/Freilich20/AncCroatia1240KallSNPs"
# Output: processed metadata CSV; an empty string skips saving
save_path = "./Data/Freilich20/meta_processed.csv"

In [30]:
# Load the raw metadata table of the study
df_raw = pd.read_csv(path_meta_raw)
print(f"Loaded {len(df_raw)} Individuals from {path_meta_raw}")

# Raw column name -> column name of the processed table.
# Keep the trailing blanks in the dating key: rename only matches exact column names.
rename_dict = {
    "id": "iid",
    "calBCE/BP_or_contextual dating_(BCE)  ": "age_range",
    "cluster": "clst",
    "SNPs_overlapping_1240K": "n_cov_snp",
}

# Rename the columns and append the constant / derived columns in one chain
df_raw = df_raw.rename(columns=rename_dict).assign(
    study="Freilich20",
    include_alt=1,
    mean_cov=lambda d: d["n_cov_snp"] / 1.2e6,  # number of covered SNPs divided by 1.2e6
    age=np.nan,  ### Implement this
)

# Select the output columns in their final order, then keep only
# the individuals that are present in the Eigenstrat data
df_full = df_raw[
    ["iid", "lat", "lon", "age", "age_range",
     "study", "clst", "mean_cov", "n_cov_snp", "include_alt"]
]
df_full = extract_only_eigenstrat(df_full, path_es=path_es)

# An empty save_path skips writing the processed table
if save_path:
    df_full.to_csv(save_path, index=None)  # sep='\t' would not work with downstream analysis
    print(f"Saved {len(df_full)} Individuals to {save_path}")

Loaded 28 Individuals from ./Data/Freilich20/ancient.croatia.metadata.csv
3 Eigenstrat Files with 28 Individuals and 1233013 SNPs
Found 28/28 Individuals of Eigenstrat
Saved 28 Individuals to ./Data/Freilich20/meta_processed.csv


# Area 51

In [31]:
# Scratch check: display the processed metadata table
df_full

Unnamed: 0,iid,lat,lon,age,age_range,study,clst,mean_cov,n_cov_snp,include_alt
0,ZEM02,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.713307,855968,1
1,ZEM04,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.73683,884196,1
2,ZEM05,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.658676,790411,1
3,ZEM06,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.787207,944648,1
4,ZEM07,45.747,18.57,,4790-4558 calBCE,Freilich20,Croatia_MN,0.718101,861721,1
5,ZEM08,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.67163,805956,1
6,ZEM09,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.723044,867653,1
7,ZEM11,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.721572,865887,1
8,ZEM12,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.701008,841209,1
9,ZEM13,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.666721,800065,1
