# Code to process the Ceballos samples

In [33]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
#from mpl_toolkits.basemap import Basemap

import socket
import os as os
import sys as sys
import multiprocessing as mp

### Use Arial for all figure text
from matplotlib import rcParams
rcParams.update({
    'font.family': 'sans-serif',   # Default font family
    'font.sans-serif': ['Arial'],  # The font has to be installed (it is on cluster for Harald)
})

### To do lowess smoothing
#import statsmodels.api as sm
#lowess = sm.nonparametric.lowess

### Pick the HAPSBURG root folder depending on the machine the notebook runs on
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
elif socket_name.startswith("Harald-Laptop"):
    print("Harald's laptop detected.")
    path = "/home/hringbauer/git/HAPSBURG/"  # The Path on Harald's laptop
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # All relative paths below are relative to the HAPSBURG root
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root folder
print(f"CPU Count: {mp.cpu_count()}")

# Append Hapsburg Folder (only once, so re-running this cell does not grow sys.path)
if "./package/hapsburg" not in sys.path:
    sys.path.append("./package/hapsburg")
from PackagesSupport.roh_expectations import Expected_Roh

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [34]:
def load_df_ceballos(path_ceballos="./Data/Ceballos/ROH500_cM_het0_v3.csv"):
    """Read a Ceballos ROH table and convert it to the column layout used here.
    path_ceballos: Path of the .csv file. It needs the columns
    POS1, POS2, cM.1, cM.2, IID and CHR.
    Return dataframe with columns Start, End, StartM, EndM, iid, ch,
    length (End - Start) and lengthM (EndM - StartM), where the two
    map positions are the input cM values divided by 100."""
    raw = pd.read_csv(path_ceballos)
    n_inds = len(set(raw["IID"]))
    print(f"Loaded {len(raw)} ROH from {n_inds} Individuals")

    # Input column -> output column, in output order
    new_names = {"POS1": "Start", "POS2": "End", "cM.1": "StartM",
                 "cM.2": "EndM", "IID": "iid", "CHR": "ch"}
    roh = raw[list(new_names)].rename(columns=new_names)
    roh = roh.assign(length=roh["End"] - roh["Start"],
                     EndM=roh["EndM"] / 100,
                     StartM=roh["StartM"] / 100)
    roh["lengthM"] = roh["EndM"] - roh["StartM"]
    return roh

def save_roh_to_ch(df, path="./Empirical/ceballos/Ust_Ishim/chr", ch=3, 
                   file="roh.csv", suffix="", output=True):
    """Write the ROH of a single chromosome to <path><ch><suffix>/<file>.
    df: Dataframe of ROH with a column ch
    path: The path without chromosomes number
    ch: Chromosome whose rows get saved
    file: Name of the .csv file written into the folder
    suffix: String appended to the folder after the chromosome number
    output: Whether to print what gets created and saved"""
    out_dir = path + str(ch) + suffix
    target = os.path.join(out_dir, file)

    # Create the output folder on first use
    dir_missing = not os.path.exists(out_dir)
    if dir_missing and output:
        print(f"Creating Path {out_dir}...")
    if dir_missing:
        os.makedirs(out_dir)

    # Keep only the rows of the requested chromosome and write them
    on_chrom = df.loc[df["ch"] == ch].copy()
    on_chrom.to_csv(target, sep=",", index=False)
    if output:
        print(f"Saved {len(on_chrom)} ROH to {target}")
    
def get_ceballos_iids(df, col_iid="iid"):
    """Given dataframe in Ceballos format, extract the pure iids,
    i.e. the part after the last "/" of each entry (a file
    extension such as ".bam" is kept).
    df: Dataframe with one row per ROH
    col_iid: Name of the column that holds the (path-like) iids
    Return list of the unique iids, in order of first appearance"""
    short_ids = df[col_iid].str.split("/").str[-1]
    # dict.fromkeys removes duplicates like set() but keeps a reproducible order
    return list(dict.fromkeys(short_ids))

def create_ceballos_folders(path, folder_out="./Empirical/ceballos/",
                           chromosomes=False, output=False):
    """Split a Ceballos ROH table into output files per individual.
    path: Path of the Ceballos .csv (loaded with load_df_ceballos)
    folder_out: Where to save. With chromosomes=True it is used as plain
    string prefix, so it has to end with a path separator
    chromosomes: If True, save <folder_out><iid>/chr<ch>/roh.csv for
    chromosomes 1-22. If False, save one <folder_out>/<iid>_roh_full.csv
    per individual
    output: Whether to print which folders and files get written"""
    df1 = load_df_ceballos(path_ceballos=path)
    n_full = len(set(df1["iid"]))
    
    iids = get_ceballos_iids(df1)
    # Sanity check: shortening the iids must not merge different individuals
    assert len(iids) == n_full, "Shortened iids are not unique"
    print(f"Found {len(iids)} unique iids")
    
    # Short iid of every row (part after the last "/", same rule as in
    # get_ceballos_iids). Rows are selected by equality on it: a substring
    # or regex match on the full path could pick up rows of other individuals
    short_ids = df1["iid"].str.split("/").str[-1]
    
    if not chromosomes and not os.path.exists(folder_out):
        if output:
            print(f"Creating Path {folder_out}...")
        os.makedirs(folder_out)
    
    for iid in iids:
        df2 = df1[short_ids == iid]
        
        if chromosomes:
            for ch in range(1,23):
                save_roh_to_ch(df2, path=f"{folder_out}{iid}/chr", 
                               ch=ch, file="roh.csv", suffix="", output=output) # suffix="/e01/"
        else:
            save_path = os.path.join(folder_out, iid + "_roh_full.csv")
            df2.to_csv(save_path, sep=",", index=False)
            if output:
                print(f"Saved {len(df2)} ROH to {save_path}")

# Extract and save single Individual

In [40]:
### Load and convert dataframe of Ceballos
# load_df_ceballos renames the columns to Start/End/StartM/EndM/iid/ch and
# divides the cM positions by 100. Keep exactly one of the tables below active;
# df1 is used by the cells further down.
df1 = load_df_ceballos(path_ceballos="./Data/Ceballos/ROH500_cM_het1_v3.csv")
#df1 = load_df_ceballos(path_ceballos="./Data/Ceballos/ROH500_cM_het0_v3.csv")
#df1 = load_df_ceballos(path_ceballos="./Data/Ceballos/ROH500_cM_het1_4samples.csv")
# I6671, I2521, R7 and Villabruna (presumably the samples of the 4samples table - TODO confirm)

Loaded 65326 ROH from 505 Individuals


### Extract all Individuals from one table

In [36]:
# chromosomes=False: write one "<iid>_roh_full.csv" per individual into folder_out
# (no per-chromosome subfolders)
create_ceballos_folders(path="./Data/Ceballos/ROH500_cM_het0_v3.csv",
                        folder_out="./Empirical/ceballos/het0/", chromosomes=False)

Loaded 28149 ROH from 499 Individuals
Found 499 unique iids


## Extract single Individual
Done for QC and checking. Skippable.

In [42]:
iid = "R2.bam"
# Literal substring match: regex=False keeps the "." in the iid from acting as
# a regex wildcard. Every row whose path contains the iid anywhere is still
# selected, so check that only the intended individual shows up in df2.
df2 = df1[df1["iid"].str.contains(iid, regex=False)]

In [15]:
### Copy over ROH calls from hapROH
# Relies on the global `iid` of the selected individual; the $folder_from and
# $folder_to in the shell line are filled in from the Python variables.
# NOTE(review): if folder_to already exists, `cp -r` presumably copies the
# source folder into it rather than over it - confirm before re-running.
folder_from = f"./Empirical/1240k/MarcusAncs/{iid}/"
folder_to = f"./Empirical/ceballos/{iid}/"
!cp -r $folder_from $folder_to

In [None]:
## Copy in ROH calls from Ceballos
# For every chromosome 1-22 the rows of df2 on that chromosome are written to
# ./Empirical/ceballos/<iid>/chr<ch>/e01/roh.csv (folder is created if missing)
for ch in range(1,23):
    save_roh_to_ch(df2, path=f"./Empirical/ceballos/{iid}/chr", 
                   ch=ch, file="roh.csv", suffix="/e01/")

In [None]:
# ROH of the selected individual on chromosome 4
df2.loc[df2["ch"] == 4]

In [43]:
### Summed length (lengthM) of all ROH longer than each threshold
for c in (0.04, 0.08, 0.12, 0.2):
    d = df2.loc[df2["lengthM"] > c, "lengthM"].sum()
    print(f"ROH>{c}: {d}")

ROH>0.04: 0.24180257989976892
ROH>0.08: 0.19589754925980893
ROH>0.12: 0.19589754925980893
ROH>0.2: 0.0


In [44]:
# All ROH of the selected individual with lengthM above 0.04
df2.loc[df2["lengthM"] > 0.04]

Unnamed: 0,Start,End,StartM,EndM,iid,ch,length,lengthM
33246,72600000,75700000,0.810773,0.856678,/mnt/NEOGENE1/share/dna/hsa/trimmedbams/R2.bam...,5,3100000,0.045905
33274,2100000,12100000,0.036957,0.232855,/mnt/NEOGENE1/share/dna/hsa/trimmedbams/R2.bam...,11,10000000,0.195898


# Area 51

In [13]:
#df_m = pd.read_csv("./Empirical/1240k/MarcusAncs/combined_roh05.csv", sep="\t")
# Tab-separated table; the cells below use its columns "iid" and "sum_roh>..."
df_m = pd.read_csv("./Empirical/roh_all_inds_final_v42.1.csv", sep="\t")

In [27]:
# Literal substring search (regex=False, so the "." is not a wildcard). It still
# returns every iid that contains the pattern, e.g. "BR2.SG" as well as "R2.SG".
df_m[df_m["iid"].str.contains("R2.SG", regex=False)]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
0,BR2.SG,Hungary_LBA.SG,5.221403,5.221403,1,0.0,0,0.0,0,0.0,...,19.95,3140.0,GambaNatureCommunications2014,Hungary_LBA.SG,19.164,1182644,1,Eastern Europe,blue,Agriculture
92,R2.SG,Italy_N.SG,18.837799,27.4036,3,18.837799,1,18.837799,1,0.0,...,13.54,7984.0,AntonioGaoMootsScience2019,Italy_N.SG,4.013658,1119222,1,Central Italy,blue,Agriculture


In [45]:
# Read the precomputed "sum_roh>X" columns of df_m for the first iid containing "I2521"
for c in [0.04,0.08,0.12,0.2]: 
    # round() instead of int(): a product such as 0.29*100 lands just below the
    # integer, and int() would truncate it to the wrong column name
    d = df_m[df_m["iid"].str.contains("I2521")][f"sum_roh>{round(c*100)}"].values[0]
    print(f"ROH>{c}: {d}")

ROH>0.04: 328.405093
ROH>0.08: 299.530998
ROH>0.12: 299.530998
ROH>0.2: 265.89269599999994


In [135]:
# iids of the 50 rows with the largest sum_roh>20
iids = df_m.sort_values(by="sum_roh>20", ascending=False)["iid"].head(50)

In [None]:
# Full rows of the 50 individuals with the largest sum_roh>20
df_m.sort_values(by="sum_roh>20", ascending=False).head(50)

In [None]:
# Reads the same file that is loaded as df_m above
df_g = pd.read_csv("./Empirical/roh_all_inds_final_v42.1.csv", sep="\t")
# NOTE(review): the result of this filter is neither stored nor the last
# expression of the cell, so it is never shown - only df_g.head(3) is displayed
df_g[df_g["iid"].str.contains("Kosten")]
df_g.head(3)

In [None]:
### Test whether Ceballos Individuals are covered
# For each of the top iids print how many Ceballos ROH rows contain that iid.
# regex=False: iids such as "R2.SG" contain ".", which would otherwise match any character.
# Note: this loop overwrites the globals `iid` and `df2` used by the cells above.
for iid in iids:
    print(iid)
    df2 = df1[df1["iid"].str.contains(iid, regex=False)]
    print(len(df2))