# Run IBD on Punic Sample (including non-located individuals)

In [1]:
import numpy as np
import pandas as pd
import os
import sys as sys
import socket
import matplotlib.pyplot as plt
import multiprocessing as mp
import itertools as it
#from adjustText import adjust_text

import warnings
# Blanket filter: silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Identify the host; only machines whose name starts with "compute-" are accepted.
socket_name = socket.gethostname()
print(socket_name)

if not socket_name.startswith("compute-"):
    raise RuntimeWarning("No compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # project root; relative paths used later resolve against it

# Work from the project root, then report the working directory and the CPU count.
os.chdir(path)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

# Put the development checkout first on sys.path so it is found before any
# other copy of the package, then import the ancIBD entry points.
sys.path.insert(0, "/n/groups/reich/hringbauer/git/hapBLOCK/package/")
from ancIBD.run import hapBLOCK_chroms
from ancIBD.IO.batch_run import get_run_params_from_i, save_ibd_df
from ancIBD.IO.ind_ibd import create_ind_ibd_df, ind_all_ibd_df

compute-e-16-229.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28


## 1) Get the individuals to run

In [2]:
min_snp = 600000  # strict lower bound applied to the "n_cov_snp" column below

# Annotation table with the meta data of all individuals
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v54.1.anno.csv")
print(f"Loaded {len(df_meta)} Indiviuals for meta data")

# Cluster assignments, restricted to rows whose label contains "Punic"
df_pun = (
    pd.read_csv("./data/cluster_assignments_punic.v54.1.tsv", sep="\t")
    .loc[lambda tab: tab["label"].str.contains("Punic")]
    .copy()
)
print(f"\nLoaded {len(df_pun)} Punic Indiviudals")

# Attach the meta data columns via the shared "iid" column (default inner join)
df_p = df_pun.merge(df_meta, on="iid")
print(f"Merged in meta to {len(df_p)} Punic Indivdiuals")

# Keep individuals above the coverage threshold; their ids are the input of the IBD run
df_ibd_iid = df_p.loc[df_p["n_cov_snp"].gt(min_snp)].reset_index(drop=True)
iids = df_ibd_iid["iid"].values
print(f"{len(df_ibd_iid)} Individuals with >{min_snp} 1240k SNPs covered")

Loaded 33967 Indiviuals for meta data

Loaded 138 Punic Indiviudals
Merged in meta to 138 Punic Indivdiuals
81 Individuals with >600000 1240k SNPs covered


In [30]:
# Number of coverage-filtered individuals per value of the "location" column
df_ibd_iid["location"].value_counts()

Kerkouene     22
Tharros        9
Birgi          9
Cap Bon        8
Carthage       7
Lilybaeum      5
Motya          5
Eivissa        4
Malaga         4
Selinunte      3
Villaricos     3
Akhziv         1
Cadiz          1
Name: location, dtype: int64

## 2) Run the ancIBD IBD Inference

In [5]:
%%time
version = "v54.1"
folder_in =  f"/n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_{version}/ch" # for hdf5
ch = 20
path_ibd = f'/n/groups/reich/hringbauer/git/punic_aDNA/output/ibd/{version}.ch{ch}.tsv'
path_ibd

df_ibd = hapBLOCK_chroms(folder_in=folder_in,
                         iids=iids[:], run_iids=[],
                         ch=ch, folder_out="",
                         output=False, prefix_out='', logfile=False,
                         l_model='h5', e_model='haploid_gl2', h_model='FiveStateScaled', 
                         t_model='standard', p_col="variants/AF_ALL",
                         ibd_in=1, ibd_out=10, ibd_jump=400,
                         min_cm=6, cutoff_post=0.99, max_gap=0.0075,
                         processes=1)

save_ibd_df(df_ibd, savepath=path_ibd, create=False)

Saved 28 IBD blocks.
CPU times: user 34.4 s, sys: 337 ms, total: 34.8 s
Wall time: 35.7 s


In [8]:
# Same ancIBD call for chromosomes 1-22; one output table is saved per chromosome.
# The release tag and the hdf5 path prefix do not depend on the chromosome.
version = "v54.1"
folder_in = f"/n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_{version}/ch"  # for hdf5

for ch in range(1, 23):
    print(f"Running ch: {ch}")
    path_ibd = f'/n/groups/reich/hringbauer/git/punic_aDNA/output/ibd/{version}.ch{ch}.tsv'

    df_ibd = hapBLOCK_chroms(
        folder_in=folder_in,
        iids=iids[:],
        run_iids=[],
        ch=ch,
        folder_out="",
        output=False,
        prefix_out='',
        logfile=False,
        l_model='h5',
        e_model='haploid_gl2',
        h_model='FiveStateScaled',
        t_model='standard',
        p_col="variants/AF_ALL",
        ibd_in=1,
        ibd_out=10,
        ibd_jump=400,
        min_cm=6,
        cutoff_post=0.99,
        max_gap=0.0075,
        processes=1,
    )

    save_ibd_df(df_ibd, savepath=path_ibd, create=False)

Running ch: 1
Saved 574 IBD blocks.
Running ch: 2
Saved 76 IBD blocks.
Running ch: 3
Saved 47 IBD blocks.
Running ch: 4
Saved 50 IBD blocks.
Running ch: 5
Saved 37 IBD blocks.
Running ch: 6
Saved 39 IBD blocks.
Running ch: 7
Saved 56 IBD blocks.
Running ch: 8
Saved 58 IBD blocks.
Running ch: 9
Saved 40 IBD blocks.
Running ch: 10
Saved 75 IBD blocks.
Running ch: 11
Saved 31 IBD blocks.
Running ch: 12
Saved 38 IBD blocks.
Running ch: 13
Saved 28 IBD blocks.
Running ch: 14
Saved 70 IBD blocks.
Running ch: 15
Saved 360 IBD blocks.
Running ch: 16
Saved 30 IBD blocks.
Running ch: 17
Saved 45 IBD blocks.
Running ch: 18
Saved 36 IBD blocks.
Running ch: 19
Saved 43 IBD blocks.
Running ch: 20
Saved 28 IBD blocks.
Running ch: 21
Saved 113 IBD blocks.
Running ch: 22
Saved 33 IBD blocks.


# Post-process IBD Run

In [11]:
from ancIBD.IO.ind_ibd import create_ind_ibd_df, combine_all_chroms

In [13]:
# Merge the per-chromosome IBD tables into one file.
# `version` is the release tag assigned in the IBD run cells.
ibd_chrom_prefix = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/ibd/{version}.ch"
ibd_all_path = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/ibd/{version}.ch_all.tsv"
combine_all_chroms(folder_base=ibd_chrom_prefix, path_save=ibd_all_path)

Chromosome 1; Loaded 574 IBD
Chromosome 2; Loaded 76 IBD
Chromosome 3; Loaded 47 IBD
Chromosome 4; Loaded 50 IBD
Chromosome 5; Loaded 37 IBD
Chromosome 6; Loaded 39 IBD
Chromosome 7; Loaded 56 IBD
Chromosome 8; Loaded 58 IBD
Chromosome 9; Loaded 40 IBD
Chromosome 10; Loaded 75 IBD
Chromosome 11; Loaded 31 IBD
Chromosome 12; Loaded 38 IBD
Chromosome 13; Loaded 28 IBD
Chromosome 14; Loaded 70 IBD
Chromosome 15; Loaded 360 IBD
Chromosome 16; Loaded 30 IBD
Chromosome 17; Loaded 45 IBD
Chromosome 18; Loaded 36 IBD
Chromosome 19; Loaded 43 IBD
Chromosome 20; Loaded 28 IBD
Chromosome 21; Loaded 113 IBD
Chromosome 22; Loaded 33 IBD
Saved 1907 IBD to /n/groups/reich/hringbauer/git/punic_aDNA/output/ibd/v54.1.ch_all.tsv.


In [14]:
%%time

### Takes about 1 min
df_res = create_ind_ibd_df(ibd_data = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/ibd/{version}.ch_all.tsv",
                      min_cms = [8, 12, 16, 20], snp_cm = 220, min_cm = 5, sort_col = 0,
                      savepath = f"/n/groups/reich/hringbauer/git/punic_aDNA/output/ibd/{version}.ibd_ind.d220.tsv")

> 5 cM: 1907/1907
Of these with suff. SNPs per cM> 220:               600/1907
2     59
1     55
3     41
7     40
6     38
10    34
4     32
5     31
11    31
9     30
8     30
12    27
13    21
17    21
14    18
21    17
16    17
18    17
20    17
15    16
19     4
22     4
Name: ch, dtype: int64
Saved 245 individual IBD pairs to: /n/groups/reich/hringbauer/git/punic_aDNA/output/ibd/v54.1.ibd_ind.d220.tsv
CPU times: user 566 ms, sys: 7.03 ms, total: 573 ms
Wall time: 598 ms


# Area 51
Try out code here