In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

### For Arial Font
from matplotlib import rcParams
# Prefer Arial within the sans-serif family and make sans-serif the default.
# The Arial font has to be installed on the machine for this to take effect.
rcParams['font.sans-serif'] = ['Arial']
rcParams['font.family'] = 'sans-serif'

# Identify the machine: only hosts whose name starts with "compute-" are supported
socket_name = socket.gethostname()
print(socket_name)

if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/y_chrom/"  # Project folder on the cluster

# Work from the project folder, so that relative paths such as
# ./scripts/pulldown and ./output resolve from there
os.chdir(path)
print(os.getcwd())  # Show the working directory now in effect
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-e-16-237.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/y_chrom
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


### Define Parameters for param file

In [16]:
def create_parfile_from_ref(df, iid,
                            par_dc, 
                            savepath = "/n/groups/reich/hringbauer/git/y_chrom/parfiles/",
                            outfolder="",
                            col_id = 'Version ID',
                            use_fullisogg=False):
    """Create a new parfile for one individual from reference dataframe df.

    df: Reference (annotation) dataframe in which the individual is looked up
    iid: What individual to prepare
    par_dc: Parameter dictionary used as template. It is copied, the
        dictionary of the caller is left unchanged
    savepath: Folder to save the new parfile to
    outfolder: If non-empty, folder that "readdepthname" is pointed to
        (takes precedence over the use_fullisogg output path)
    col_id: What column of df to use for matching iid
    use_fullisogg: If True, set "snpname" to the y_snps_all2020.snp file
        and "readdepthname" to the pulldowns_all folder
    Return: Path of the saved parfile"""
    # Work on a copy: the template is usually reused for many individuals,
    # and the overrides below must not carry over into later calls.
    par_dc = dict(par_dc)

    bam_path = extram_bam_df(df, iid=iid, col_id=col_id)

    if use_fullisogg:
        par_dc["snpname"] =  "/n/groups/reich/hringbauer/git/y_chrom/data/eigenstrat/y_snps_all2020.snp"
        par_dc["readdepthname"] = '/n/groups/reich/hringbauer/git/y_chrom/output/pulldowns_all/SAMP.txt'

    if os.path.isdir(bam_path):
        # The entry is a folder: point to the Y BAM inside of it
        bam_path = os.path.join(bam_path, "*Y.bam")
        print(bam_path)
    # A former fallback appended ".by_chr/*Y.bam" to non-folder paths
    # (hack for Ning 2020 samples). It is disabled, as it was unclear
    # how general it is.
    savepath = create_parfile(par_dc, sample=iid, bam_path=bam_path,
                              savepath=savepath, outfolder=outfolder)
    return savepath

def create_parfile(par_dc, sample="test", bam_path="", outfolder="",
                   savepath="/n/groups/reich/hringbauer/git/y_chrom/parfiles/"):
    """Create a parfile from a dictionary of parameters.

    par_dc: Parameter dictionary used as template. It is copied, the
        dictionary of the caller is left unchanged
    sample: Sample name. Written as "SAMP" and used as name of the parfile
    bam_path: Written as "bamname"
    outfolder: If non-empty, "readdepthname" is set to SAMP.txt in this folder
    savepath: Folder to save the parfile to
    Return: Path of the saved parfile"""
    ### Modify a copy of the Parameter Dictionary
    # Without the copy, values set for one sample (in particular an
    # outfolder-based "readdepthname") would stick for all later samples.
    par_dc = dict(par_dc)
    par_dc["SAMP"] = sample
    par_dc["bamname"] = bam_path

    if len(outfolder)>0:
        par_dc["readdepthname"] = os.path.join(outfolder, "SAMP.txt")

    ### Save to File, one "key: value" line per parameter
    savepath = os.path.join(savepath, sample)
    with open(savepath, "w") as f:
        f.writelines(f"{k}: {v}\n" for k, v in par_dc.items())
    print(f"Successfully saved parfile to {savepath}")
    return savepath
    
def run_pulldown_y(path_parfile, path_bin = "./scripts/pulldown", output=True):
    """Run Pulldown via par file, using an IPython shell escape.
    The "!" lines below are IPython syntax, so this function only
    works when defined in an IPython/Jupyter session.
    path_parfile: Which parfile to run (passed to the binary with -p)
    path_bin: Where to find the binary to run it. The default is a
    relative path, so it depends on the current working directory
    output: If True, show the standard output of the binary. If False,
    redirect the standard output to /dev/null
    Nothing is returned."""
    if output:
        # $name is replaced by the value of the Python variable
        ! $path_bin -p $path_parfile
    else:
        # Same command, only its standard output is discarded
        ! $path_bin -p $path_parfile  > /dev/null
    
def extram_bam_df(df, iid,
                  col_id = 'Instance ID ("_all" means includes a mix of UDG-treated and non-UDG-treated; "_published" distinguishes a published sample for a still-unpublished higher quality version)', 
                  col_bam = 'Data: autosomal bam'):
    """Look up the bam path of one individual in the Meta File.

    df: Dataframe of the Meta File
    iid: Identifier that is searched for in column col_id
    col_id: Column with the identifiers
    col_bam: Column with the bam paths
    Return: Entry of col_bam in the first row matching iid.
    An assertion fails if no row matches."""
    is_match = df[col_id] == iid
    n_matches = np.sum(is_match)
    assert n_matches > 0
    matching_bams = df.loc[is_match, col_bam].values
    return matching_bams[0]  # First match wins if there are several

def give_default_par_dc():
    """Build and return a new dictionary with the default parameters.

    Entries "SAMP" and "bamname" are placeholders ("FILL_IN"). The order
    of the entries is the order in which they get written to a parfile."""
    defaults = (
        ("BASE", "/n/groups/reich/matt/pipeline/static"),
        ("SAMP", "FILL_IN"),
        ("D2", "./output"),
        ("indivname", "/n/groups/reich/DAVID/V43/V43.5/v43.5.ind"),
        ("snpname", "/n/groups/reich/matt/pipeline/static/1240kSNP.snp"),
        ("bamname", "FILL_IN"),
        ("threshtable", "/n/groups/reich/matt/pipeline/static/pulldown_thresholds"),
        ("defstring", "capture_half"),
        ("readbam", "/n/groups/reich/matt/pipeline/static/readbam"),
        ("oldpullmode", "YES"),
        ("sampname", "SAMP"),
        ("printcount", "NO"),
        ("majmode", "NO"),
        ("rgonly", "NO"),
        ("chrom", "24"),
        ("readdepthname", "/n/groups/reich/hringbauer/git/y_chrom/output/SAMP.txt"),
    )
    return dict(defaults)

### Run for a single Individual

In [12]:
%%time
#anno_path = "/n/groups/reich/hringbauer/explore_ntbk/v43/V43/v42.3.anno"
anno_path = "/n/groups/reich/DAVID/V43/V43.5/v43.5.anno"
df = pd.read_csv(anno_path, sep='\t', low_memory=False)

par_dc = give_default_par_dc()
### Create a parfile to run
savepath = create_parfile_from_ref(df=df, 
                                   iid="I10266",  #I10266
                                   par_dc=par_dc)
### Produce Lots of Output (toggled)
run_pulldown_y(path_parfile=savepath)

/n/data1/hms/genetics/reich/1000Genomes/amh_samples/ancientMergeSets__CAPTURE/C-per_sample_versions/I10266/v0030.2__2018_05_02/merged/by_chr/*Y.bam
Successfully saved parfile to /n/groups/reich/hringbauer/git/y_chrom/parfiles/I10266
parameter file: /n/groups/reich/hringbauer/git/y_chrom/parfiles/I10266
BASE: /n/groups/reich/matt/pipeline/static
SAMP: I10266
D2: ./output
indivname: /n/groups/reich/DAVID/V43/V43.5/v43.5.ind
snpname: /n/groups/reich/matt/pipeline/static/1240kSNP.snp
bamname: /n/data1/hms/genetics/reich/1000Genomes/amh_samples/ancientMergeSets__CAPTURE/C-per_sample_versions/I10266/v0030.2__2018_05_02/merged/by_chr/*Y.bam
threshtable: /n/groups/reich/matt/pipeline/static/pulldown_thresholds
defstring: capture_half
readbam: /n/groups/reich/matt/pipeline/static/readbam
oldpullmode: YES
sampname: I10266
printcount: NO
majmode: NO
rgonly: NO
chrom: 24
readdepthname: /n/groups/reich/hringbauer/git/y_chrom/output/I10266.txt
version: 2400
majmode unset
old pullmode
seed: 190917594

### Do Multiple Individuals

In [32]:
#iids = ["I7116", "I7121", "I7120"]  # African R1b
#iids = ["R6.SG", "I18872", "I15941","I8016","I6752", "I16339", "I14337", "I8057", "I7116"] #R1b-V88 candidates
iids = ["SEC002"]

# Every individual starts from fresh default parameters. Reusing the
# par_dc left behind by another cell would silently carry over whatever
# that cell changed (e.g. SNP file or output folder).
for iid in iids:
    print(f"Doing {iid}")
    savepath = create_parfile_from_ref(df=df, iid=iid,
                                       par_dc=give_default_par_dc())

    run_pulldown_y(path_parfile=savepath, output=False)

Doing SEC002
/n/data1/hms/genetics/reich/1000Genomes/amh_samples/marcus2019_sardinia_samples/A-round1/B-fix/SEC002/hg19/SEC002.md_no1kgvar.bam.by_chr/*Y.bam
Successfully saved to /n/groups/reich/hringbauer/git/y_chrom/parfiles/SEC002
/home/np29/o2bin/samtools view /n/data1/hms/genetics/reich/1000Genomes/amh_samples/marcus2019_sardinia_samples/A-round1/B-fix/SEC002/hg19/SEC002.md_no1kgvar.bam.by_chr/*Y.bam Y 


# Run the Punic Individuals

In [None]:
#punic_df = pd.read_csv("../punic_aDNA/data/males_feb20.csv")
punic_df = pd.read_csv("../punic_aDNA/data/males_sic_feb20.csv")

# The identifiers are taken from the first column of the table
male_punic_iids = punic_df.iloc[:,0].values

# Every individual starts from fresh default parameters. Reusing the
# par_dc left behind by another cell would silently carry over whatever
# that cell changed (e.g. SNP file or output folder).
for iid in male_punic_iids:
    print(f"Doing {iid}")
    savepath = create_parfile_from_ref(df=df, iid=iid,
                                       par_dc=give_default_par_dc())

    run_pulldown_y(path_parfile=savepath, output=False)

### Run the Ning Individuals

In [17]:
iid_path = "/n/groups/reich/hringbauer/git/tibet_aDNA/Data/iids_male_ning2020.csv"
# ndmin=1 keeps iids one-dimensional even if the file has a single entry.
# Without it a one-line file gives a 0-d array, on which len() and
# slicing fail.
iids = np.loadtxt(iid_path, dtype="str", ndmin=1)
print(f"Loaded {len(iids)} from {iid_path}")

# (Re-)load the annotation file that the individuals are looked up in
anno_path = "/n/groups/reich/DAVID/V43/V43.5/v43.5.anno"
df = pd.read_csv(anno_path, sep='\t', low_memory=False)

Loaded 55 from /n/groups/reich/hringbauer/git/tibet_aDNA/Data/iids_male_ning2020.csv


In [19]:
%%time
par_dc = give_default_par_dc()
### Create a parfile to run
for iid in iids[:1]:
    ### Produce the Y chromosome
    savepath = create_parfile_from_ref(df=df, 
                                       iid="MA110",  #I10266
                                       par_dc=par_dc,
                                       outfolder="./output/ning20/")
    run_pulldown_y(path_parfile=savepath) # Run the Pulldown

/n/data1/hms/genetics/reich/1000Genomes/amh_samples/marcus2019_sardinia_samples/A-round1/B-fix/MA110/hg19/MA110.md_no1kgvar.bam.by_chr/*Y.bam
Successfully saved parfile to /n/groups/reich/hringbauer/git/y_chrom/parfiles/MA110
parameter file: /n/groups/reich/hringbauer/git/y_chrom/parfiles/MA110
BASE: /n/groups/reich/matt/pipeline/static
SAMP: MA110
D2: ./output
indivname: /n/groups/reich/DAVID/V43/V43.5/v43.5.ind
snpname: /n/groups/reich/matt/pipeline/static/1240kSNP.snp
bamname: /n/data1/hms/genetics/reich/1000Genomes/amh_samples/marcus2019_sardinia_samples/A-round1/B-fix/MA110/hg19/MA110.md_no1kgvar.bam.by_chr/*Y.bam
threshtable: /n/groups/reich/matt/pipeline/static/pulldown_thresholds
defstring: capture_half
readbam: /n/groups/reich/matt/pipeline/static/readbam
oldpullmode: YES
sampname: MA110
printcount: NO
majmode: NO
rgonly: NO
chrom: 24
readdepthname: ./output/ning20/MA110.txt
version: 2400
majmode unset
old pullmode
seed: 1438217005
##start:paramfile
BASE: /n/groups/reich/matt/

In [15]:
%%time
par_dc = give_default_par_dc()
### Create a parfile to run
for iid in iids[:1]:
    ### Produce the Y chromosome
    savepath = create_parfile_from_ref(df=df, 
                                       iid=iid,  #I10266
                                       par_dc=par_dc,
                                       outfolder="./output/ning20/")
    run_pulldown_y(path_parfile=savepath) # Run the Pulldown

Successfully saved parfile to /n/groups/reich/hringbauer/git/y_chrom/parfiles/ZLNR-2
parameter file: /n/groups/reich/hringbauer/git/y_chrom/parfiles/ZLNR-2
BASE: /n/groups/reich/matt/pipeline/static
SAMP: ZLNR-2
D2: ./output
indivname: /n/groups/reich/DAVID/V43/V43.5/v43.5.ind
snpname: /n/groups/reich/matt/pipeline/static/1240kSNP.snp
bamname: /n/data1/hms/genetics/reich/1000Genomes/amh_samples/ning2020__eastAsian_samples/A-round1/B-fix/ZLNR-2/hg19/aln.mapped.rmdupse.md_no1kgvar.rg.bam
threshtable: /n/groups/reich/matt/pipeline/static/pulldown_thresholds
defstring: capture_half
readbam: /n/groups/reich/matt/pipeline/static/readbam
oldpullmode: YES
sampname: ZLNR-2
printcount: NO
majmode: NO
rgonly: NO
chrom: 24
readdepthname: ./output/ning20/ZLNR-2.txt
version: 2400
majmode unset
old pullmode
seed: 1695689265
##start:paramfile
BASE: /n/groups/reich/matt/pipeline/static
SAMP: ZLNR-2
D2: ./output
indivname: /n/groups/reich/DAVID/V43/V43.5/v43.5.ind
snpname: /n/groups/reich/matt/pipeline/

# Run the Full ISOGG SNP set 

In [7]:
%%time
### Create a parfile to run
savepath = create_parfile_from_ref(df=df, 
                                   iid="Loschbour_published.DG",  #I10266
                                   par_dc=par_dc,
                                   
                                   use_fullisogg=True)
### Produce Lots of Output (toggled)
run_pulldown_y(path_parfile=savepath)

../*Y.bam
Successfully saved to /n/groups/reich/hringbauer/git/y_chrom/parfiles/Loschbour_published.DG
parameter file: /n/groups/reich/hringbauer/git/y_chrom/parfiles/Loschbour_published.DG
BASE: /n/groups/reich/matt/pipeline/static
SAMP: Loschbour_published.DG
D2: ./output
indivname: /n/groups/reich/hringbauer/explore_ntbk/v42/V42.3/v42.3.ind
snpname: /n/groups/reich/hringbauer/git/y_chrom/data/eigenstrat/y_snps_all2020.snp
bamname: ../*Y.bam
threshtable: /n/groups/reich/matt/pipeline/static/pulldown_thresholds
defstring: capture_half
readbam: /n/groups/reich/matt/pipeline/static/readbam
oldpullmode: YES
sampname: Loschbour_published.DG
printcount: NO
majmode: NO
rgonly: NO
chrom: 24
readdepthname: /n/groups/reich/hringbauer/git/y_chrom/output/pulldowns_all/Loschbour_published.DG.txt
version: 2400
majmode unset
old pullmode
seed: 1354517211
##start:paramfile
BASE: /n/groups/reich/matt/pipeline/static
SAMP: Loschbour_published.DG
D2: ./output
indivname: /n/groups/reich/hringbauer/explo

# Area 51

In [None]:
### Print old Parfile
# NOTE(review): df1 is not defined in the visible part of the notebook -
# confirm where it comes from before running this cell.
path = df1['Data: pulldown logfile location'].values[0]
# The with block closes the file again, also if reading fails
with open(path, mode='r') as file:
    txt = file.read()
print(txt)