In [196]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
from pysam import AlignmentFile

### For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
### Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/y_chrom/"  # The Path on Midway Cluster
    
elif socket_name.startswith("bionc"):
    print("Leipzig Cluster detected!")
    path = "/mnt/archgen/users/hringbauer/git/y_chrom/"
    
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

bionc21
Leipzig Cluster detected!
/mnt/archgen/users/hringbauer/git/y_chrom
CPU Count: 40
3.8.10 (default, Nov 26 2021, 20:14:08) 
[GCC 9.3.0]


### Load functions which are needed

In [197]:
def load_counts(path_counts, coerce=True):
    """Load Count file and return Dataframe"""
    df_t = pd.read_csv(path_counts, header=None, delim_whitespace=True)
    df_t.columns = ["snp", "chr", "pos", "ref_all", "alt_all", "drop", "iid", "ref", "alt"]
    
    if coerce:
        for col in ["pos", "ref", "alt"]:
            df_t[col] = pd.to_numeric(df_t[col], errors="coerce")
            
    df_t = df_t.drop(columns="drop")
    return df_t


def load_snp_file_ISOGG(path_snps = "./data/all_snps.csv", 
                    col_pos = 'Build 37 Number'):
    """Return Dataframe in Eigenstrat Format,
    filtered for biallelic SNPs"""
    df_raw = pd.read_csv(path_snps)
    print(df_raw.columns)
    print(f"Loaded {len(df_raw)} SNPs")

    ### Process the positions
    pos = df_raw[col_pos]
    df_raw["pos"] = pd.to_numeric(pos, errors="coerce")

    idx = ~df_raw["pos"].isna()
    print(f"# Positions available: {np.sum(idx)}")
    df = df_raw[idx].reset_index(drop=True)
    df["pos"]=df["pos"].astype("int")

    idx_bi= (df["Mutation Info"].str.len()==4)
    print(f"# Biallelic SNPs: {np.sum(idx_bi)}")
    df = df[idx_bi].reset_index(drop=True)
    df["ref"] = df["Mutation Info"].str[0]
    df["alt"] = df["Mutation Info"].str[3]
    df["chrom"] = "Y"

    cols = ["Name", "chrom", "pos", "ref", "alt", 
            'Subgroup Name', 'Alternate Names', 'rs numbers']
    df = df[cols]
    df = df.replace(regex=[' ','\n'], value='_')
    ### Sort by position
    df = df.sort_values(by="pos")
    
    ### Keep only SNPs where Ref and Alt Different
    idx_same = (df["ref"]==df["alt"])
    df = df[~idx_same]
    print(f"# Ref & Alt different: {len(df)}")
    
    ### Keep only ACTG
    snps_acceptable = ["A", "C", "T", "G"]
    idx_ref = df["ref"].isin(snps_acceptable)
    idx_alt = df["alt"].isin(snps_acceptable)
    idx_both = idx_ref & idx_alt
    df = df[idx_both]
    print(f"# Ref & Alt ACTG: {len(df)}")
    
    ### Keep Unique Values
    idx_dup = df.duplicated(subset=["pos", "ref", "alt"], keep="first")
    df = df[~idx_dup]
    print(f"# Unique SNP positions: {len(df)}")
    
    ### Remove duplicate Names
    #idx_dup = df.duplicated(subset="Name", keep=False)
    #df = df[~idx_dup]
    #print(f"# Unique Names: {len(df)}")
    return df.copy().reset_index(drop=True)


################################################
### Calling Ys

def counts_alleles_ch(df_snps, C, ch=2, bases=["A", "C", "G", "T"]):
    """Count alleles from Pysam object C against SNPs on chr ch
    in df_snps (1240K .snp dataframe).
    ch: Chromosome [int]. If 0, all autosomes (1,...,22)"""
    df_ch = df_snps[df_snps["chrom"]==ch].copy()

    counts = np.zeros((len(df_ch), 4), dtype="int8")

    for i, (ch, pos) in enumerate(df_ch[["chrom", "pos"]].values):
        counts[i,:] = np.array(C.count_coverage(contig=str(ch), start=pos-1, stop=pos))[:,0]
    for i, p in enumerate(bases):
        df_ch[p] = counts[:,i]
        
    df_ch = ref_alt_count(df_ch)
    return  df_ch

def ref_alt_count(df_ch, bases=["A", "C", "G", "T"]):
    """Count Ref and Alt alleles in Dataframe df_ch
    with ref, alt, A, C, G, T fields and enter new columns
    ref# and alt#"""
    df_ch["ref#"]=0
    df_ch["alt#"]=0

    for p in bases:
        idx = df_ch["ref"] == p
        df_ch.loc[idx, "ref#"] = df_ch.loc[idx, p]

        idx = df_ch["alt"] == p
        df_ch.loc[idx, "alt#"] = df_ch.loc[idx, p]
    return df_ch

def call_y_bam(path_bam="", df=[], ch="Y"):
    """Creates the Call Table from a .bam file"""
    C = AlignmentFile(path_bam)
    df_ch = counts_alleles_ch(df, C, ch=ch)
    
    cov = df_ch["ref#"] + df_ch["alt#"]
    avg_cov = np.mean(cov)
    print(f"Average Depth: {np.mean(avg_cov):.4f}x")
    
    
    idx_der = df_ch["alt#"]>df_ch["ref#"]
    print(f"Derived Read Loci: \n{np.sum(idx_der)} / {np.sum(cov>0)} covered>0 / {len(idx_der)} total")
    
    df_der = df_ch[idx_der].sort_values(by="Subgroup Name").reset_index(drop=True).copy()
    
    return df_ch, df_der

# Load the BAM path [from Autorun]
Can use this dictionary to look up bam files of individuals

In [199]:
dft = pd.read_csv("/mnt/archgen/users/hringbauer/git/auto_popgen/output/v0/bam_paths.tsv", sep="\t")
n = np.sum(dft["bam#"]>0)
bam_dict = dict(zip(dft["iid"], dft["bam_path"]))
print(f"Loaded {len(dft)} Individuals. With BAM: {n}")

Loaded 2846 Individuals. With BAM: 1251


### Prepare the SNP list

In [200]:
df = load_snp_file_ISOGG("./data/all_snps.csv")

Index(['Name', 'Subgroup Name', 'Alternate Names', 'rs numbers',
       'Build 37 Number', 'Build 38 Number', 'Mutation Info'],
      dtype='object')
Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 73148


In [7]:
### Do the Y bam
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/brienzi_capture/A55903.bam", df=df)

Average Depth: 1.2402x
Derived Read Loci: 
547 / 17345 covered>0 / 73148 total


### Pulldown on a sample from Georgia

In [205]:
path_bam = bam_dict["TSV008"]
path_bam

'/mnt/archgen/Autorun_eager/eager_outputs/TF/TSV008/trimmed_bam/TSV008.A0101.trimmed.bam'

In [206]:
%%time
### Do the Y capture
df_ch1, df_der1 = call_y_bam(path_bam=path_bam, df=df)

Average Depth: 0.2295x
Derived Read Loci: 
511 / 11127 covered>0 / 73148 total
CPU times: user 14.4 s, sys: 104 ms, total: 14.5 s
Wall time: 14.8 s


## Analysis: Browse the output Tables

In [209]:
df_der1[-150:-100]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
361,PF3141,Y,23973594,T,G,G2a,U5,rs2178500,0,0,1,0,0,1
362,F3088,Y,20813445,G,A,G2a,PF3043,rs773017696,2,0,0,0,0,2
363,F1980,Y,15660640,C,T,G2a,M3309;_PF2972,rs549001775,0,0,0,1,0,1
364,S149,Y,14028148,C,A,G2a,L31;_PF3142,rs35617575,1,0,0,0,0,1
365,M3307,Y,15588776,A,C,G2a,F1975;_PF2969,rs541981761,0,1,0,0,0,1
366,PF3112,Y,23244026,C,T,G2a,P15,rs370167410,0,0,0,2,0,2
367,M3397,Y,21605685,G,C,G2a,PF3060,rs574243059,0,1,0,0,0,1
368,FGC635,Y,23165969,A,T,G2a1,Z6738,rs889435258,0,0,0,2,0,2
369,FGC666,Y,6716379,G,A,G2a1,Z6627,rs776116051,1,0,0,0,0,1
370,Z6529,Y,6678268,C,T,G2a1,FGC585;_SK1113,rs752933358,0,0,0,2,0,2


In [210]:
df_t = df_ch1[df_ch1["Subgroup Name"].str.contains("G2a1a1a")].sort_values(by="Subgroup Name")[:40]
df_t

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
73113,Z6747,Y,28790097,A,C,G2a1a1a,FGC781,,0,0,0,0,0,0
53667,FGC746,Y,19194266,C,T,G2a1a1a,Z6702,rs778878868,0,0,0,0,0,0
52191,FGC742,Y,18991247,G,C,G2a1a1a,Z6700,rs1013297795,0,0,0,0,0,0
51709,Z6699,Y,18918472,G,T,G2a1a1a,FGC741,rs978512958,0,0,0,1,0,1
49190,Z6696,Y,18382172,C,T,G2a1a1a,FGC737,rs1035513718,0,0,0,0,0,0
47924,Z6682,Y,18084035,G,T,G2a1a1a,SK1114;_FGC735,rs752041400,0,0,0,0,0,0
3901,Z6628,Y,6781321,G,A,G2a1a1a,FGC667,rs767797024,1,0,0,0,0,1
42338,Z6690,Y,17278478,C,T,G2a1a1a,FGC727,rs745764343,0,0,0,0,0,0
5861,Z6629,Y,7031739,T,C,G2a1a1a,FGC670,rs779258931,0,4,0,0,0,4
29856,FGC714,Y,15049551,C,T,G2a1a1a,Z6678,rs1031119093,0,1,0,0,1,0


In [214]:
df_t = df_ch1[df_ch1["Subgroup Name"]=="G2a1"].sort_values(by="Subgroup Name")[:40]
df_t

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
3135,Z6529,Y,6678268,C,T,G2a1,FGC585;_SK1113,rs752933358,0,0,0,2,0,2
65669,Z6737,Y,22788174,C,T,G2a1,FGC769,rs983527530,0,0,0,0,0,0
64506,Z6735,Y,22578276,C,T,G2a1,FGC633,rs1042173727,0,0,0,0,0,0
56386,Z6594,Y,21040508,C,T,G2a1,FGC7542;_SK1112,rs1027341012,0,0,0,0,0,0
54232,FGC748,Y,19271171,T,C,G2a1,Z6704,,0,0,0,0,0,0
51552,Z6584,Y,18892741,A,C,G2a1,FGC622,rs770273706,0,0,0,0,0,0
30341,FGC607,Y,15145612,A,G,G2a1,Z6571,rs768693859,0,0,0,0,0,0
26066,FGC7538,Y,14376925,A,T,G2a1,Z6566,rs749251693,0,0,0,0,0,0
20209,Z6557,Y,9826351,A,G,G2a1,FGC1027,rs759653009,0,0,0,0,0,0
19955,FGC597,Y,9759841,C,T,G2a1,SK1109;_Z6556,rs908987487,0,0,0,1,0,1


In [208]:
df_t[df_t["alt#"]>0]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
63388,P312,Y,22157311,C,A,R1b1a1b1a1a2,PF6547;_S116,,51,0,0,0,0,51
63951,BY188,Y,22474043,G,T,R1b1a1b1a1a2c1a1a1a1a1~,,,0,0,7,3,7,3
16892,PF1557.2,Y,8631875,C,A,R1b1a1b1a1a2c1a4b8~,,,1,6,0,0,6,1
38535,BY23092,Y,16635363,G,A,R1b1a1b1a1a2c1a5c3b1b2,,,1,0,13,0,13,1


In [245]:
df_ch[df_ch['Name'].str.contains("M269")]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
59864,M2696,Y,21571895,G,A,IJK,PF3500,,1,0,0,0,0,1


In [243]:
df_t.columns

Index(['Name', 'chrom', 'pos', 'ref', 'alt', 'Subgroup Name',
       'Alternate Names', 'rs numbers', 'A', 'C', 'G', 'T', 'ref#', 'alt#'],
      dtype='object')

### Extra: Test a Malta Individual as known case
According to publication: H2

In [172]:
%%time
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/malta/X5.bam", df=df)

Average Depth: 0.7174x
Derived Read Loci: 588/73148
CPU times: user 40.6 s, sys: 704 ms, total: 41.3 s
Wall time: 41.3 s


In [227]:
df_ch[idx_der].sort_values(by="Subgroup Name")[:50]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
48397,L1135,Y,18147303,C,A,A0-T,,,3,0,0,0,0,3
63684,L1155,Y,22191266,G,C,A0-T,,,0,1,0,0,0,1
4479,L1101,Y,6859819,T,C,A0-T,,,0,0,0,0,0,0
9116,L1105,Y,7590048,C,T,A0-T,,,0,0,0,13,0,13
4366,L1098,Y,6847637,C,A,A0-T,,,0,0,0,0,0,0
39016,L1132,Y,16718811,A,G,A0-T,,,0,0,0,0,0,0
60800,L1145,Y,21739790,C,T,A0-T,,,0,0,0,2,0,2
22680,L1116,Y,13888035,A,C,A0-T,,,0,0,0,0,0,0
2409,L1093,Y,4862550,A,G,A0-T,,,0,0,0,0,0,0
2167,L1090,Y,3544962,G,C,A0-T,,,0,0,0,0,0,0


### Test One other Malta Indivdiual

In [161]:
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/malta/X9.bam")

Average Depth: 4.0776x
Derived Read Loci: 1208/73148


In [None]:
df_der[-100:-50]

# Run the Full ISOGG SNP set 

# Area 51

In [13]:
!samtools view /mnt/archgen/users/hringbauer/data/brienzi_capture/A55903.bam 1:33000000-34000000 | wc -l

3052
