In [122]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
from pysam import AlignmentFile

### Use Arial for all figure text
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
### Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

# Pick the repository root from the hostname of the machine running the kernel
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/y_chrom/"  # Repository root on the HSM cluster
    
elif socket_name.startswith("bionc"):
    print("Leipzig Cluster detected!")
    path = "/mnt/archgen/users/hringbauer/git/y_chrom/"  # Repository root on the Leipzig cluster
    
else:
    # Unknown host: stop here instead of running with an undefined path
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Work from the repository root so relative paths such as ./data/... resolve
# Show the current working directory. Should be the y_chrom repository root chosen above
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

bionc21
Leipzig Cluster detected!
/mnt/archgen/users/hringbauer/git/y_chrom
CPU Count: 40
3.8.10 (default, Nov 26 2021, 20:14:08) 
[GCC 9.3.0]


### Load functions which are needed

In [178]:
def load_counts(path_counts, coerce=True):
    """Load a whitespace-delimited allele count file into a dataframe.

    The file must have no header row and exactly nine columns, which are
    named snp, chr, pos, ref_all, alt_all, drop, iid, ref, alt.

    path_counts: Path or file-like object handed to pd.read_csv.
    coerce: If True, convert pos, ref and alt to numeric values; entries
        that cannot be parsed become NaN.
    Return: Dataframe without the placeholder column "drop"."""
    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True keyword
    df_t = pd.read_csv(path_counts, header=None, sep=r"\s+")
    # Raises ValueError if the file does not have exactly nine columns
    df_t.columns = ["snp", "chr", "pos", "ref_all", "alt_all", "drop", "iid", "ref", "alt"]

    if coerce:
        for col in ["pos", "ref", "alt"]:
            df_t[col] = pd.to_numeric(df_t[col], errors="coerce")

    df_t = df_t.drop(columns="drop")
    return df_t


def load_snp_file_ISOGG(path_snps = "./data/all_snps.csv", 
                    col_pos = 'Build 37 Number'):
    """Load the ISOGG SNP table and return a cleaned dataframe of biallelic SNPs.

    Filtering steps (the number of rows left is printed after each one):
      1. keep rows whose position column parses as a number,
      2. keep rows whose "Mutation Info" has exactly 4 characters; its first
         and last character become ref and alt
         (presumably entries look like "A->G" -- TODO confirm),
      3. drop rows where ref equals alt,
      4. keep rows where ref and alt are both one of A, C, T, G,
      5. drop repeated (pos, ref, alt) combinations, keeping the first one.

    path_snps: Path (or file-like object) of the ISOGG .csv file.
    col_pos: Name of the column that holds the positions.
    Return: Dataframe sorted by pos with a fresh 0..n-1 index and the columns
        Name, chrom (always "Y"), pos, ref, alt, Subgroup Name,
        Alternate Names, rs numbers. Spaces and newlines inside text fields
        are replaced by underscores."""
    df_raw = pd.read_csv(path_snps)
    print(df_raw.columns)
    print(f"Loaded {len(df_raw)} SNPs")

    ### Process the positions
    pos = df_raw[col_pos]
    df_raw["pos"] = pd.to_numeric(pos, errors="coerce")

    idx = ~df_raw["pos"].isna()
    print(f"# Positions available: {np.sum(idx)}")
    df = df_raw[idx].reset_index(drop=True)
    df["pos"]=df["pos"].astype("int")

    ### Keep only 4-character mutation strings (non-string entries compare False)
    idx_bi= (df["Mutation Info"].str.len()==4)
    print(f"# Biallelic SNPs: {np.sum(idx_bi)}")
    df = df[idx_bi].reset_index(drop=True)
    df["ref"] = df["Mutation Info"].str[0]
    df["alt"] = df["Mutation Info"].str[3]
    df["chrom"] = "Y"

    cols = ["Name", "chrom", "pos", "ref", "alt", 
            'Subgroup Name', 'Alternate Names', 'rs numbers']
    df = df[cols]
    df = df.replace(regex=[' ','\n'], value='_')
    ### Sort by position
    # Stable sort: rows sharing a position keep their file order, so the
    # "keep first" de-duplication below always retains the same marker
    df = df.sort_values(by="pos", kind="mergesort")

    ### Keep only SNPs where Ref and Alt Different
    idx_same = (df["ref"]==df["alt"])
    df = df[~idx_same]
    print(f"# Ref & Alt different: {len(df)}")

    ### Keep only ACTG
    snps_acceptable = ["A", "C", "T", "G"]
    idx_ref = df["ref"].isin(snps_acceptable)
    idx_alt = df["alt"].isin(snps_acceptable)
    idx_both = idx_ref & idx_alt
    df = df[idx_both]
    print(f"# Ref & Alt ACTG: {len(df)}")

    ### Keep Unique Values
    # Uniqueness is per (pos, ref, alt): one position can still appear more
    # than once if it is listed with different alleles
    idx_dup = df.duplicated(subset=["pos", "ref", "alt"], keep="first")
    df = df[~idx_dup]
    print(f"# Unique SNP positions: {len(df)}")

    return df.copy().reset_index(drop=True)


################################################
### Calling Ys

def counts_alleles_ch(df_snps, C, ch=2, bases=["A", "C", "G", "T"]):
    """Count reads per base at every SNP of one chromosome.

    df_snps: SNP dataframe with at least the columns chrom, pos (1-based),
        ref and alt.
    C: Open pysam alignment object; its count_coverage method is queried
        once per SNP for the single base at pos, with pysam's default
        filters.
    ch: Value of the chrom column to select. It is compared with ==, so it
        must have the same type as the entries of that column (e.g. "Y").
    bases: Column names given to the four count arrays, in the order in
        which count_coverage returns them (pysam documents A, C, G, T).
    Return: Copy of the selected rows with one count column per base plus
        the columns ref# and alt# added by ref_alt_count."""
    df_ch = df_snps[df_snps["chrom"]==ch].copy()

    # Default integer width instead of int8: int8 wraps around to negative
    # counts as soon as a locus is covered by more than 127 reads
    counts = np.zeros((len(df_ch), 4), dtype="int")

    # The loop variable is called contig so that it does not overwrite ch
    for i, (contig, pos) in enumerate(df_ch[["chrom", "pos"]].values):
        # pysam intervals are 0-based and half-open: [pos-1, pos) is the SNP itself
        counts[i,:] = np.array(C.count_coverage(contig=str(contig), start=int(pos)-1, stop=int(pos)))[:,0]
    for i, p in enumerate(bases):
        df_ch[p] = counts[:,i]

    # Hand the same base labels on, so that custom labels stay consistent
    df_ch = ref_alt_count(df_ch, bases=bases)
    return df_ch

def ref_alt_count(df_ch, bases=["A", "C", "G", "T"]):
    """Add the read counts of the reference and the alternate allele.

    df_ch: Dataframe with the allele columns ref and alt and one count
        column per entry of bases. It is modified in place.
    bases: Names of the per-base count columns.
    Return: The same dataframe with the new columns ref# and alt#. Rows
        whose allele is not listed in bases keep a count of 0."""
    for allele_col, count_col in (("ref", "ref#"), ("alt", "alt#")):
        df_ch[count_col] = 0
        for base in bases:
            # Rows whose allele is this base take their count from its column
            is_base = df_ch[allele_col] == base
            df_ch.loc[is_base, count_col] = df_ch.loc[is_base, base]
    return df_ch

def call_y_bam(path_bam="", df=[], ch="Y"):
    """Create the call table for one .bam file.

    path_bam: Path of the alignment file to open with pysam.
    df: SNP dataframe, e.g. the output of load_snp_file_ISOGG. It has to be
        passed explicitly; the default is only a placeholder.
    ch: Value of the chrom column in df to analyse.
    Return: Tuple (df_ch, df_der). df_ch holds the allele counts of all
        SNPs on ch. df_der holds the loci with more alt than ref reads,
        sorted by Subgroup Name and with a fresh index.
    Raises: TypeError if df is not a dataframe."""
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be the SNP dataframe, e.g. from load_snp_file_ISOGG")

    # The context manager closes the alignment file even if counting fails
    with AlignmentFile(path_bam) as C:
        df_ch = counts_alleles_ch(df, C, ch=ch)

    # Depth counts only reads that carry the ref or the alt allele
    cov = df_ch["ref#"] + df_ch["alt#"]
    avg_cov = np.mean(cov)
    print(f"Average Depth: {avg_cov:.4f}x")

    # A locus counts as derived when alt reads strictly outnumber ref reads
    idx_der = df_ch["alt#"]>df_ch["ref#"]
    print(f"Derived Read Loci: \n{np.sum(idx_der)} / {np.sum(cov>0)} covered>0 / {len(idx_der)} total")

    df_der = df_ch[idx_der].sort_values(by="Subgroup Name").reset_index(drop=True).copy()

    return df_ch, df_der

# Work in progress

# Do the Pulldown for Brienzi

In [179]:
# Load the cleaned ISOGG SNP table (relative path, resolved against the working directory)
df = load_snp_file_ISOGG("./data/all_snps.csv")

Index(['Name', 'Subgroup Name', 'Alternate Names', 'rs numbers',
       'Build 37 Number', 'Build 38 Number', 'Mutation Info'],
      dtype='object')
Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 73148


In [180]:
# Count alleles in A55903.bam at every SNP of df: df_ch holds all loci, df_der only the derived ones
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/brienzi_capture/A55903.bam", df=df)

Average Depth: 1.2402x
Derived Read Loci: 
547 / 17345 covered>0 / 73148 total


In [187]:
# Last 50 derived loci; df_der is sorted by Subgroup Name
df_der[-50:]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
497,PF6419,Y,6912992,T,G,R1b1a1b,CTS623,,0,0,4,0,0,4
498,PF6509,Y,22190371,A,G,R1b1a1b,,,0,0,2,0,0,2
499,PF6434,Y,8411202,A,G,R1b1a1b,,,0,0,4,0,0,4
500,PF6527,Y,24394612,G,A,R1b1a1b,,,1,0,0,0,0,1
501,L1351,Y,24444622,C,T,R1b1a1b,PF6528;_YSC0000240,,0,0,0,2,0,2
502,YSC0001293,Y,7073423,G,A,R1b1a1b,CTS894;_PF6420,,3,0,0,0,0,3
503,L753,Y,18865298,C,T,R1b1a1b,PF6486;_YSC0000018,,0,0,0,7,0,7
504,PF6435,Y,8667179,A,G,R1b1a1b,,,0,0,2,0,0,2
505,FGC42,Y,19417394,A,C,R1b1a1b,CTS10349;_PF6492,,0,2,0,0,0,2
506,PF6426,Y,7766712,T,C,R1b1a1b,,,0,4,0,0,0,4


In [196]:
# All loci (derived or not) whose subgroup name contains R1b1a1b1a1a2c, first 50 after sorting by subgroup
df_ch[df_ch["Subgroup Name"].str.contains("R1b1a1b1a1a2c")].sort_values(by="Subgroup Name")[:50]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
71878,Z260,Y,24411932,G,T,R1b1a1b1a1a2c,,,0,0,0,0,0,0
72669,Z290,Y,28632468,G,C,R1b1a1b1a1a2c,S461,,0,0,0,0,0,0
63741,S245,Y,22200784,C,G,R1b1a1b1a1a2c,Z245,,0,0,0,0,0,0
2497,L459,Y,5275051,C,G,R1b1a1b1a1a2c1,,,0,9,0,0,9,0
33053,L21,Y,15654428,C,G,R1b1a1b1a1a2c1,M529;_S145,,0,33,0,0,33,0
46509,Z2542,Y,17885577,C,T,R1b1a1b1a1a2c1a,CTS8221,,0,3,0,0,3,0
1396,S521,Y,2836431,A,C,R1b1a1b1a1a2c1a,CTS241;_DF13,,0,0,0,0,0,0
65390,DF49,Y,22735599,G,A,R1b1a1b1a1a2c1a1a,S474,,0,0,0,0,0,0
47326,S6154,Y,17997565,T,C,R1b1a1b1a1a2c1a1a1,Z2980,,0,0,0,0,0,0
35184,S476,Y,15994422,C,G,R1b1a1b1a1a2c1a1a1a,Z2976,,0,0,0,0,0,0


### Test a Malta Individual as Baseline
According to the publication, the expected Y haplogroup of this individual is H2.

In [172]:
%%time
# Same call for the Malta sample X5; this overwrites df_ch and df_der from the previous sample
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/malta/X5.bam", df=df)

Average Depth: 0.7174x
Derived Read Loci: 588/73148
CPU times: user 40.6 s, sys: 704 ms, total: 41.3 s
Wall time: 41.3 s


In [174]:
# idx_der only exists inside call_y_bam, so the derived-locus mask is rebuilt here
idx_der = df_ch["alt#"] > df_ch["ref#"]
df_ch[idx_der].sort_values(by="Subgroup Name")[-100:-50]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
62819,Z18988,Y,22051923,T,G,H2a1~,,,0,0,1,0,0,1
72593,FGC3846,Y,28595821,T,C,H2a1~,,,0,2,0,0,0,2
66075,Z19004,Y,22854469,C,T,H2a1~,,,0,0,0,2,0,2
67513,Z19010,Y,23087173,G,A,H2a1~,,,1,0,0,0,0,1
7009,Z18835,Y,7214213,G,A,H2a1~,,,1,0,0,0,0,1
6148,Z18833,Y,7068890,C,T,H2a1~,,,0,0,0,2,0,2
73128,Z19031,Y,28800405,G,A,H2a1~,,,2,0,0,0,0,2
46237,Z18931,Y,17850026,G,C,H2a1~,,,0,1,0,0,0,1
45899,Z18929,Y,17796410,G,A,H2a1~,,,1,0,0,0,0,1
73122,Z19030,Y,28798163,G,C,H2a1~,,,0,2,1,0,1,2


### Test One Other Malta Individual

In [161]:
# The SNP table has to be passed explicitly: the default of df in call_y_bam is an empty placeholder
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/malta/X9.bam", df=df)

Average Depth: 4.0776x
Derived Read Loci: 1208/73148


In [167]:
# Rows 100 to 51 from the end of the derived loci (sorted by Subgroup Name)
df_der[-100:-50]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
1108,S1601.2,Y,14458744,A,G,G2a2a,Z6279.2,rs770388305,0,0,4,0,0,4
1109,S22816,Y,21288040,C,T,G2a2a,Z6479;_FGC2267,rs766320642,0,0,0,5,0,5
1110,PF3182,Y,21822756,C,T,G2a2a,,rs772973171,0,0,0,5,0,5
1111,PF3159,Y,14815695,C,G,G2a2a,,rs767074030,0,0,3,0,0,3
1112,PF3149,Y,7943188,A,G,G2a2a,,rs748733180,0,0,4,0,0,4
1113,PF3147,Y,7738069,G,A,G2a2a,,rs776853687,6,0,0,0,0,6
1114,PF6827,Y,2830780,A,G,G2a2a,,rs775179814,0,0,6,0,0,6
1115,Z6281,Y,15393669,A,C,G2a2a,,rs777782798,0,3,0,0,0,3
1116,PF3150,Y,8476569,T,C,G2a2a,,rs775742884,0,5,0,0,0,5
1117,PF3175,Y,18962113,C,T,G2a2a,,rs753053874,0,0,0,2,0,2


# Run the Full ISOGG SNP set 

# Area 51

In [None]:
### Print old Parfile
# NOTE(review): df1 is not defined in the visible part of this notebook -- confirm where it comes from
# NOTE(review): this reuses the name path, which the setup cell sets to the repository root
path = df1['Data: pulldown logfile location'].values[0]
# The context manager closes the file handle as soon as the text is read
with open(path, mode='r') as file:
    txt = file.read()
print(txt)