In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
from pysam import AlignmentFile

### For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
### Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']  # Sans-serif font list: Arial only

socket_name = socket.gethostname()
print(socket_name)

# Machines this notebook knows about:
# (hostname prefix, message to print, project directory to work from).
_known_hosts = (
    ("compute-", "HSM Computational partition detected.",
     "/n/groups/reich/hringbauer/git/y_chrom/"),
    ("bionc", "Leipzig Cluster detected!",
     "/mnt/archgen/users/hringbauer/git/y_chrom/"),
)

for _prefix, _message, _project_dir in _known_hosts:
    if socket_name.startswith(_prefix):
        print(_message)
        path = _project_dir
        break
else:
    # No prefix matched: stop here, before any directory change happens.
    raise RuntimeWarning("Not compatible machine. Check!!")

# Work from the project directory so the relative "./data/..." paths
# used further down resolve.
os.chdir(path)
print(os.getcwd())  # Show the directory we ended up in
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

bionc21
Leipzig Cluster detected!
/mnt/archgen/users/hringbauer/git/y_chrom
CPU Count: 40
3.8.10 (default, Mar 15 2022, 12:22:08) 
[GCC 9.4.0]


### Load functions which are needed

In [2]:
def load_counts(path_counts, coerce=True):
    """Load a header-less, whitespace-delimited count file into a DataFrame.

    The nine input columns are named, in order: snp, chr, pos, ref_all,
    alt_all, drop, iid, ref, alt. The "drop" column is discarded before
    returning.

    Parameters
    ----------
    path_counts : str, path or file-like
        Passed straight to pandas.read_csv.
    coerce : bool
        If True, the pos, ref and alt columns are converted to numeric;
        values that cannot be parsed become NaN.

    Returns
    -------
    pandas.DataFrame
        Columns snp, chr, pos, ref_all, alt_all, iid, ref, alt.
    """
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True and splits on any run of spaces/tabs.
    df_t = pd.read_csv(path_counts, header=None, sep=r"\s+")
    df_t.columns = ["snp", "chr", "pos", "ref_all", "alt_all", "drop", "iid", "ref", "alt"]

    if coerce:
        for col in ["pos", "ref", "alt"]:
            df_t[col] = pd.to_numeric(df_t[col], errors="coerce")

    return df_t.drop(columns="drop")


def load_snp_file_ISOGG(path_snps = "./data/all_snps.csv", 
                    col_pos = 'Build 37 Number', unique=True):
    """Load the ISOGG SNP table and return a cleaned, position-sorted DataFrame.

    Filtering steps, each followed by a printed count:
      1. keep rows whose `col_pos` value parses as a number,
      2. keep rows whose "Mutation Info" string has exactly 4 characters;
         characters 0 and 3 become ref and alt
         (presumably of the form "A->G" - TODO confirm against the file),
      3. drop rows where ref equals alt,
      4. keep rows where both ref and alt are one of A, C, T, G,
      5. if `unique`, keep one row per (pos, ref, alt) combination.

    Spaces and newlines inside the returned cells are replaced by "_".

    path_snps: CSV file (or buffer) read with pandas.read_csv.
    col_pos: Name of the column holding the position.
    unique: Whether to drop rows duplicating an earlier (pos, ref, alt).

    Returns a DataFrame with columns Name, chrom (always "Y"), pos (int),
    ref, alt, Subgroup Name, Alternate Names, rs numbers and a fresh
    0..n-1 index."""
    raw = pd.read_csv(path_snps)
    print(raw.columns)
    print(f"Loaded {len(raw)} SNPs")

    ### Positions: anything that is not a number is discarded
    raw["pos"] = pd.to_numeric(raw[col_pos], errors="coerce")
    has_pos = raw["pos"].notna()
    print(f"# Positions available: {has_pos.sum()}")
    snps = raw[has_pos].reset_index(drop=True)
    snps["pos"] = snps["pos"].astype("int")

    ### Biallelic entries: 4-character mutation strings
    is_biallelic = snps["Mutation Info"].str.len() == 4
    print(f"# Biallelic SNPs: {is_biallelic.sum()}")
    snps = snps[is_biallelic].reset_index(drop=True)
    mutation = snps["Mutation Info"]
    snps["ref"] = mutation.str[0]
    snps["alt"] = mutation.str[3]
    snps["chrom"] = "Y"

    ### Select output columns, sanitise whitespace, order by position
    keep_cols = ["Name", "chrom", "pos", "ref", "alt",
                 'Subgroup Name', 'Alternate Names', 'rs numbers']
    snps = (snps[keep_cols]
            .replace(regex=[' ','\n'], value='_')
            .sort_values(by="pos"))

    ### Ref and Alt have to differ
    snps = snps[snps["ref"] != snps["alt"]]
    print(f"# Ref & Alt different: {len(snps)}")

    ### Ref and Alt both have to be plain nucleotides
    valid_bases = ["A", "C", "T", "G"]
    ref_ok = snps["ref"].isin(valid_bases)
    alt_ok = snps["alt"].isin(valid_bases)
    snps = snps[ref_ok & alt_ok]
    print(f"# Ref & Alt ACTG: {len(snps)}")

    ### Optionally collapse repeated (pos, ref, alt) entries
    if unique:
        snps = snps.drop_duplicates(subset=["pos", "ref", "alt"], keep="first")
        print(f"# Unique SNP positions: {len(snps)}")

    return snps.copy().reset_index(drop=True)


################################################
### Calling Ys

def ref_alt_count(df_ch, bases=["A", "C", "G", "T"]):
    """Add ref# and alt# read-count columns to df_ch (modified in place).

    df_ch must hold the allele letters in columns "ref" and "alt" and one
    count column per entry of `bases`. For every row, ref# receives the
    count from the column named by the row's ref allele and alt# the count
    from the column named by its alt allele. Rows whose allele is not in
    `bases` keep a count of 0.

    Returns the same df_ch object."""
    # (column holding the allele letter, column receiving its count)
    targets = (("ref", "ref#"), ("alt", "alt#"))

    for _, count_col in targets:
        df_ch[count_col] = 0

    for base in bases:
        for allele_col, count_col in targets:
            mask = df_ch[allele_col] == base
            df_ch.loc[mask, count_col] = df_ch.loc[mask, base]
    return df_ch

def pulldown_bamtable(path_bam = "", o_file = "",                   
                      bamtable = "/home/pruefer/bin/BamTable",
                      path_bed = "/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps.bed"):
    """Run BamTable on a BAM file and write its standard output to o_file.

    The executed command is
        <bamtable> -F -A -f <path_bed> <path_bam>
    with the flags passed through unchanged.

    Parameters
    ----------
    path_bam : BAM file to pull down.
    o_file : File that receives the program's standard output
        (created or overwritten).
    bamtable : Path to the BamTable executable.
    path_bed : BED file handed to BamTable via -f.

    Raises
    ------
    RuntimeError
        If the program exits with a non-zero status; the message carries
        whatever it wrote to standard error.
    FileNotFoundError
        If the executable (or the directory of o_file) does not exist.
    """
    import subprocess
    import sys

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed verbatim, and the function no longer
    # depends on IPython's "!" syntax.
    cmd = [str(bamtable), "-F", "-A", "-f", str(path_bed), str(path_bam)]
    with open(o_file, "wb") as handle:
        result = subprocess.run(cmd, stdout=handle, stderr=subprocess.PIPE)

    stderr_text = result.stderr.decode(errors="replace")
    if result.returncode != 0:
        raise RuntimeError(
            f"BamTable pulldown failed (exit code {result.returncode}) "
            f"for {path_bam}: {stderr_text.strip()}")
    if stderr_text:
        # Keep diagnostics of a successful run visible to the user.
        sys.stderr.write(stderr_text)
    

def call_y_bam(path_bam="", df=[],
               path_bed = "/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps.bed",
               path_temp="/mnt/archgen/users/hringbauer/git/y_chrom/temp/temp.tsv"):
    """Create the allele-count table and the derived-call table for one BAM.

    Runs pulldown_bamtable on path_bam (output written to path_temp), reads
    that tab-separated file back as columns chrom, pos, A, C, G, T, merges
    it with `df` on (chrom, pos) and adds ref#/alt# via ref_alt_count.
    Coverage and derived-site summaries are printed along the way.

    Parameters
    ----------
    path_bam : BAM file handed to pulldown_bamtable.
    df : pandas.DataFrame
        SNP table. The code uses its chrom, pos, ref, alt and
        "Subgroup Name" columns. The [] default is only a placeholder and
        is rejected.
    path_bed : BED file handed to pulldown_bamtable.
    path_temp : File the pulldown is written to and read back from. It is
        overwritten on every call, so concurrent calls must not share it.

    Returns
    -------
    (df_ch, df_der)
        df_ch: merged table with the added ref# and alt# columns.
        df_der: rows of df_ch where alt# > ref#, sorted by "Subgroup Name",
        with a fresh index.

    Raises
    ------
    TypeError
        If df is not a DataFrame.
    """
    # Fail fast: without this check a missing df is only noticed inside
    # pd.merge, after the external pulldown has already been run.
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"df must be a pandas DataFrame of SNPs, got {type(df).__name__}")

    ### Create the Pulldown
    pulldown_bamtable(path_bam = path_bam,
                      path_bed=path_bed,
                      o_file = path_temp)

    counts = pd.read_csv(path_temp, sep="\t", header=None)
    counts.columns = ["chrom", "pos", "A", "C", "G", "T"]
    merged = pd.merge(df, counts, on=["chrom", "pos"])

    ### Coverage Statistics
    # Computed over every row of the pulldown table and normalised by the
    # number of SNPs in df.
    depth = counts[["A", "C", "G", "T"]].values.sum(axis=1)
    n_covered = np.sum(depth > 0)
    print(f"Average Coverage: {np.sum(depth)/len(df):.4f}x")
    print(f"#Sites covered: {n_covered}/{len(df)}")

    ### Establish Ref and Alt allele
    df_ch = ref_alt_count(merged, bases=["A", "C", "G", "T"])

    ### Identify Derived: more reads support alt than ref
    idx_der = df_ch["alt#"] > df_ch["ref#"]
    print(f"#Derived Loci: \n{np.sum(idx_der)} / {n_covered} covered>0")

    df_der = df_ch[idx_der].sort_values(by="Subgroup Name").reset_index(drop=True).copy()

    return df_ch, df_der

# Create a bed file for ISOGG SNPs [one time requirement]

In [3]:
df = load_snp_file_ISOGG("./data/all_snps.csv")

savepath = "./data/isogg_snps.bed"

# Three tab-separated columns without header: chrom, pos, and pos repeated.
dft = df[["chrom", "pos"]].assign(pos1=df["pos"])
dft.to_csv(savepath, sep="\t", index=False, header=None)
print(f"Saved {len(dft)} ISOGG SNPs to {savepath}")

Index(['Name', 'Subgroup Name', 'Alternate Names', 'rs numbers',
       'Build 37 Number', 'Build 38 Number', 'Mutation Info'],
      dtype='object')
Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 73148
Saved 73148 ISOGG SNPs to ./data/isogg_snps.bed


# Load the BAM path [from Autorun]
The resulting dictionary (`bam_dict`) maps each individual ID to the path of its BAM file.

In [5]:
dft = pd.read_csv(
    "/mnt/archgen/users/hringbauer/git/auto_popgen/output/v0.2/bam_paths.tsv",
    sep="\t",
)
# Lookup table: individual ID -> BAM path
bam_dict = {iid: bam_path for iid, bam_path in zip(dft["iid"], dft["bam_path"])}
# Number of individuals with a positive "bam#" entry
n = (dft["bam#"] > 0).sum()
print(f"Loaded {len(dft)} Individuals. With BAM: {n}")

Loaded 1729 Individuals. With BAM: 1710


### Prepare the SNP list

In [6]:
# Load the filtered ISOGG SNP table; the call_y_bam cells below receive it as `df`.
df = load_snp_file_ISOGG("./data/all_snps.csv")

Index(['Name', 'Subgroup Name', 'Alternate Names', 'rs numbers',
       'Build 37 Number', 'Build 38 Number', 'Mutation Info'],
      dtype='object')
Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 73148


### Pulldown on a Test sample
Here: Use the Brienzi Y capture

In [254]:
%%time
# Pulldown + calls for the Brienzi Y-capture BAM, using the SNP table `df`.
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/hringbauer/data/brienziYcapture/A55903.bam")

Average Coverage: 12.5582x
#Sites covered: 60398/91806
#Derived Loci: 
1690 / 60398 covered>0
CPU times: user 222 ms, sys: 93.1 ms, total: 315 ms
Wall time: 6.47 s


In [None]:
### Browse the results: change the slice bounds to page through the table.
### df_der is sorted by "Subgroup Name", so the rows at the end are the most interesting.
df_der[-50:]

In [225]:
%%time
# Same call as the previous pulldown cell (same BAM, same `df` argument).
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/hringbauer/data/brienziYcapture/A55903.bam")

Average Coverage: 15.7615x
#Sites covered: 60398/73148
#Derived Loci: 
1049 / 60398 covered>0
CPU times: user 217 ms, sys: 98 ms, total: 315 ms
Wall time: 6.71 s


In [None]:
# Show the last 50 rows of the derived-call table.
df_der[-50:]

In [33]:
# All called sites whose "Subgroup Name" is exactly "R".
df_ch[df_ch["Subgroup Name"]=="R"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
206,M734,Y,18066156,C,T,R,PF6057;_S4;_YSC0000201,,0,0,0,1,0,1


### Test Georgian Individual

In [35]:
# Look up the BAM path stored for individual DDN001.
bam_dict["DDN001"]

'/mnt/archgen/Autorun_eager/eager_outputs/TF/DDN/DDN001/trimmed_bam/DDN001.A0101.trimmed.bam'

In [36]:
%%time
# Pulldown + calls for DDN001; the path is the one bam_dict["DDN001"] returned above.
df_ch, df_der = call_y_bam(df=df, 
                           path_bam='/mnt/archgen/Autorun_eager/eager_outputs/TF/DDN/DDN001/trimmed_bam/DDN001.A0101.trimmed.bam')

Average Coverage: 0.0122x
#Sites covered: 859/73148
#Derived Loci: 
37 / 859 covered>0
CPU times: user 45.8 ms, sys: 25.8 ms, total: 71.7 ms
Wall time: 876 ms


In [37]:
# Display the derived-call table of the last call_y_bam run.
df_der

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
0,L1128,Y,16428812,C,T,A0-T,,,0,0,0,1,0,1
1,A8847,Y,18894222,A,G,A00-T~,,,0,0,1,0,0,1
2,V1820,Y,8349696,G,C,A1b,,rs535186974,0,1,0,0,0,1
3,M6845,Y,2736008,G,A,B2b1b1b~,,,1,0,0,0,0,1
4,Z40384,Y,28787653,G,C,BT,,,0,2,0,0,0,2
5,M9240,Y,17940510,C,A,BT,,,1,0,0,0,0,1
6,M9336,Y,21646196,G,A,BT,Page26,,1,0,0,0,0,1
7,M9003,Y,7890655,C,G,BT,,,0,0,1,0,0,1
8,M9209,Y,17298439,G,A,BT,,,1,0,0,0,0,1
9,Y11708,Y,18704575,C,T,C1a2a1a2,Z30468,,0,0,0,1,0,1


### Test Malta Individual

In [163]:
%%time
# Pulldown + calls for the Malta BAM X5, using the SNP table `df`.
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/hringbauer/data/malta/X5.bam")

Average Coverage: 0.6477x
#Sites covered: 33545
#Derived Loci: 
539 / 33545 covered>0
CPU times: user 97.6 ms, sys: 36.9 ms, total: 134 ms
Wall time: 1.56 s


In [None]:
# Tally derived calls by the first three characters of their subgroup name.
df_der["Subgroup Name"].str[:3].value_counts()

## Analysis: Browse the output Tables

In [None]:
# NOTE(review): `df_der1` is not assigned in any visible cell - presumably a
# leftover from an earlier session; confirm, or use `df_der` instead.
df_der1[-150:-100]

In [None]:
# First 40 called sites whose "Subgroup Name" is exactly "G2a1".
# NOTE(review): `df_ch1` is not assigned in any visible cell - presumably a
# leftover from an earlier session; confirm, or use `df_ch` instead.
df_t = df_ch1[df_ch1["Subgroup Name"]=="G2a1"].sort_values(by="Subgroup Name")[:40]
df_t

In [208]:
# Keep only rows with at least one read supporting the alt allele.
df_t[df_t["alt#"]>0]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
63388,P312,Y,22157311,C,A,R1b1a1b1a1a2,PF6547;_S116,,51,0,0,0,0,51
63951,BY188,Y,22474043,G,T,R1b1a1b1a1a2c1a1a1a1a1~,,,0,0,7,3,7,3
16892,PF1557.2,Y,8631875,C,A,R1b1a1b1a1a2c1a4b8~,,,1,6,0,0,6,1
38535,BY23092,Y,16635363,G,A,R1b1a1b1a1a2c1a5c3b1b2,,,1,0,13,0,13,1


In [269]:
# Look up the called site(s) named P312.
df_ch[df_ch['Name']=="P312"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
51049,P312,Y,22157311,C,A,R1b1a1b1a1a2,PF6547;_S116,,82,0,0,0,0,82


In [29]:
# All called sites whose "Subgroup Name" is exactly "R".
df_ch[df_ch['Subgroup Name']=="R"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
18,PF5992,Y,2810583,A,G,R,CTS207;_M600,,0,0,1,0,0,1
378,CTS3622,Y,15078469,C,G,R,PF6037,,0,0,1,0,0,1
412,FGC1168,Y,15667208,G,C,R,,,0,1,0,0,0,1
548,CTS7876,Y,17722802,G,A,R,PF6052,,1,0,0,0,0,1


In [243]:
# List the column names of the browse table.
df_t.columns

Index(['Name', 'chrom', 'pos', 'ref', 'alt', 'Subgroup Name',
       'Alternate Names', 'rs numbers', 'A', 'C', 'G', 'T', 'ref#', 'alt#'],
      dtype='object')

### Extra: Test a Malta Individual as known case
According to publication: H2

In [172]:
%%time
# Pulldown + calls for Malta individual X5 (the section heading gives H2 as the published haplogroup).
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/malta/X5.bam", df=df)

Average Depth: 0.7174x
Derived Read Loci: 588/73148
CPU times: user 40.6 s, sys: 704 ms, total: 41.3 s
Wall time: 41.3 s


In [None]:
# idx_der only exists as a local inside call_y_bam, so rebuild the derived
# mask here: sites where more reads support alt than ref.
df_ch[df_ch["alt#"] > df_ch["ref#"]].sort_values(by="Subgroup Name")[:50]

### Test One Other Malta Individual

In [161]:
# Pass the SNP table explicitly: call_y_bam's default df=[] is only a
# placeholder and cannot be merged with the pulldown table.
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/malta/X9.bam", df=df)

Average Depth: 4.0776x
Derived Read Loci: 1208/73148


In [None]:
# Rows 100 to 51 counted from the end of the derived-call table.
df_der[-100:-50]

# Run the Full ISOGG SNP set 

# Area 51

In [13]:
# Count the records samtools prints for region 1:33000000-34000000 of this BAM.
# NOTE(review): the directory here is "brienzi_capture", while the pulldown
# cells read from "brienziYcapture" - confirm which path is intended.
!samtools view /mnt/archgen/users/hringbauer/data/brienzi_capture/A55903.bam 1:33000000-34000000 | wc -l

3052


In [None]:
path_bed = "/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps.bed"
# The BED file is written without a header row, so read it header-less;
# otherwise the first SNP would be consumed as column names.
# NOTE(review): this rebinds `df`, replacing the SNP table that the
# call_y_bam cells pass in - re-run the SNP loading cell afterwards.
df = pd.read_csv(path_bed, sep="\t", header=None, names=["chrom", "pos", "pos1"])