# Call Y Haplogroups
Leipzig Cluster, Using fast pulldown

In [312]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
from pysam import AlignmentFile

### Use Arial for all figure text
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
### Arial must be installed on the machine running this notebook (it is on the cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

# Select the repository root from the hostname prefix of the current machine
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/y_chrom/"  # Repository path on the HSM computational partition
    
elif socket_name.startswith("bionc"):
    print("Leipzig Cluster detected!")
    path = "/mnt/archgen/users/hringbauer/git/y_chrom/"
    
else:
    # Unknown host: stop here, because every relative path below assumes the repository root
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Make the repository root the working directory (relative paths such as ./data/ depend on it)
# Show the current working directory; it should be the y_chrom repository path selected above
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

bionc21
Leipzig Cluster detected!
/mnt/archgen/users/hringbauer/git/y_chrom
CPU Count: 40
3.8.10 (default, May 26 2023, 14:05:08) 
[GCC 9.4.0]


### Load functions which are needed

In [313]:
def load_counts(path_counts, coerce=True):
    """Load a whitespace-delimited allele count file into a DataFrame.

    The file is read without a header row and must contain exactly nine
    columns; they are named snp, chr, pos, ref_all, alt_all, drop, iid,
    ref and alt, in that order. The column named "drop" is removed before
    the table is returned.

    Parameters
    ----------
    path_counts : str or path-like
        Location of the count file.
    coerce : bool, default True
        If True, convert the columns pos, ref and alt to numeric values;
        entries that cannot be parsed become NaN.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the eight remaining columns.
    """
    # sep=r"\s+" is the documented replacement for delim_whitespace=True,
    # which is deprecated since pandas 2.2 and emits a FutureWarning.
    df_t = pd.read_csv(path_counts, header=None, sep=r"\s+")
    df_t.columns = ["snp", "chr", "pos", "ref_all", "alt_all", "drop", "iid", "ref", "alt"]

    if coerce:
        for col in ["pos", "ref", "alt"]:
            df_t[col] = pd.to_numeric(df_t[col], errors="coerce")

    df_t = df_t.drop(columns="drop")
    return df_t

def load_snp_file_ISOGG(path_snps = "./data/all_snps.csv", 
                    col_pos = 'Build 37 Number', unique=True):
    """Load the ISOGG SNP table and reduce it to usable biallelic SNPs.

    Filters are applied in this order, each one printing the number of
    SNPs that remain:
      1. the position in column `col_pos` must parse as a number,
      2. 'Mutation Info' must be exactly four characters long; the first
         character is taken as ref and the fourth as alt,
      3. ref and alt must differ,
      4. ref and alt must both be one of A, C, T, G,
      5. if `unique` is True, only the first row of every
         (pos, ref, alt) combination is kept.
    Duplicate values in 'Name' are not filtered.

    Parameters
    ----------
    path_snps : path of the ISOGG SNP .csv file.
    col_pos : name of the column holding the position to use.
    unique : whether to drop rows that repeat an earlier
        (pos, ref, alt) combination.

    Returns
    -------
    pandas.DataFrame sorted by position with a fresh 0..n-1 index and the
    columns Name, chrom (always "Y"), pos, ref, alt, 'Subgroup Name',
    'Alternate Names' and 'rs numbers'. Spaces and newlines in the values
    are replaced by underscores.
    """
    table = pd.read_csv(path_snps)
    print(table.columns)
    print(f"Loaded {len(table)} SNPs")

    ### Positions: whatever is not a number turns into NaN and is removed
    table["pos"] = pd.to_numeric(table[col_pos], errors="coerce")
    has_pos = table["pos"].notna()
    print(f"# Positions available: {np.sum(has_pos)}")
    snps = table.loc[has_pos].reset_index(drop=True)
    snps["pos"] = snps["pos"].astype("int")

    ### Biallelic SNPs: a four character mutation string, e.g. "G->A"
    is_biallelic = snps["Mutation Info"].str.len() == 4
    print(f"# Biallelic SNPs: {np.sum(is_biallelic)}")
    snps = snps.loc[is_biallelic].reset_index(drop=True)
    mutation = snps["Mutation Info"]
    snps = snps.assign(ref=mutation.str[0], alt=mutation.str[3], chrom="Y")

    ### Reduce to the output columns, clean whitespace, sort by position
    keep_cols = ["Name", "chrom", "pos", "ref", "alt", 
                 'Subgroup Name', 'Alternate Names', 'rs numbers']
    snps = (snps[keep_cols]
            .replace(regex=[' ','\n'], value='_')
            .sort_values(by="pos"))

    ### Ref and Alt have to be different alleles
    snps = snps[snps["ref"] != snps["alt"]]
    print(f"# Ref & Alt different: {len(snps)}")

    ### Ref and Alt have to be plain nucleotides
    nucleotides = ["A", "C", "T", "G"]
    is_nucleotide = snps["ref"].isin(nucleotides) & snps["alt"].isin(nucleotides)
    snps = snps[is_nucleotide]
    print(f"# Ref & Alt ACTG: {len(snps)}")

    ### Optionally keep one row per (pos, ref, alt)
    if unique:
        snps = snps.drop_duplicates(subset=["pos", "ref", "alt"], keep="first")
        print(f"# Unique SNP positions: {len(snps)}")

    return snps.reset_index(drop=True)


################################################
### Calling Ys

def ref_alt_count(df_ch, bases=["A", "C", "G", "T"]):
    """Add the per-row read counts of the ref and the alt allele.

    df_ch needs the columns ref and alt (allele letters) plus one count
    column per entry of `bases`. Two columns are written into df_ch
    itself: "ref#" holds the count found in the column named by the row's
    ref allele, "alt#" the count for the row's alt allele. Rows whose
    allele is not in `bases` keep the initial value 0.

    The input frame is modified in place and also returned.
    """
    for count_col in ("ref#", "alt#"):
        df_ch[count_col] = 0

    for base in bases:
        # For each allele role, copy the count of this base into the rows
        # that carry the base in that role
        for allele_col, count_col in (("ref", "ref#"), ("alt", "alt#")):
            carries_base = df_ch[allele_col] == base
            df_ch.loc[carries_base, count_col] = df_ch.loc[carries_base, base]
    return df_ch

def pulldown_bamtable(path_bam = "", o_file = "",                   
                      bamtable = "/home/pruefer/bin/BamTable",
                      path_bed = "/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps.bed"):
    """Run BamTable on a BAM file and write its standard output to o_file.

    The executed command is `<bamtable> -F -A -f <path_bed> <path_bam>`,
    with the flags passed through unchanged. The program is started
    directly from an argument list instead of through the IPython `!`
    shell escape, so the function also works outside a notebook and paths
    containing spaces or shell metacharacters are handled safely. No
    shell expansion (e.g. of `~` or wildcards) is applied to the paths.

    Parameters
    ----------
    path_bam : path of the BAM file to pull down.
    o_file : path of the output table; an existing file is overwritten.
    bamtable : path of the BamTable executable.
    path_bed : path of the .bed file with the target positions.

    Raises
    ------
    ValueError : if path_bam or o_file is empty.
    FileNotFoundError : if the executable or the output directory does
        not exist.
    RuntimeError : if the program exits with a non-zero return code, so a
        failed pulldown is not silently analysed as an empty or truncated
        table.
    """
    # Function-scope imports: the notebook's import cell does not provide subprocess
    import subprocess
    import sys

    if not path_bam or not o_file:
        raise ValueError("path_bam and o_file must both be non-empty paths.")

    cmd = [bamtable, "-F", "-A", "-f", path_bed, path_bam]
    with open(o_file, "wb") as handle:
        result = subprocess.run(cmd, stdout=handle, stderr=subprocess.PIPE)

    messages = result.stderr.decode(errors="replace")
    if result.returncode != 0:
        raise RuntimeError(
            f"{bamtable} failed with exit code {result.returncode}: {messages.strip()}")
    if messages:
        # Keep tool warnings visible, as they were with the shell escape
        print(messages, file=sys.stderr, end="")

def call_y_bam(path_bam="", df=[],
               path_bed = "/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps.bed",
               path_temp="/mnt/archgen/users/hringbauer/git/y_chrom/temp/temp.tsv"):
    """Pull down a BAM file at the target SNPs and tabulate ref/alt reads.

    Steps:
      1. pulldown_bamtable writes the per-position base counts of
         `path_bam` (restricted by `path_bed`) to `path_temp`.
      2. That table is read back as the columns chrom, pos, A, C, G, T;
         a chromosome label "chrY" is renamed to "Y".
      3. It is inner-merged with the SNP table `df` on (chrom, pos) and
         ref_alt_count adds the columns "ref#" and "alt#".
      4. Loci with more alt than ref reads are treated as derived.

    The printed coverage statistics are computed over all rows of the
    pulldown table and use len(df) as the denominator.

    Parameters
    ----------
    path_bam : path of the BAM file.
    df : SNP table with at least chrom, pos, ref, alt and 'Subgroup Name'.
    path_bed : .bed file handed to the pulldown.
    path_temp : file that receives the intermediate pulldown table; it is
        overwritten on every call.

    Returns
    -------
    (df_ch, df_der) : the merged table of all loci, and the derived loci
    sorted by 'Subgroup Name' with a fresh index.
    """
    base_cols = ["A", "C", "G", "T"]

    ### Create the Pulldown and read it back
    pulldown_bamtable(path_bam = path_bam,
                      path_bed = path_bed,
                      o_file = path_temp)
    pile = pd.read_csv(path_temp, sep="\t", header=None)
    pile.columns = ["chrom", "pos"] + base_cols

    ### Harmonise the chromosome label with the SNP table
    is_chr_y = pile["chrom"] == "chrY"
    if np.sum(is_chr_y) > 0:
        print(f"Changing {np.sum(is_chr_y)} ChrY -> Y")
        pile.loc[is_chr_y, "chrom"] = "Y"

    merged = pd.merge(df, pile, on=["chrom", "pos"])

    ### Coverage Statistics
    depth = np.sum(pile[base_cols].values, axis=1)
    n_covered = np.sum(depth > 0)
    print(f"Average Coverage: {np.sum(depth)/len(df):.4f}x")
    print(f"#Sites covered: {n_covered}/{len(df)}")

    ### Establish Ref and Alt allele counts
    df_ch = ref_alt_count(merged, bases=base_cols)

    ### Identify Derived: more reads for alt than for ref
    is_derived = df_ch["alt#"] > df_ch["ref#"]
    print(f"#Derived Loci: \n{np.sum(is_derived)} / {n_covered} covered>0")

    df_der = df_ch[is_derived].sort_values(by="Subgroup Name")
    df_der = df_der.reset_index(drop=True).copy()

    return df_ch, df_der

def mismatch_path(s, df):
    """Collect the loci on the naming path leading to haplogroup s.

    The path consists of s and every shorter non-empty prefix of s, each
    both as written and with a trailing "~". Rows of df whose
    'Subgroup Name' is one of these labels are selected. A row counts as
    a mismatch when its ref# is at least as large as its alt#; the number
    of mismatches and of selected rows is printed.

    Returns the selected rows of df.
    """
    lineage = [s] + [s[:-cut] for cut in range(1, len(s))]
    labels = [label for name in lineage for label in (name, name + "~")]

    on_path = df.loc[df["Subgroup Name"].isin(labels)]
    n_mismatch = np.sum(on_path["ref#"] >= on_path["alt#"])
    print(f"Mismatches: {n_mismatch} / {len(on_path)}")
    return on_path

# Create a bed file for ISOGG SNPs [one time requirement]

In [200]:
# Build the filtered ISOGG SNP table (one row per unique position/ref/alt)
df = load_snp_file_ISOGG("./data/all_snps.csv")

savepath = "./data/isogg_snps.bed"

# Three tab-separated columns without a header row: chrom, pos, pos.
# NOTE(review): start and end are both the SNP position taken from the ISOGG table,
# whereas standard BED is 0-based half-open - confirm this is what BamTable expects
# before reusing the file with other tools.
dft = df[["chrom", "pos"]].copy()
dft["pos1"] = dft["pos"]
dft.to_csv(savepath, sep="\t", index=False, header=None)
print(f"Saved {len(dft)} ISOGG SNPs to {savepath}")

Index(['Name', 'Subgroup Name', 'Alternate Names', 'rs numbers',
       'Build 37 Number', 'Build 38 Number', 'Mutation Info'],
      dtype='object')
Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 73148
Saved 73148 ISOGG SNPs to ./data/isogg_snps.bed


# Load the BAM path [from Autorun]
Can use this dictionary to look up bam files of individuals

In [179]:
dft = pd.read_csv("/mnt/archgen/users/hringbauer/git/auto_popgen/output/v0.3/bam_paths.tsv", sep="\t")
n = np.sum(dft["bam#"]>0)
bam_dict = dict(zip(dft["iid"], dft["bam_path"]))
print(f"Loaded {len(dft)} Individuals. With BAM: {n}")

Loaded 16333 Individuals. With BAM: 12885


In [181]:
bam_dict["DBH001"]

'/mnt/archgen/Autorun_eager/eager_outputs/TF/DBH/DBH001/merged_bams/initial/DBH001_ss_udgnone_libmerged_rmdup.bam'

### Prepare the SNP list

In [201]:
df = load_snp_file_ISOGG("./data/all_snps.csv")

Index(['Name', 'Subgroup Name', 'Alternate Names', 'rs numbers',
       'Build 37 Number', 'Build 38 Number', 'Mutation Info'],
      dtype='object')
Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 73148


### Run the Y haplogroup calling

In [322]:
%%time
path_bam = bam_dict["PTN003"]

df_ch, df_der = call_y_bam(df=df, 
                           path_bam=path_bam) #A55903 and A55904

Average Coverage: 0.4226x
#Sites covered: 18146/73148
#Derived Loci: 
645 / 18146 covered>0
CPU times: user 67.8 ms, sys: 25.9 ms, total: 93.7 ms
Wall time: 979 ms


In [203]:
"J2a1a1b3"

'/mnt/archgen/Autorun_eager/eager_outputs/TF/ABU/ABU006/merged_bams/initial/ABU006_ss_udgnone_libmerged_rmdup.bam'

In [323]:
df_der[-50:]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
595,PF6452,Y,14136291,G,A,R1b1a1b,YSC0000167,,1,0,0,0,0,1
596,M11805,Y,13945593,A,T,R1b1a1b,PF6447,,0,0,0,1,0,1
597,PF6444,Y,13816025,G,A,R1b1a1b,,,1,0,0,0,0,1
598,YSC0000248,Y,13657777,T,C,R1b1a1b,L777,,0,1,0,0,0,1
599,PF6434,Y,8411202,A,G,R1b1a1b,,,0,0,1,0,0,1
600,PF6438,Y,9464078,C,T,R1b1a1b,,,0,0,0,2,0,2
601,PF6527,Y,24394612,G,A,R1b1a1b,,,1,0,0,0,0,1
602,PF6525,Y,23476936,G,T,R1b1a1b,,,0,0,0,2,0,2
603,L1351,Y,24444622,C,T,R1b1a1b,PF6528;_YSC0000240,,0,0,0,4,0,4
604,PF6430,Y,8070532,T,A,R1b1a1b,,,2,0,0,0,0,2


In [216]:
s = "J2a1a1a"
mismatch_path(s, df_ch)

Mismatches: 2 / 9


Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
101,CTS585,Y,6864895,G,C,J2a,,,0,1,0,0,0,1
280,PF4953,Y,7680253,C,G,J2a,,,0,1,0,0,1,0
414,F4095,Y,8351025,G,A,J2a,PF4897,,1,0,0,0,0,1
573,PF5125,Y,9089648,C,G,J2a1a1a,,,0,0,1,0,0,1
869,PF4908,Y,14969634,T,G,J2,M172;_Page28,,0,0,1,0,0,1
1282,PF4568,Y,17637446,T,C,J,CTS7738,,0,0,0,1,1,0
1588,PF5105,Y,20836109,A,G,J2a1,,,0,0,1,0,0,1
1643,PF4983,Y,21318263,T,A,J2a,,,1,0,0,0,0,1
1706,L559,Y,21674327,A,G,J2a,PF4986,,0,0,1,0,0,1


In [210]:
df_ch[df_ch["Subgroup Name"]=="J2a"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
101,CTS585,Y,6864895,G,C,J2a,,,0,1,0,0,0,1
280,PF4953,Y,7680253,C,G,J2a,,,0,1,0,0,1,0
414,F4095,Y,8351025,G,A,J2a,PF4897,,1,0,0,0,0,1
1643,PF4983,Y,21318263,T,A,J2a,,,1,0,0,0,0,1
1706,L559,Y,21674327,A,G,J2a,PF4986,,0,0,1,0,0,1


In [217]:
df_der[df_der["Subgroup Name"]=="R1b1a1b1a1a1c2f~"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#


# 1) Run Pulldown on Twist test

In [105]:
%%time

iid = "A54927"
#path_bam = f'/mnt/archgen/users/hringbauer/data/twist_test/dedup.q25l35/postdd.twist.c2/{iid}.bam'
path_bam = f'/mnt/archgen/users/hringbauer/data/twist_test/dedup.q25l35/postdd.1240k/{iid}.bam'
!samtools index $path_bam

df_ch, df_der = call_y_bam(df=df, 
                           path_bam=path_bam) #A55903 and A55904

Average Coverage: 0.2718x
#Sites covered: 11862/73148
#Derived Loci: 
412 / 11862 covered>0
CPU times: user 100 ms, sys: 52.3 ms, total: 153 ms
Wall time: 3.82 s


In [None]:
df_der[-50:]

In [106]:
df_t = df_ch[df_ch["Subgroup Name"].str[0]=="G"]
refs = np.sum(df_t["ref#"])
alts = np.sum(df_t["alt#"])

frac = alts / (refs + alts)
print(f"Error Rate: {frac:.6f} ")

Error Rate: 0.015252 


# 2) Run Brienzi aDNA
Here: Use the Brienzi Y capture

In [12]:
%%time
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/hringbauer/data/brienziYcapture/A55903.bam") #A55903 and A55904

Average Coverage: 15.7615x
#Sites covered: 60398/73148
#Derived Loci: 
1049 / 60398 covered>0
CPU times: user 243 ms, sys: 59.2 ms, total: 302 ms
Wall time: 6.62 s


In [None]:
### Browse the results - update the output index to "browse". The derived SNPs at the end are the most interesting
df_der[-50:]

In [154]:
%%time
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/hringbauer/data/brienziYcapture/A55904.bam")

Average Coverage: 13.0144x
#Sites covered: 58128/73148
#Derived Loci: 
998 / 58128 covered>0
CPU times: user 210 ms, sys: 58.7 ms, total: 269 ms
Wall time: 5.79 s


In [None]:
df_der[-50:]

In [68]:
df_ch[df_ch["Subgroup Name"]=="G2a2b2a1a1c1a1"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
617,Z3388,Y,18396357,C,T,G2a2b2a1a1c1a1,,rs936457912,0,0,0,1,0,1


# Run Single Target Individual
Fill in the individual of your choice

In [316]:
iid = "THE006"
bam_dict[iid]

'/mnt/archgen/Autorun_eager/eager_outputs/TF/THE/THE006/merged_bams/initial/THE006_ss_udgnone_libmerged_rmdup.bam'

In [None]:
%%time
df_ch, df_der = call_y_bam(df=df, 
                           path_bam=f'/mnt/archgen/Autorun_eager/eager_outputs/TF/DGB/{iid}/trimmed_bam/{iid}_ss.A0101.trimmed.bam',
                           path_temp='/mnt/archgen/users/hringbauer/git/y_chrom/temp/temp.tsv')

In [83]:
df_der[-50:]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
0,M9140,Y,15514552,T,C,BT,,,0,1,0,0,0,1
1,PF328,Y,9158586,G,A,CT,,,1,0,0,0,0,1
2,M5656,Y,14207088,C,A,CT,,,1,0,0,0,0,1
3,M5736,Y,17894575,C,T,CT,CTS8243,,0,0,0,1,0,1
4,M5769,Y,19407727,C,G,CT,,,0,0,1,0,0,1
5,Z39188,Y,21440810,G,A,O1a1b,,,1,0,0,0,0,1


In [80]:
df_ch[df_ch["Subgroup Name"]=="G2a2b2a"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
51,PF3328,Y,6744902,C,T,G2a2b2a,CTS424,rs771484808,0,0,0,1,0,1
116,CTS946,Y,7100848,A,G,G2a2b2a,,rs761514061,0,0,1,0,0,1
1414,P303,Y,21645348,T,C,G2a2b2a,Page108;_PF3340;_S135;_Z765,rs72625365,0,1,0,0,0,1


In [24]:
df_ch[df_ch["Subgroup Name"].isin(["CT", "I","I1"])].sort_values(by="Subgroup Name")

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
96,M5601,Y,7533511,G,A,CT,,,1,0,0,0,0,1
218,M5633,Y,8543804,G,A,CT,,,1,0,0,0,0,1
297,Z17706,Y,9989244,G,T,CT,,,0,0,0,1,0,1
369,CTS2077,Y,14172842,T,G,CT,M5653,,0,0,1,0,0,1
460,L1028,Y,15615637,G,C,CT,CTS4368;_M5680,,0,1,0,0,0,1
597,M5728,Y,17750457,C,T,CT,CTS7933,,0,0,0,1,0,1
685,M5763,Y,19059200,C,A,CT,CTS9760,,1,0,0,0,0,1
735,PF3794,Y,21067903,C,T,I,,,0,0,0,1,0,1
356,CTS1805,Y,14076608,A,G,I1,Z2752,,0,0,1,0,0,1
650,Z2823,Y,18387563,G,T,I1,,,0,0,0,1,0,1


## Run Target .bam

In [51]:
%%time
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/skourtanioti/MHHAM_varia/Twist_eval_0723/eager_output/eager_output/trimmed_bam/APO037.A0102_2round.trimmed.bam",
                           path_temp='/mnt/archgen/users/hringbauer/git/y_chrom/temp/temp.tsv')

Average Coverage: 0.0804x
#Sites covered: 5300/73148
#Derived Loci: 
209 / 5300 covered>0
CPU times: user 70.3 ms, sys: 12.5 ms, total: 82.8 ms
Wall time: 1.1 s


In [58]:
dft = df_ch[df_ch["Subgroup Name"].str.contains("I2")]

der = np.sum(dft["alt#"])
ref = np.sum(dft["ref#"])
print((ref,der))
print(f"Error rate: {der/(ref+der)*100:.4g}%")

(454, 12)
Error rate: 2.575%


In [None]:
df_ch[df_ch["Subgroup Name"]=="G2a2b2a1"]

In [None]:
df_der[-100:-50]

In [42]:
%%time
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/skourtanioti/MHHAM_varia/Twist_eval_0723/eager_output/eager_output/trimmed_bam/NEV014.A0301_2round.trimmed.bam",
                           path_temp='/mnt/archgen/users/hringbauer/git/y_chrom/temp/temp.tsv')

Average Coverage: 0.0006x
#Sites covered: 37/73148
#Derived Loci: 
2 / 37 covered>0
CPU times: user 51.9 ms, sys: 21.1 ms, total: 73 ms
Wall time: 1.18 s


In [82]:
%%time
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/skourtanioti/MHHAM_varia/Twist_eval_0723/eager_output/eager_output/trimmed_bam/THE006.A0202_2round.trimmed.bam",
                           path_temp='/mnt/archgen/users/hringbauer/git/y_chrom/temp/temp.tsv')

Average Coverage: 0.1220x
#Sites covered: 7999/73148
#Derived Loci: 
215 / 7999 covered>0
CPU times: user 91 ms, sys: 0 ns, total: 91 ms
Wall time: 1.2 s


In [100]:
df_der.to_csv("/mnt/archgen/users/hringbauer/for_nada/y_snps_derived_twist_THE006.tsv", sep="\t")
df_ch.to_csv("/mnt/archgen/users/hringbauer/for_nada/y_snps_all_twist_THE006.tsv", sep="\t")

In [96]:
dft = df_ch[df_ch["Subgroup Name"]=="J1a2a1a2~"]
dft

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
1760,Z2346,Y,8416786,G,A,J1a2a1a2~,PF4663,,1,0,0,0,0,1
2757,PF4839,Y,14045838,C,T,J1a2a1a2~,AM01305;_YSC0000164,,0,0,0,1,0,1
3809,YSC0000181,Y,15756763,T,C,J1a2a1a2~,AM01313;_PF4840,,0,1,0,0,0,1
4237,AM01319,Y,16483304,G,A,J1a2a1a2~,PF4812;_YSC0000188;_Z2362,,1,0,0,0,0,1


In [39]:
dft = df_ch[df_ch["Subgroup Name"].str.contains("E")]

der = np.sum(dft["alt#"])
ref = np.sum(dft["ref#"])
print((ref,der))
print(f"Error rate: {der/(ref+der)*100:.4g}%")

(945, 11)
Error rate: 1.151%


In [None]:
idx = dft["alt#"]>0
dft[idx]

### Run Koba individuals (Review)

In [44]:
nbrs = [2,7,8,9,11,40]  ### The numbers of Koba Individuals
files = [f"{n}al.bam" for n in nbrs]
folder = "/mnt/archgen/users/hringbauer/data/koba_review/"
paths_bam = [os.path.join(folder, f) for f in files]
paths_bam

['/mnt/archgen/users/hringbauer/data/koba_review/2al.bam',
 '/mnt/archgen/users/hringbauer/data/koba_review/7al.bam',
 '/mnt/archgen/users/hringbauer/data/koba_review/8al.bam',
 '/mnt/archgen/users/hringbauer/data/koba_review/9al.bam',
 '/mnt/archgen/users/hringbauer/data/koba_review/11al.bam',
 '/mnt/archgen/users/hringbauer/data/koba_review/40al.bam']

In [49]:
df = load_snp_file_ISOGG("./data/all_snps.csv")

Index(['Name', 'Subgroup Name', 'Alternate Names', 'rs numbers',
       'Build 37 Number', 'Build 38 Number', 'Mutation Info'],
      dtype='object')
Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 73148


In [76]:
%%time
df_ch, df_der = call_y_bam(df=df,
                           path_bed='/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps_chrY.bed',
                           path_bam=paths_bam[2])

Changing 592 ChrY -> Y
Average Coverage: 0.0088x
#Sites covered: 592/73148
#Derived Loci: 
11 / 592 covered>0
CPU times: user 62.4 ms, sys: 29.3 ms, total: 91.7 ms
Wall time: 942 ms


In [77]:
df_der[-50:]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
0,A8852,Y,22996423,T,A,A000-T,,,1,0,0,0,0,1
1,Y8300,Y,23446521,A,G,A1b,,rs764791635,0,0,1,0,0,1
2,M9151,Y,15794075,C,A,BT,,,1,0,0,0,0,1
3,M9188,Y,16946901,G,A,BT,,,1,0,0,0,0,1
4,M9223,Y,17623760,G,A,BT,,,1,0,0,0,0,1
5,M9327,Y,21492793,T,A,BT,,,1,0,0,0,0,1
6,M5631,Y,8396636,G,A,CT,PF292,,1,0,0,0,0,1
7,PF1551,Y,8430640,G,A,E1b1~,M5415,,1,0,0,0,0,1
8,BY165067,Y,16586488,G,C,G2a2b2a1a1b1a1a2a1b2a3a2,,,0,1,0,0,0,1
9,S6601,Y,8839295,G,A,I2a2,,,1,0,0,0,0,1


### Test Malta Individual

In [31]:
%%time
df_ch, df_der = call_y_bam(df=df, 
                           path_bam="/mnt/archgen/users/hringbauer/data/malta/X5.bam")

Average Coverage: 0.6477x
#Sites covered: 33545/73148
#Derived Loci: 
539 / 33545 covered>0
CPU times: user 81.5 ms, sys: 42.8 ms, total: 124 ms
Wall time: 1.42 s


In [32]:
df_ch

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
0,FGC17344,Y,2650033,G,A,R1b1a1b1a1a1c2b2a1b1c,,,0,0,3,0,3,0
1,MF2464,Y,2650045,A,G,O2a1b1a1a1a1e2a2b,,,3,0,0,0,3,0
2,Z21583^,Y,2650102,C,A,E1b1a1a1a2a2a2,,,0,1,0,0,1,0
3,Z57,Y,2650701,G,A,R1b1a1b1a1a2b1c1a1a,S1468,,0,0,1,0,1,0
4,Y33228,Y,2650709,C,T,G2a2b1b1a2,,,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33625,Z16860,Y,28801992,G,A,R1b1a1b1a1a2c1a2a3a,,,0,0,1,0,1,0
33626,A13718.2^^,Y,28804165,C,A,G2a2b2a1a1b1a1a2a1a1b1a1~,,rs201730753,0,1,0,0,1,0
33627,ZW09,Y,28804948,A,T,R1b1a1b1a1a2c1a2b1a1a2,,,1,0,0,0,1,0
33628,Z16041,Y,28804953,G,A,E1b1a1a1a1c2b1,,,0,0,1,0,1,0


In [None]:
df_der["Subgroup Name"].str[:3].value_counts()

## Analysis: Browse the output Tables

In [None]:
df_der1[-150:-100]

In [None]:
df_t = df_ch1[df_ch1["Subgroup Name"]=="G2a1"].sort_values(by="Subgroup Name")[:40]
df_t

In [208]:
df_t[df_t["alt#"]>0]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
63388,P312,Y,22157311,C,A,R1b1a1b1a1a2,PF6547;_S116,,51,0,0,0,0,51
63951,BY188,Y,22474043,G,T,R1b1a1b1a1a2c1a1a1a1a1~,,,0,0,7,3,7,3
16892,PF1557.2,Y,8631875,C,A,R1b1a1b1a1a2c1a4b8~,,,1,6,0,0,6,1
38535,BY23092,Y,16635363,G,A,R1b1a1b1a1a2c1a5c3b1b2,,,1,0,13,0,13,1


In [269]:
df_ch[df_ch['Name']=="P312"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
51049,P312,Y,22157311,C,A,R1b1a1b1a1a2,PF6547;_S116,,82,0,0,0,0,82


In [29]:
df_ch[df_ch['Subgroup Name']=="R"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
18,PF5992,Y,2810583,A,G,R,CTS207;_M600,,0,0,1,0,0,1
378,CTS3622,Y,15078469,C,G,R,PF6037,,0,0,1,0,0,1
412,FGC1168,Y,15667208,G,C,R,,,0,1,0,0,0,1
548,CTS7876,Y,17722802,G,A,R,PF6052,,1,0,0,0,0,1


In [243]:
df_t.columns

Index(['Name', 'chrom', 'pos', 'ref', 'alt', 'Subgroup Name',
       'Alternate Names', 'rs numbers', 'A', 'C', 'G', 'T', 'ref#', 'alt#'],
      dtype='object')

### Extra: Test a Malta Individual as known case
According to publication: H2

In [172]:
%%time
df_ch, df_der = call_y_bam(path_bam="/mnt/archgen/users/hringbauer/data/malta/X5.bam", df=df)

Average Depth: 0.7174x
Derived Read Loci: 588/73148
CPU times: user 40.6 s, sys: 704 ms, total: 41.3 s
Wall time: 41.3 s


In [None]:
# idx_der only exists inside call_y_bam; df_der is the table of derived loci
# that the function returns, already sorted by "Subgroup Name".
df_der[:50]

### Test One Other Malta Individual

In [161]:
# The SNP table has to be passed explicitly: the default df=[] cannot be merged
# with the pulldown table.
df_ch, df_der = call_y_bam(df=df,
                           path_bam="/mnt/archgen/users/hringbauer/data/malta/X9.bam")

Average Depth: 4.0776x
Derived Read Loci: 1208/73148


In [None]:
df_der[-100:-50]

# Run Berlin samples

In [308]:
%%time
iid = "BEP003"
df_ch, df_der = call_y_bam(df=df, 
                           path_bam=f"/mnt/archgen/MICROSCOPE/eager_outputs/2023-08-21-berlin/trimmed_bam/{iid}_ss.A0101.SG1.trimmed.bam")

Average Coverage: 0.0401x
#Sites covered: 2859/73148
#Derived Loci: 
83 / 2859 covered>0
CPU times: user 43.4 ms, sys: 35.8 ms, total: 79.2 ms
Wall time: 882 ms


In [311]:
25/2859

0.008744316194473592

In [309]:
df_der[-50:]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#
33,Z40382,Y,13862470,G,A,BT,,,1,0,0,0,0,1
34,Y8325,Y,23190598,A,T,BT,,,0,0,0,1,0,1
35,M9037,Y,8691275,A,G,BT,,,0,0,1,0,0,1
36,Z32608,Y,16730065,C,T,C1b1a2a,,,0,0,0,1,0,1
37,Y25230,Y,16518148,A,G,C1b2a1c,,rs367589417,0,0,1,0,0,1
38,Z30593,Y,23165402,G,A,C2a1a1a,,,1,0,0,0,0,1
39,Y4542,Y,8514733,G,A,C2a1a3,FGC16329,,1,0,0,0,0,1
40,BY122771,Y,18639201,G,A,C2a1b,ACT1978,,1,0,0,0,0,1
41,M5709,Y,16887784,G,C,CT,,,0,1,0,0,0,1
42,M5826,Y,24470911,C,A,CT,,,1,0,0,0,0,1


In [303]:
%%time
iid = "BEP016"
df_ch, df_der = call_y_bam(df=df, 
                           path_bam=f"/mnt/archgen/MICROSCOPE/eager_outputs/2023-08-21-berlin/trimmed_bam/{iid}_ss.A0101.SG1.trimmed.bam")

Average Coverage: 0.0000x
#Sites covered: 1/73148
#Derived Loci: 
1 / 1 covered>0
CPU times: user 51.3 ms, sys: 23.6 ms, total: 74.9 ms
Wall time: 1.02 s


In [None]:
df_der[-10:]

In [290]:
df_ch[df_ch["Subgroup Name"]=="R"]

Unnamed: 0,Name,chrom,pos,ref,alt,Subgroup Name,Alternate Names,rs numbers,A,C,G,T,ref#,alt#


# Area 51

In [13]:
!samtools view /mnt/archgen/users/hringbauer/data/brienzi_capture/A55903.bam 1:33000000-34000000 | wc -l

3052


In [None]:
path_bed = "/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps.bed"
# The .bed file is written without a header row, so it is read with header=None;
# otherwise the first SNP would be consumed as column names. A separate name is
# used so the ISOGG SNP table in `df` is not overwritten.
df_bed = pd.read_csv(path_bed, sep="\t", header=None, names=["chrom", "start", "end"])

# Redo ISOGG .bed file

In [23]:
# Write a copy of the ISOGG .bed file whose chromosome column reads "chrY" instead of "Y"
path_bed = "/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps.bed"
path_bed_new = "/mnt/archgen/users/hringbauer/git/y_chrom/data/isogg_snps_chrY.bed"

# NOTE(review): this reuses the global name `df`, which the call_y_bam cells use for
# the ISOGG SNP table - reload that table before running those cells again.
df = pd.read_csv(path_bed, sep= "\t", header=None)
df[0] = "chrY"  # Column 0 is the chromosome label
df.to_csv(path_bed_new, sep="\t", header=None, index=False)

In [None]:
df