In [136]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

### For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
### Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/y_chrom/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-166.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/y_chrom
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


### Load all ISOGG Y SNPs

In [143]:
path_snps = "./data/all_snps.csv"
col_pos = 'Build 37 Number'

def create_df_y_all(path_snps = "./data/all_snps.csv", 
                    col_pos = 'Build 37 Number'):
    """Return Dataframe in Eigenstrat Format,
    filtered for biallelic SNPs"""
    df_raw = pd.read_csv(path_snps)
    print(f"Loaded {len(df_raw)} SNPs")

    ### Process the positions
    pos = df_raw[col_pos]
    df_raw["pos"] = pd.to_numeric(pos, errors="coerce")

    idx = ~df_raw["pos"].isna()
    print(f"# Positions available: {np.sum(idx)}")
    df = df_raw[idx].reset_index(drop=True)
    df["pos"]=df["pos"].astype("int")

    idx_bi= (df["Mutation Info"].str.len()==4)
    print(f"# Biallelic SNPs: {np.sum(idx_bi)}")
    df = df[idx_bi].reset_index(drop=True)
    df["ref"] = df["Mutation Info"].str[0]
    df["alt"] = df["Mutation Info"].str[3]
    df["map"] = 0
    df["chrom"] = 24

    cols = ["Name", "chrom", "map", "pos", "ref", "alt"]
    df = df[cols]
    df = df.replace(regex=[' ','\n'], value='_')
    ### Sort by position
    df = df.sort_values(by="pos")
    
    ### Keep only SNPs where Ref and Alt Different
    idx_same = (df["ref"]==df["alt"])
    df = df[~idx_same]
    print(f"# Ref & Alt different: {len(df)}")
    
    ### Keep only ACTG
    snps_acceptable = ["A", "C", "T", "G"]
    idx_ref = df["ref"].isin(snps_acceptable)
    idx_alt = df["alt"].isin(snps_acceptable)
    idx_both = idx_ref & idx_alt
    df = df[idx_both]
    print(f"# Ref & Alt ACTG: {len(df)}")
    
    ### Keep Unique Values
    idx_dup = df.duplicated(subset="pos", keep="first")
    df = df[~idx_dup]
    print(f"# Unique SNP positions: {len(df)}")
    return df


def save_eigenstrat(df_save, path_save = "./data/eigenstrat/y_snps_all2020.snp"):
    """Save Eigenstrat Dataframe"""
    assert(len(df_save.columns)==6)
    df_save.to_csv(path_save, sep=" ", header=None, index=False)
    print(f"Saved {len(df_save)} Y SNPs to {path_save}")

In [171]:
%%time
df_save = create_df_y_all(path_snps = "./data/all_snps.csv", col_pos = 'Build 37 Number')

Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 72951
CPU times: user 1.46 s, sys: 13.5 ms, total: 1.47 s
Wall time: 1.47 s


### Save Eigenstrat

In [172]:
save_eigenstrat(df_save, path_save = "./data/eigenstrat/y_snps_all2020.snp")

Saved 72951 Y SNPs to ./data/eigenstrat/y_snps_all2020.snp


# Area 51

### Load typical SNP file

In [173]:
path_snp = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic0.v43.snp"
df_snp0 = pd.read_csv(path_snp, delim_whitespace=True, header=None)
df_snp0.columns = ["snp", "chr", "map", "pos", "ref", "alt"]

In [174]:
df_snp0.tail(2)

Unnamed: 0,snp,chr,map,pos,ref,alt
1233011,rs55686319,24,0.0,59033139,T,C
1233012,rs75089321,24,0.0,59033249,G,T


### Load typical Eigenstrat File

In [175]:
path_snp = "/n/groups/reich/hringbauer/git/y_chrom/data/eigenstrat/y_snps_all2020.snp"
df_snp = pd.read_csv(path_snp, delim_whitespace=True, header=None)
df_snp.columns = ["snp", "chr", "map", "pos", "ref", "alt"]

In [176]:
idx = df_snp["snp"].str.contains("Z18345")
df_snp[idx]

Unnamed: 0,snp,chr,map,pos,ref,alt


In [177]:
len(df_snp)

72951

In [178]:
df_merge = pd.merge(df_snp, df_snp0, on="pos")

In [179]:
len(df_merge)

15302