In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

### For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
### Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/y_chrom/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-17-122.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/y_chrom
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


### Load all ISOGG Y SNPs

In [2]:
path_snps = "./data/all_snps.csv"
col_pos = 'Build 37 Number'

def create_df_y_all(path_snps = "./data/all_snps.csv", 
                    col_pos = 'Build 37 Number'):
    """Return Dataframe in Eigenstrat Format,
    filtered for biallelic SNPs"""
    df_raw = pd.read_csv(path_snps)
    print(f"Loaded {len(df_raw)} SNPs")

    ### Process the positions
    pos = df_raw[col_pos]
    df_raw["pos"] = pd.to_numeric(pos, errors="coerce")

    idx = ~df_raw["pos"].isna()
    print(f"# Positions available: {np.sum(idx)}")
    df = df_raw[idx].reset_index(drop=True)
    df["pos"]=df["pos"].astype("int")

    idx_bi= (df["Mutation Info"].str.len()==4)
    print(f"# Biallelic SNPs: {np.sum(idx_bi)}")
    df = df[idx_bi].reset_index(drop=True)
    df["ref"] = df["Mutation Info"].str[0]
    df["alt"] = df["Mutation Info"].str[3]
    df["map"] = 0
    df["chrom"] = 24

    cols = ["Name", "chrom", "map", "pos", "ref", "alt"]
    df = df[cols]
    df = df.replace(regex=[' ','\n'], value='_')
    ### Sort by position
    df = df.sort_values(by="pos")
    
    ### Keep only SNPs where Ref and Alt Different
    idx_same = (df["ref"]==df["alt"])
    df = df[~idx_same]
    print(f"# Ref & Alt different: {len(df)}")
    
    ### Keep only ACTG
    snps_acceptable = ["A", "C", "T", "G"]
    idx_ref = df["ref"].isin(snps_acceptable)
    idx_alt = df["alt"].isin(snps_acceptable)
    idx_both = idx_ref & idx_alt
    df = df[idx_both]
    print(f"# Ref & Alt ACTG: {len(df)}")
    
    ### Keep Unique Values
    idx_dup = df.duplicated(subset="pos", keep="first")
    df = df[~idx_dup]
    print(f"# Unique SNP positions: {len(df)}")
    
    ### Remove duplicate Names
    idx_dup = df.duplicated(subset="Name", keep=False)
    df = df[~idx_dup]
    print(f"# Unique Names: {len(df)}")
    return df


def save_eigenstrat(df_save, path_save = "./data/eigenstrat/y_snps_all2020.snp"):
    """Save Eigenstrat Dataframe"""
    assert(len(df_save.columns)==6)
    df_save.to_csv(path_save, sep=" ", header=None, index=False)
    print(f"Saved {len(df_save)} Y SNPs to {path_save}")

In [3]:
%%time
df_save = create_df_y_all(path_snps = "./data/all_snps.csv", col_pos = 'Build 37 Number')

Loaded 92035 SNPs
# Positions available: 91881
# Biallelic SNPs: 91814
# Ref & Alt different: 91811
# Ref & Alt ACTG: 91806
# Unique SNP positions: 72951
# Unique Names: 72933
CPU times: user 1.55 s, sys: 54.3 ms, total: 1.61 s
Wall time: 1.6 s


### Save Eigenstrat

In [25]:
save_eigenstrat(df_save, path_save = "./data/eigenstrat/y_snps_all2020.snp")

Saved 72933 Y SNPs to ./data/eigenstrat/y_snps_all2020.snp


# Add in in the special O SNPs

In [29]:
def load_om117(path):
    """Return Dataframe with O-M117 Data"""
    df_t = pd.read_csv(path, index_col=False, sep=",")
    l = len(df_t)
    df_t = df_t.rename(columns = {'POS':'pos'})  # For merging
    df_t = df_t[pd.to_numeric(df_t['pos'], errors='coerce').notnull()]
    df_t['pos'] = df_t['pos'].apply(pd.to_numeric)
    print("Drop from %i to %i numerical pos rows" % (l,len(df_t)))
    return df_t

def create_snp_df(df):
    """Creates a eigenstrat SNP dataframe
    from existing Y dataframe (formatted)"""    
    dct = {"Name":df["SNP"].values,
          "chrom":24,
          "map":0,
          "pos":df["pos"].values,
          "ref":df["REF"].values,
          "alt":df["ALT"].values}
    df1 = pd.DataFrame(dct)
    return df1

def merge_es_ds(df1,df2):
    """Merge two eigenstrat dataframes.
    Return, sorted (by pos), unique (pos) dataframe"""
    df_m = pd.concat([df1, df2])
    l=len(df_m)
    df_m = df_m.drop_duplicates('pos').reset_index(drop=True)
    print(f"Merged to {l} rows.\n{len(df_m)} unique row")
    df_m = df_m.sort_values(by="pos") # Sort by position
    return df_m

In [30]:
### Load O SNPs
df117 = load_om117("../tibet_aDNA/Data/o2-m117SNPs.csv")
df117_es = create_snp_df(df117) # Transform into eigenstrat format
#pd.merge(df117_es, df_save, on="pos") # For sanity check (check REF/ALTs)

Drop from 984 to 984 numerical pos rows


In [42]:
df_save2 = merge_es_ds(df_save, df117_es)
df_save2 = df_save2.replace(np.nan, 'NAN', regex=True)

Merged to 73917 rows.
73680 unique row


In [44]:
### Save the full SNP dataframe
save_eigenstrat(df_save2, path_save = "./data/eigenstrat/y_snps_all2020_vO.snp")

Saved 73680 Y SNPs to ./data/eigenstrat/y_snps_all2020_vO.snp


# Area 51

### Load typical SNP file

In [34]:
path_snp = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic0.v43.snp"
df_snp0 = pd.read_csv(path_snp, delim_whitespace=True, header=None)
df_snp0.columns = ["snp", "chr", "map", "pos", "ref", "alt"]
df_1240k = df_snp0[df_snp0["chr"]==24]

In [36]:
df_1240k.head(2)

Unnamed: 0,snp,chr,map,pos,ref,alt
1200343,M288,24,0.0,2649694,G,T
1200344,M236,24,0.0,2649696,C,G


### Load File Produce here

In [37]:
path_snp = "/n/groups/reich/hringbauer/git/y_chrom/data/eigenstrat/y_snps_all2020_vO.snp"
df_snp = pd.read_csv(path_snp, delim_whitespace=True, header=None)
df_snp.columns = ["snp", "chr", "map", "pos", "ref", "alt"]

In [38]:
df_snp

Unnamed: 0,snp,chr,map,pos,ref,alt
0,M288,24,0,2649694,G,T
1,M236,24,0,2649696,C,G
2,FGC17344,24,0,2650033,G,A
3,MF2464,24,0,2650045,A,G
4,Z21583^,24,0,2650102,C,A
...,...,...,...,...,...,...
73675,BY151778^^,24,0,28804842,G,T
73676,ZW09,24,0,28804948,A,T
73677,Z16041,24,0,28804953,G,A
73678,FT213868^^,24,0,28807842,T,C


In [14]:
idx = df_snp["snp"].str.contains("A10158")
df_snp[idx]

Unnamed: 0,snp,chr,map,pos,ref,alt
18477,A10158,24,0,8891051,C,T


In [27]:
len(df_snp)

72933

In [178]:
df_merge = pd.merge(df_snp, df_snp0, on="pos")

In [179]:
len(df_merge)

15302