# Prepare SNP sets for GEDmatch

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import socket as socket
import os as os
import sys as sys
import h5py
import matplotlib.cm as cm
import itertools as it
import multiprocessing as mp

# Identify the machine this notebook is running on.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: only hosts whose name starts with "bionc" are supported.
if not socket_name.startswith("bionc"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("Leipzig Cluster detected!")
path = "/mnt/archgen/users/hringbauer/brienzi/"

# Work from the project folder, then report the working directory and CPU count.
os.chdir(path)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

bionc21
Leipzig Cluster detected!
/mnt/archgen/users/hringbauer/brienzi
CPU Count: 40


In [74]:
def extract_23andme_snps(path_23, savepath=""):
    """Load the SNP table of a 23andme raw-data file and optionally save it.

    Lines starting with '#' are skipped; the remaining tab-separated rows
    are read without a header and labelled rsid, chr, pos, gt.

    path_23: 23andme .txt file
    savepath: Path to save output .tsv to (tab-separated, with header,
        without index). Nothing is written if empty (default).

    Returns the table as a pandas DataFrame. The "chr" column always holds
    strings, even if the file lists only numeric chromosomes.
    """
    df = pd.read_csv(
        path_23,
        comment="#",
        sep="\t",
        header=None,
        # Pin the chromosome column (2nd column) to strings. Without this,
        # a file containing only numeric chromosomes is parsed as integers
        # and string comparisons like df["chr"] == "1" select nothing.
        dtype={1: str},
        low_memory=False,
    )
    # Assigning (rather than passing names=) keeps the error on files
    # that do not have exactly four columns.
    df.columns = ["rsid", "chr", "pos", "gt"]

    if savepath:
        df.to_csv(savepath, sep="\t", index=False)
        print(f"Saved {len(df)} SNPs to: {savepath}")
    return df

def create_23andme_snpfile(path_23, save_folder="", chs=range(1,23)):
    """Create bcftools_ch files for 23andme SNPs extracted
    from 23andme file.

    For every chromosome in chs, the SNPs on that chromosome are written to
    <save_folder>/bcftools_ch<ch>.csv as tab-separated "chr", "pos" rows
    (cast to int32), without header or index.

    path_23: 23andme file
    save_folder: Where to save output to. Created if it does not exist;
        an empty string (default) writes relative to the working directory.
    chs: Chromosomes to process (default: 1-22). Labels must be castable
        to int, because the saved columns are converted to int32.
    """
    dft = extract_23andme_snps(path_23 = path_23, savepath ="")

    # Make sure the target folder exists (skipped for "" = working directory).
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)

    # Compare chromosome labels as strings regardless of how the column was
    # parsed; computed once, outside the loop.
    chrom = dft["chr"].astype(str)

    for ch in chs:
        df_save = dft.loc[chrom == str(ch), ["chr", "pos"]].astype("int32")

        savepath = os.path.join(save_folder, f"bcftools_ch{ch}.csv")
        df_save.to_csv(savepath, sep="\t", header=False, index=False)
        print(f"Saved {len(df_save)} SNPs to: {savepath}")

# 1) Load 23andme data

In [18]:
# Load the 23andme genotypes and save the full table as snps.tsv.
# The result is bound to `df` because the following cells filter and merge on it.
df = extract_23andme_snps(path_23 = "/mnt/archgen/users/hringbauer/git/gedmatch_prep/data/genome_Harald_Ringbauer_v4_Full_20181111230705.txt",
                          savepath ="/mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/snps.tsv")

Saved 601895 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/snps.tsv


In [21]:
# Preview the SNPs on chromosome 1 (labels are compared as strings).
df.loc[df["chr"] == "1"]

Unnamed: 0,rsid,chr,pos,gt
0,rs12564807,1,734462,AA
1,rs3131972,1,752721,AG
2,rs148828841,1,760998,CC
3,rs12124819,1,776546,AA
4,rs115093905,1,787173,GG
...,...,...,...,...
46657,rs4359061,1,249174682,GG
46658,rs6587762,1,249198164,GG
46659,rs7527472,1,249202567,CC
46660,rs6704311,1,249210707,GG


# 2) Check 23andme .bed files
Specifically, intersect the 23andme SNPs with the 1240k marker files
(to check whether both use the same position indexing)

In [32]:
# Inspect the layout of a 1240k bcftools marker file (tab-separated, no header row).
ch = 1
marker_path = f"/mnt/archgen/users/hringbauer/data/1240kSNPs/bcftools_ch{ch}.csv"
dft = pd.read_csv(marker_path, sep="\t", header=None).set_axis(["chr", "pos"], axis=1)

In [48]:
# Display the marker table (columns "chr", "pos").
dft

Unnamed: 0,chr,pos
0,1,752566
1,1,776546
2,1,832918
3,1,842013
4,1,846864
...,...,...
93161,1,249179856
93162,1,249188627
93163,1,249198164
93164,1,249202567


In [45]:
# Restrict the 23andme table to chromosome `ch`, then intersect it with the
# marker table by position and report the sizes.
df_ch = df.loc[df["chr"] == str(ch)]
dfm = df_ch.merge(dft, on=["pos"])
print(f"Merged to {len(dfm)}/ {len(df_ch)}(23andme) & {len(dft)} (1240k) SNPs")

Merged to 34517/ 46662(23andme) & 93166 (1240k) SNPs


# 3) Create 23andme SNP files

In [78]:
%%time

# Write one bcftools_ch<ch>.csv position file per chromosome into the 23andme
# data folder; `chs` is left at the function's default.
create_23andme_snpfile(path_23 = "/mnt/archgen/users/hringbauer/git/gedmatch_prep/data/genome_Harald_Ringbauer_v4_Full_20181111230705.txt",
                       save_folder = "/mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme")

Saved 46662 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch1.csv
Saved 46128 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch2.csv
Saved 38517 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch3.csv
Saved 33915 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch4.csv
Saved 34387 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch5.csv
Saved 40384 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch6.csv
Saved 33053 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch7.csv
Saved 30268 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch8.csv
Saved 26586 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch9.csv
Saved 29210 SNPs to: /mnt/archgen/users/hringbauer/git/gedmatch_prep/data/23andme/bcftools_ch10.csv
Saved 293

# Area 51

## Explore 23andme File

In [79]:
# NOTE: this rebinds `dft` (used earlier for the 1240k marker table) to the
# full 23andme SNP table. No savepath is given, so nothing is written to disk.
dft = extract_23andme_snps(path_23 = "/mnt/archgen/users/hringbauer/git/gedmatch_prep/data/genome_Harald_Ringbauer_v4_Full_20181111230705.txt")

In [81]:
# Number of SNPs per distinct value of the "chr" column.
dft["chr"].value_counts()

chr
1     46662
2     46128
6     40384
3     38517
5     34387
4     33915
7     33053
8     30268
11    29328
10    29210
12    28451
9     26586
13    21679
X     19478
16    19199
17    18743
14    18696
15    18281
18    16490
20    14494
19    13002
22     9098
21     8461
MT     5083
Y      2302
Name: count, dtype: int64

In [82]:
# Display the 23andme SNP table (columns rsid, chr, pos, gt).
dft

Unnamed: 0,rsid,chr,pos,gt
0,rs12564807,1,734462,AA
1,rs3131972,1,752721,AG
2,rs148828841,1,760998,CC
3,rs12124819,1,776546,AA
4,rs115093905,1,787173,GG
...,...,...,...,...
601890,i4000757,MT,16526,G
601891,i701671,MT,16526,G
601892,i4990307,MT,16527,C
601893,i4000756,MT,16540,C
