# Notebook to create .csv with Density for 1240k SNPs
(for each 1 cM interval of the genetic map, count the number of SNPs)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import gridspec
import socket
import os as os
import sys as sys
import multiprocessing as mp
import matplotlib.colors as cls
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection

# Select the project root from the hostname; unknown machines are rejected.
socket_name = socket.gethostname()
print(socket_name)

on_midway = socket_name.startswith("midway2")
if socket_name != "VioletQueen" and not on_midway:
    raise RuntimeWarning("Not compatible machine. Check!!")

if socket_name == "VioletQueen":
    # The Path on Harald's machine
    path = "/home/harald/git/HAPSBURG/"
else:
    # The Path on Midway Cluster
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"

# All relative paths used further down are resolved against the project root.
os.chdir(path)

# Show the current working directory (the HAPSBURG root after the chdir above).
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### 1) Load Data

In [88]:
def load_snp_df(path_snp = "./Data/ReichLabEigenstrat/Raw/v37.2.1240K.snp"):
    """Read an Eigenstrat .snp file into a DataFrame.

    The file is parsed as header-less, whitespace-separated text and the
    resulting columns are labelled SNP, chr, map, pos, ref, alt.

    path_snp: Path of the .snp file to read.
    Returns: DataFrame with one row per line of the file.
    """
    column_names = ["SNP", "chr", "map", "pos", "ref", "alt"]
    snp_table = pd.read_csv(path_snp, sep=r"\s+", header=None, engine="python")
    # Assigning (rather than passing names=) fails loudly on a column-count mismatch
    snp_table.columns = column_names
    return snp_table

def create_count_df(df_snp, ch=1, step=0.01):
    """Count the SNPs of one chromosome in consecutive genetic-map intervals.

    df_snp: DataFrame with at least the columns "chr" and "map".
    ch: Chromosome whose rows are selected from df_snp.
    step: Step Width in Morgan (must be positive).

    Returns: DataFrame with columns StartM, EndM, counts, chr; one row per
    interval, starting at the smallest map position on the chromosome.
    The intervals cover the full range up to the largest map position, so
    the counts add up to the number of SNPs on the chromosome.

    Raises: RuntimeWarning if the chromosome has no SNPs,
    ValueError if step is not positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")
    df_chr = df_snp[df_snp["chr"]==ch]
    if len(df_chr)==0:
        raise RuntimeWarning(f"No SNPs on Chromosome {ch} found")
    min_map, max_map = np.min(df_chr["map"]), np.max(df_chr["map"])

    # np.arange(min_map, max_map, step) stops *before* max_map, which silently
    # dropped every SNP in the last (partial) interval. Build the edges from
    # an explicit bin count instead; rounding guards against float noise
    # producing a spurious extra bin, and at least one bin is always kept.
    n_bins = max(1, int(np.ceil(np.round((max_map - min_map) / step, 9))))
    intervalls = min_map + step * np.arange(n_bins + 1)
    # np.histogram closes the last bin on the right, so the final edge must
    # reach max_map for the last SNP to be counted.
    intervalls[-1] = max(intervalls[-1], max_map)

    counts, bins = np.histogram(df_chr["map"], bins=intervalls)
    df_counts = pd.DataFrame({"StartM":bins[:-1], "EndM":bins[1:], "counts":counts,"chr":ch})
    return df_counts

In [89]:
# Load the SNP table from the default Eigenstrat .snp path.
df_snp = load_snp_df()

### 2) Create SNP Histogram

In [90]:
# Build one count table per chromosome (1 through 23), then stack them.
df_counts = []
for ch in range(1, 24):
    df_counts.append(create_count_df(df_snp, ch=ch))
print(len(df_counts))

# Produce one big dataframe
df_count = pd.concat(df_counts)

23


In [None]:
#df_count["chr"].value_counts()

### 3) Save it

In [91]:
# Write the stacked per-interval counts as a tab-separated file, without the row index.
df_count.to_csv("./Data/1000Genomes/Markers/1240k/snp_density.csv", index=False, sep="\t")

# Calculate total Map Length of Genome accessible to HAPSBURG:

In [73]:
df_snp = load_snp_df()

# One slot per chromosome 1-22: distance between the smallest and largest
# map position of its SNPs.
tot_lengthM = np.zeros(22)

for idx, ch in enumerate(range(1, 23)):
    chr_map = df_snp.loc[df_snp["chr"] == ch, "map"]
    tot_lengthM[idx] = chr_map.max() - chr_map.min()

In [83]:
# Add up the per-chromosome lengths and scale by 100
# (Morgan -> centimorgan, going by the variable names).
tot_len_cM = 100 * tot_lengthM.sum()
tot_len_cM

3539.3964000000005

In [84]:
### For a full sib: one quarter of the total map length (tot_len_cM)
tot_len_cM / 4

884.8491000000001

In [85]:
### For a full cousin: one sixteenth of the total map length (tot_len_cM)
# NOTE(review): "full cousin" presumably means first cousin - confirm
tot_len_cM / 16

221.21227500000003

In [86]:
### For a second cousin: one sixty-fourth of the total map length (tot_len_cM)
tot_len_cM / 64

55.30306875000001

# Area 51

In [68]:
# Scratch check: read the tab-separated SNP density file back from disk.
df_test = pd.read_csv("./Data/1000Genomes/Markers/1240k/snp_density.csv", sep="\t")

In [87]:
# Display the table that was read back.
# NOTE(review): the recorded output has columns "Unnamed: 0", "beginM", "endM",
# whereas the save cell writes "StartM"/"EndM" with index=False - the output
# appears to come from an older version of the file; re-run to confirm.
df_test

Unnamed: 0,beginM,endM,counts,chr
0,0.020130,0.030130,69,1
1,0.030130,0.040130,94,1
2,0.040130,0.050130,157,1
3,0.050130,0.060130,212,1
4,0.060130,0.070130,141,1
5,0.070130,0.080130,108,1
6,0.080130,0.090130,161,1
7,0.090130,0.100130,215,1
8,0.100130,0.110130,296,1
9,0.110130,0.120130,284,1
