In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import socket as socket
import os as os
import sys as sys
import multiprocessing as mp
import h5py
import allel

# Identify the machine we are running on; the project path depends on it.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: only the cluster's "compute-" nodes are supported.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM O2 Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster

# All relative paths used below are resolved against the project root.
os.chdir(path)  # Set the right Path (in line with Atom default)

print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

compute-e-16-231.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/hapBLOCK
CPU Count: 28


In [4]:
# Input BCF for chromosome 3 (judging by the path: imputed data, release v43.4).
path_ali_vcf = "/n/groups/reich/ali/WholeGenomeImputation/imputed/v43.4/chr3.bcf"
# Tab-separated coverage statistics table (same v43.4 label in the file name).
path_ali_stats = "/n/groups/reich/ali/chromosome_abnormality/coverage_stats_v43.4.tsv"

### Read Ali's coverage statistics

In [51]:
# Load Ali's coverage statistics (tab-separated). The location comes from
# `path_ali_stats`, defined in the path cell above, so it is kept in one place.
df = pd.read_csv(path_ali_stats, sep="\t")

### Downsample to 1240K data

In [19]:
def save_1240kmarkers(snp1240k_path="", marker_path="", ch=0):
    """Write the (chromosome, position) pairs of an eigenstrat .snp file to disk.

    The .snp file is read as whitespace-separated text without a header row and
    its six columns are labelled SNP, chr, map, pos, ref, alt. The output is a
    tab-separated file without header or index that holds only the two columns
    chr and pos.

    snp1240k_path: Path of the input .snp file.
    marker_path: Path of the marker file to write.
    ch: Chromosome to keep. 0, None or a negative value keeps the markers
        of all chromosomes.
    """
    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s+", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]

    # A bare `ch > 0` raises a TypeError for ch=None, although "null" is the
    # documented way of asking for all chromosomes.
    if ch is not None and ch > 0:
        df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} Chr.{ch} SNPs.")

    df_save = df_snp[["chr", "pos"]]
    # header=False is the documented switch for leaving out the column names.
    df_save.to_csv(marker_path, sep="\t", header=False, index=False)
    print(f"Saved {len(df_save)} 1240k Markers on Chr. {ch} to {marker_path}")
    
def bctools_filter_vcf(in_vcf_path="", out_vcf_path="", marker_path=""):
    """Filter a VCF/BCF with `bcftools view`, directly via marker positions.

    The command is run with these options:
      -Oz -o out_vcf_path : write the result as compressed VCF
      -T marker_path      : restrict to the positions listed in the marker file
      -m2 -M2 -v snps     : keep biallelic SNPs only

    in_vcf_path: Path of the input VCF/BCF file.
    out_vcf_path: Path of the compressed VCF to write.
    marker_path: Path of the marker (targets) file handed to -T.

    Raises subprocess.CalledProcessError if bcftools exits with a non-zero
    status, and FileNotFoundError if the bcftools executable is not found.
    """
    import subprocess  # function-scope import: not part of the notebook's import cell

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact, which the former `!bcftools ... $var` line did not.
    cmd = ["bcftools", "view", "-Oz", "-o", out_vcf_path, "-T", marker_path,
           "-m2", "-M2", "-v", "snps", in_vcf_path]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            universal_newlines=True)

    # Echo whatever bcftools reported so it shows up in the cell output.
    if result.stdout:
        print(result.stdout, end="")

    # Fail loudly instead of announcing success after a failed run.
    result.check_returncode()
    print("Finished BCF tools filtering.")

## Run through one chromosome: VCF -> 1240K VCF -> HDF5
Mainly to test timing and to prepare for the steps that follow.

In [21]:
%%time
### Write the chromosome 3 positions of the HO SNP set (v43.5_HO.snp) to the
### marker file that the bcftools filtering cell below reads via `marker_path`.
save_1240kmarkers(snp1240k_path="/n/groups/reich/DAVID/V43/V43.5/v43.5_HO.snp",
                  marker_path="./data/filters/ho_snps_bcftools_ch3.csv",
                  ch=3)

Loaded 43912 Chr.3 SNPs.
Saved 43912 1240k Markers on Chr. 3 to ./data/filters/ho_snps_bcftools_ch3.csv
CPU times: user 4.62 s, sys: 158 ms, total: 4.78 s
Wall time: 4.67 s


In [None]:
%%time
# Restrict the chr3 BCF to the marker positions saved in the cell above
# and write the result as a compressed VCF.
bctools_filter_vcf(
    in_vcf_path="/n/groups/reich/ali/WholeGenomeImputation/imputed/v43.4/chr3.bcf",
    out_vcf_path="./data/vcf/1240k_v43/ch3.vcf.gz",
    marker_path="./data/filters/ho_snps_bcftools_ch3.csv",
)

In [None]:
# Progress marker: reached once the cells queued above have finished.
print("Run finished")

### Convert VCF to HDF5

In [2]:
%%time
# Input: the filtered chr3 VCF. Output: the HDF5 file opened in the test section.
path_vcf = "./data/vcf/1240k_v43/ch3.vcf.gz"
path_h5 = "./data/hdf5/1240k_v43/ch3.h5"

# Convert all variant fields, all call-data fields and the sample list.
# Slow step: the recorded run below took about 15 minutes of wall time.
allel.vcf_to_hdf5(
    input=path_vcf,
    output=path_h5,
    fields=["variants/*", "calldata/*", "samples"],
    compression="gzip",
)
print("Finished!")

Finished!
CPU times: user 13min 18s, sys: 1min 29s, total: 14min 48s
Wall time: 14min 48s


In [3]:
# Progress marker placed after the VCF -> HDF5 conversion cell.
# NOTE(review): the message contains a typo ("transformatino"); it is left
# untouched here so that it still matches the recorded output.
print("Finished transformatino to hdf5")

Finished transformatino to hdf5


# Area 51
Test code here.

### Test vcf

In [None]:
# Print the first 200 lines of the text rendering of the input BCF
# (`path_ali_vcf` is set in the path cell near the top of the notebook).
!bcftools view $path_ali_vcf | head -200

### Test hdf5

In [4]:
# Open the converted chr3 HDF5 read-only; the cells below inspect it through `f`.
# NOTE(review): the handle is not closed anywhere in the visible part of the
# notebook -- call f.close() once the checks are done.
f = h5py.File("./data/hdf5/1240k_v43/ch3.h5", "r") # Load for Sanity Check. See below!

In [5]:
# Names of the top-level entries of the HDF5 file.
list(f)

['calldata', 'samples', 'variants']

In [8]:
# Names of the entries stored under "calldata".
list(f["calldata"])

['AD', 'DS', 'GP', 'GT', 'HS', 'PL']

In [9]:
# Names of the entries stored under "variants".
list(f["variants"])

['AF',
 'ALT',
 'BUF',
 'CHROM',
 'FILTER_PASS',
 'ID',
 'INFO',
 'POS',
 'QUAL',
 'RAF',
 'REF',
 'altlen',
 'is_snp',
 'numalt']

In [76]:
# Sample identifiers as a Series; its index is the sample's position in the file.
samples = pd.Series(f["samples"][:])
# h5py >= 3 hands stored strings back as bytes, on which the `.str` accessor
# yields NaN; decode them first (a no-op when the values are already str).
samples = samples.map(lambda s: s.decode("utf-8") if isinstance(s, bytes) else s)
# Look up the position(s) of every sample whose ID contains "MA89".
samples[samples.str.contains("MA89")]

12483    MA89
dtype: object

In [79]:
# Window of 200 consecutive variant indices to inspect.
snps = range(30000, 30200)
# Index of sample "MA89", taken from the lookup output above.
j = 12483

# Pull the window for this one sample out of the call-data arrays
# (indexed here as variant, sample, value).
ads = f["calldata/AD"][snps, j, :2]  # first two AD entries per variant
gts = f["calldata/GT"][snps, j, :]
gp = f["calldata/GP"][snps, j, :]

# One row per variant: the two AD values plus the sum over the GT entries.
df = pd.DataFrame(
    {
        "ref": ads[:, 0],
        "alt": ads[:, 1],
        "gt0": gts.sum(axis=1),
    }
)

In [None]:
# Show the GP values (presumably genotype probabilities -- confirm against the
# VCF header) for the selected sample over the SNP window.
gp

In [None]:
# First 50 rows of the per-variant table built above.
df.head(50)

# Plot Allele Frequencies