In [5]:
import os
import numpy as np
import pyBigWig
from collections import defaultdict
import matplotlib.pyplot as plt

In [6]:
# Absolute, machine-specific project path.
proj_root = "/users/kcochran/projects/procapnet/"

# Genome assembly name; used as a path component below.
which_genome = "hg38"

# chr1 .. chr22, followed by chrX and chrY.
chroms_to_look_at = ["chr{}".format(num) for num in range(1, 23)]
chroms_to_look_at.extend(["chrX", "chrY"])

cell_types = ["K562", "A673", "CACO2", "CALU3", "HUVEC", "MCF10A"]

# Relative path of the form "<genome>/<genome>.chrom.sizes".
chrom_sizes_filepath = "/".join([which_genome, which_genome + ".chrom.sizes"])

In [7]:
def load_chrom_sizes(chrom_sizes_filepath):
    """Read a tab-separated chrom.sizes file into a dict.

    Args:
        chrom_sizes_filepath: path to a text file in which each record line
            holds a chromosome name and its length in the first two
            tab-separated columns. Any further columns are ignored.

    Returns:
        dict mapping chromosome name (str) to length (int).

    Raises:
        IndexError: if a non-blank line has fewer than two columns.
        ValueError: if the second column is not an integer.
    """
    chrom_sizes = {}
    with open(chrom_sizes_filepath) as f:
        for raw_line in f:
            record = raw_line.strip()
            if not record:
                # Blank lines (typically a trailing newline at end of file)
                # carry no record; skipping them avoids an IndexError below.
                continue
            fields = record.split('\t')
            chrom_sizes[fields[0]] = int(fields[1])
    return chrom_sizes

# Chromosome name -> length lookup, loaded once from chrom_sizes_filepath.
chrom_sizes = load_chrom_sizes(chrom_sizes_filepath)

In [8]:
def get_bw_path(cell_type, pos_or_neg, which_genome = which_genome, proj_root=proj_root):
    """Return the relative path of one strand's genome-wide bigWig for a cell type.

    The path has the form
    bigwigs/<which_genome>/<cell_type>/genomewide/<cell_type>.<pos_or_neg>.bigWig

    `proj_root` is accepted but is not used when building the path.

    Raises:
        AssertionError: if `pos_or_neg` is not "pos" or "neg", or if nothing
            exists at the resulting path (the offending value is the message).
    """
    assert pos_or_neg in ("pos", "neg"), pos_or_neg
    filename = "{}.{}.bigWig".format(cell_type, pos_or_neg)
    bw_path = "/".join(["bigwigs", which_genome, cell_type, "genomewide", filename])
    assert os.path.exists(bw_path), bw_path
    return bw_path

In [9]:
# (this cell is old; the last cell in this notebook is the current version)
# (just keeping this so I remember what used to be broken)

for cell_type in cell_types:
    for strand in ["pos", "neg"]:
        print("\n")
        print(cell_type, strand)

        bw_path = get_bw_path(cell_type, strand)
        print("BigWig path:", bw_path)
        print("Exists?", os.path.exists(bw_path))

        # for hg38, this is about 24 GB
        print("File size:", "%0.1f" % (os.path.getsize(bw_path) / 1000000000) + "GB")

        # Spot-check three bases on each of two chromosomes. The file is
        # re-opened for every probe so a failure on one does not affect the other.
        for probe_chrom, probe_start, probe_end in [("chr1", 103525338, 103525341),
                                                    ("chr17", 1000000, 1000003)]:
            bw = None
            try:
                bw = pyBigWig.open(bw_path, "r")

                a = bw.values(probe_chrom, probe_start, probe_end, numpy=True)
                print("Some " + probe_chrom + " values:", a)
            except Exception:
                # Catch Exception rather than everything, so that Ctrl-C
                # (KeyboardInterrupt) still stops the loop.
                print("error")
            finally:
                # Release the handle even when values() failed.
                if bw is not None:
                    bw.close()



K562 pos
BigWig path: bigwigs/hg38/K562/genomewide/K562.pos.bigWig
Exists? True
File size: 1.5GB
error
Some chr17 values: [nan nan nan]


K562 neg
BigWig path: bigwigs/hg38/K562/genomewide/K562.neg.bigWig
Exists? True
File size: 23.9GB
Some chr1 values: [0.07293156 0.4248827  0.12350942]
Some chr17 values: [0.00693997 0.01924002 0.00384026]


A673 pos
BigWig path: bigwigs/hg38/A673/genomewide/A673.pos.bigWig
Exists? True
File size: 24.0GB
error
error


A673 neg
BigWig path: bigwigs/hg38/A673/genomewide/A673.neg.bigWig
Exists? True
File size: 24.0GB
Some chr1 values: [0.06382322 0.4426282  0.11472011]
Some chr17 values: [0.0115502  0.04067886 0.00563827]


CACO2 pos
BigWig path: bigwigs/hg38/CACO2/genomewide/CACO2.pos.bigWig
Exists? True
File size: 24.1GB
Some chr1 values: [3.4407833e-05 6.6062939e-06 1.9196763e-04]
Some chr17 values: [0.00238057 0.00151144 0.02050897]


CACO2 neg
BigWig path: bigwigs/hg38/CACO2/genomewide/CACO2.neg.bigWig
Exists? True
File size: 24.1GB
Some chr1 valu

[bwGetOverlappingIntervalsCore] Got an error
[readRTreeIdx] Mismatch in the magic number!
[bwOpen] bwg->idx is NULL bwg->hdr->dataOffset 0x2b4!
[pyBwOpen] bw is NULL!
[readRTreeIdx] Mismatch in the magic number!
[bwOpen] bwg->idx is NULL bwg->hdr->dataOffset 0x2b4!
[pyBwOpen] bw is NULL!


In [None]:
# broken: K562 pos, A673 pos

In [None]:
# were any of the individual chromosome bigwigs broken? (for hg38)

In [10]:
def get_chrom_bw_path(cell_type, pos_or_neg, chrom, which_genome = which_genome):
    """Return the relative path of one strand's single-chromosome bigWig.

    The path has the form
    bigwigs/<which_genome>/<cell_type>/<chrom>/<chrom>.<cell_type>.<pos_or_neg>.bigWig

    Raises:
        AssertionError: if `pos_or_neg` is not "pos" or "neg", or if nothing
            exists at the resulting path (the offending value is the message).
    """
    assert pos_or_neg in ["pos", "neg"], pos_or_neg
    bws_dir = "/".join(["bigwigs", which_genome, cell_type, chrom]) + "/"
    # Build the filename from the pos_or_neg argument, not from a global loop
    # variable, so the result depends only on what the caller passed in.
    bw_path = bws_dir + ".".join([chrom, cell_type, pos_or_neg, "bigWig"])
    assert os.path.exists(bw_path), bw_path
    return bw_path

In [11]:
for cell_type in cell_types:
    for strand in ["pos", "neg"]:
        for chrom in chroms_to_look_at:
            print("\n")
            print(cell_type, strand, chrom)

            bw_path = get_chrom_bw_path(cell_type, strand, chrom)
            print("BigWig path:", bw_path)
            print("Exists?", os.path.exists(bw_path))

            # per-chromosome files are far smaller than the ~24 GB genome-wide ones
            print("File size:", "%0.1f" % (os.path.getsize(bw_path) / 1000000000) + "GB")

            bw = None
            try:
                expected_len = chrom_sizes[chrom]
                bw = pyBigWig.open(bw_path, "r")

                # Read the whole chromosome so that damage anywhere in the file shows up.
                a = bw.values(chrom, 0, expected_len, numpy=True)

                # Flag files where more than a quarter of the positions have no value.
                nan_frac = np.mean(np.isnan(a))
                if nan_frac > 0.25:
                    print("=================== NaNs!!!", nan_frac, expected_len)

                if len(a) != expected_len:
                    print("=================== Wrong length????")
                    # Previously printed an undefined name here, which turned a
                    # length mismatch into a generic error.
                    print(chrom, len(a), expected_len)
            except Exception as e:
                # Catch Exception rather than everything, so that Ctrl-C still
                # stops the loop; show the cause so real failures can be diagnosed.
                print("=================== error!!!", repr(e))
            finally:
                # Release the handle even when reading failed.
                if bw is not None:
                    bw.close()



K562 pos chr1
BigWig path: bigwigs/hg38/K562/chr1/chr1.K562.pos.bigWig
Exists? True
File size: 1.8GB


K562 pos chr2
BigWig path: bigwigs/hg38/K562/chr2/chr2.K562.pos.bigWig
Exists? True
File size: 1.9GB


K562 pos chr3
BigWig path: bigwigs/hg38/K562/chr3/chr3.K562.pos.bigWig
Exists? True
File size: 1.6GB


K562 pos chr4
BigWig path: bigwigs/hg38/K562/chr4/chr4.K562.pos.bigWig
Exists? True
File size: 1.5GB


K562 pos chr5
BigWig path: bigwigs/hg38/K562/chr5/chr5.K562.pos.bigWig
Exists? True
File size: 1.4GB


K562 pos chr6
BigWig path: bigwigs/hg38/K562/chr6/chr6.K562.pos.bigWig
Exists? True
File size: 1.4GB


K562 pos chr7
BigWig path: bigwigs/hg38/K562/chr7/chr7.K562.pos.bigWig
Exists? True
File size: 1.3GB


K562 pos chr8
BigWig path: bigwigs/hg38/K562/chr8/chr8.K562.pos.bigWig
Exists? True
File size: 1.2GB


K562 pos chr9
BigWig path: bigwigs/hg38/K562/chr9/chr9.K562.pos.bigWig
Exists? True
File size: 1.0GB


K562 pos chr10
BigWig path: bigwigs/hg38/K562/chr10/chr10.K562.pos.bigW



A673 neg chr6
BigWig path: bigwigs/hg38/A673/chr6/chr6.A673.neg.bigWig
Exists? True
File size: 1.4GB


A673 neg chr7
BigWig path: bigwigs/hg38/A673/chr7/chr7.A673.neg.bigWig
Exists? True
File size: 1.3GB


A673 neg chr8
BigWig path: bigwigs/hg38/A673/chr8/chr8.A673.neg.bigWig
Exists? True
File size: 1.2GB


A673 neg chr9
BigWig path: bigwigs/hg38/A673/chr9/chr9.A673.neg.bigWig
Exists? True
File size: 1.0GB


A673 neg chr10
BigWig path: bigwigs/hg38/A673/chr10/chr10.A673.neg.bigWig
Exists? True
File size: 1.1GB


A673 neg chr11
BigWig path: bigwigs/hg38/A673/chr11/chr11.A673.neg.bigWig
Exists? True
File size: 1.1GB


A673 neg chr12
BigWig path: bigwigs/hg38/A673/chr12/chr12.A673.neg.bigWig
Exists? True
File size: 1.1GB


A673 neg chr13
BigWig path: bigwigs/hg38/A673/chr13/chr13.A673.neg.bigWig
Exists? True
File size: 0.8GB


A673 neg chr14
BigWig path: bigwigs/hg38/A673/chr14/chr14.A673.neg.bigWig
Exists? True
File size: 0.7GB


A673 neg chr15
BigWig path: bigwigs/hg38/A673/chr15/chr1



CALU3 pos chr8
BigWig path: bigwigs/hg38/CALU3/chr8/chr8.CALU3.pos.bigWig
Exists? True
File size: 1.2GB


CALU3 pos chr9
BigWig path: bigwigs/hg38/CALU3/chr9/chr9.CALU3.pos.bigWig
Exists? True
File size: 1.0GB


CALU3 pos chr10
BigWig path: bigwigs/hg38/CALU3/chr10/chr10.CALU3.pos.bigWig
Exists? True
File size: 1.1GB


CALU3 pos chr11
BigWig path: bigwigs/hg38/CALU3/chr11/chr11.CALU3.pos.bigWig
Exists? True
File size: 1.1GB


CALU3 pos chr12
BigWig path: bigwigs/hg38/CALU3/chr12/chr12.CALU3.pos.bigWig
Exists? True
File size: 1.1GB


CALU3 pos chr13
BigWig path: bigwigs/hg38/CALU3/chr13/chr13.CALU3.pos.bigWig
Exists? True
File size: 0.8GB


CALU3 pos chr14
BigWig path: bigwigs/hg38/CALU3/chr14/chr14.CALU3.pos.bigWig
Exists? True
File size: 0.7GB


CALU3 pos chr15
BigWig path: bigwigs/hg38/CALU3/chr15/chr15.CALU3.pos.bigWig
Exists? True
File size: 0.7GB


CALU3 pos chr16
BigWig path: bigwigs/hg38/CALU3/chr16/chr16.CALU3.pos.bigWig
Exists? True
File size: 0.7GB


CALU3 pos chr17
BigWig 



HUVEC neg chr10
BigWig path: bigwigs/hg38/HUVEC/chr10/chr10.HUVEC.neg.bigWig
Exists? True
File size: 1.1GB


HUVEC neg chr11
BigWig path: bigwigs/hg38/HUVEC/chr11/chr11.HUVEC.neg.bigWig
Exists? True
File size: 1.1GB


HUVEC neg chr12
BigWig path: bigwigs/hg38/HUVEC/chr12/chr12.HUVEC.neg.bigWig
Exists? True
File size: 1.1GB


HUVEC neg chr13
BigWig path: bigwigs/hg38/HUVEC/chr13/chr13.HUVEC.neg.bigWig
Exists? True
File size: 0.8GB


HUVEC neg chr14
BigWig path: bigwigs/hg38/HUVEC/chr14/chr14.HUVEC.neg.bigWig
Exists? True
File size: 0.7GB


HUVEC neg chr15
BigWig path: bigwigs/hg38/HUVEC/chr15/chr15.HUVEC.neg.bigWig
Exists? True
File size: 0.7GB


HUVEC neg chr16
BigWig path: bigwigs/hg38/HUVEC/chr16/chr16.HUVEC.neg.bigWig
Exists? True
File size: 0.7GB


HUVEC neg chr17
BigWig path: bigwigs/hg38/HUVEC/chr17/chr17.HUVEC.neg.bigWig
Exists? True
File size: 0.7GB


HUVEC neg chr18
BigWig path: bigwigs/hg38/HUVEC/chr18/chr18.HUVEC.neg.bigWig
Exists? True
File size: 0.6GB


HUVEC neg chr19
B

In [None]:
# after re-doing the broken ones:

In [12]:
for cell_type in cell_types:
    for strand in ["pos", "neg"]:
        print("\n")
        print(cell_type, strand)

        bw_path = get_bw_path(cell_type, strand)
        print("BigWig path:", bw_path)
        print("Exists?", os.path.exists(bw_path))

        # for hg38, this is about 24 GB
        print("File size:", "%0.1f" % (os.path.getsize(bw_path) / 1000000000) + "GB")

        # Spot-check three bases on each of two chromosomes. The file is
        # re-opened for every probe so a failure on one does not affect the other.
        for probe_chrom, probe_start, probe_end in [("chr1", 103525338, 103525341),
                                                    ("chr17", 1000000, 1000003)]:
            bw = None
            try:
                bw = pyBigWig.open(bw_path, "r")

                a = bw.values(probe_chrom, probe_start, probe_end, numpy=True)
                print("Some " + probe_chrom + " values:", a)
            except Exception:
                # Catch Exception rather than everything, so that Ctrl-C
                # (KeyboardInterrupt) still stops the loop.
                print("error")
            finally:
                # Release the handle even when values() failed.
                if bw is not None:
                    bw.close()



K562 pos
BigWig path: bigwigs/hg38/K562/genomewide/K562.pos.bigWig
Exists? True
File size: 23.9GB
Some chr1 values: [4.3629756e-05 1.7069968e-05 2.3992317e-04]
Some chr17 values: [0.00444451 0.00309863 0.03139656]


K562 neg
BigWig path: bigwigs/hg38/K562/genomewide/K562.neg.bigWig
Exists? True
File size: 23.9GB
Some chr1 values: [0.07293156 0.4248827  0.12350942]
Some chr17 values: [0.00693997 0.01924002 0.00384026]


A673 pos
BigWig path: bigwigs/hg38/A673/genomewide/A673.pos.bigWig
Exists? True
File size: 24.0GB
Some chr1 values: [1.5455839e-04 8.7890243e-05 1.0999229e-03]
Some chr17 values: [0.00356467 0.00144409 0.01272652]


A673 neg
BigWig path: bigwigs/hg38/A673/genomewide/A673.neg.bigWig
Exists? True
File size: 24.0GB
Some chr1 values: [0.06382322 0.4426282  0.11472011]
Some chr17 values: [0.0115502  0.04067886 0.00563827]


CACO2 pos
BigWig path: bigwigs/hg38/CACO2/genomewide/CACO2.pos.bigWig
Exists? True
File size: 24.1GB
Some chr1 values: [3.4407833e-05 6.6062939e-06 1.91