In [1]:
# %load ../start.py
# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 1

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -v

last updated: 2016-11-21 

CPython 3.5.2
IPython 5.1.0


Sequence gazing with Hisat2 and Magic alginment files has shown that there are some differences between the two, but these differences appear to be moderate. To look more closely at these differences I am taking the differences `Hisat2 - Magic` and creating a new track. A normalization schema is needed to compare these two file because overall the Hisat2 is aligning more reads. I have decided on a simple rank based quantile normalization. Basic logic is as follow:

For Hisat2 and Magic I have bigwig files which associate position with a count.

|position | Hisat2 | Magic|
|-------------------------|
|1        | 5      | 4    |
|2        | 2      | 1    |
|3        | 3      | 4    |
|4        | 4      | 2    |

For each chromosome determine the ranks of each value for Hisat2 and Magic separately.

|position | Hisat2 | Magic|
|-------------------------|
|1        | iv     | iii  |
|2        | i      | i    |
|3        | ii     | iii  |
|4        | iii    | ii   |

Then sort the Hisat2 values by its ranks and the Magic values by its ranks and average across rows.

|rank     | Hisat2 | Magic| Mean |
|--------------------------------|
|i        | 2      | 1    | 1.5  |
|ii       | 3      | 2    | 2.5  |
|iii      | 4      | 4    | 4    |
|iv       | 5      | 4    | 4.5  |

Replace the original values with its ranks corresponding Mean.

|position | Hisat2 | Magic|
|-------------------------|
|1        | 4.5    | 4    |
|2        | 1.5    | 1.5  |
|3        | 2.5    | 4    |
|4        | 4      | 2.5  |

Then take difference of `Hisat2 - Magic`

|position | Hisat2 | Magic| Diff |
|-------------------------|------|
|1        | 4.5    | 4    | 0.5  |
|2        | 1.5    | 1.5  | 0    |
|3        | 2.5    | 4    | -1.5 |
|4        | 4      | 2.5  | 1.5  |

New tracks can be found in a number of places:

**On DGS**

smb://niddkb8na2.niddk.nih.gov/LCDB_DGS/people/Fearjm/IGV_Stuff/diff_magic_zhenxia_318_golden_set_2016-06-14_plus.bw

smb://niddkb8na2.niddk.nih.gov/LCDB_DGS/people/Fearjm/IGV_Stuff/diff_magic_zhenxia_318_golden_set_2016-06-14_minus.bw

**Download BigWigs**

https://helix.nih.gov/~fearjm/ncbi_remap/dm6/diff_magic_zhenxia_318_golden_set_2016-06-14_plus.bw

https://helix.nih.gov/~fearjm/ncbi_remap/dm6/diff_magic_zhenxia_318_golden_set_2016-06-14_minus.bw


**View on UCSC**

http://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6&hubUrl=https://helix.nih.gov/~fearjm/ncbi_remap/hub.txt

In [2]:
# Imports
import numpy as np
import scipy as sp
from scipy.stats import rankdata

import pyBigWig

In [3]:
def normalize(array1, array2):
    """ Quantile normalize two arrays and return the difference of array1 - array2
    
    Parameters
    ----------
    array1: numpy.array
        A genomic array where each position is a base on a chromosome from one bigwig file.
        
    array2: numpy.array
        A genomic array where each position is a base on a chromosome from one bigwig file.
        
    Returns
    -------
    numpy.array: Difference between array1 - array2
    
    Example
    -------
    >>> array1 = np.array([25, 10, 5, 20])
    >>> array2 = np.array([20, 25, 10, 40])
    >>> normalize(array1, array2)
    array([ 17,  -7,   0, -10])
    
    """
    # make sure the arrays are numpy arrays
    array1 = np.array(array1)
    array2 = np.array(array2)
    
    # Fill NaN's with 0's
    ## Many positions are not in the big wig file because we droped 0's
    array1[np.isnan(array1)] = 0
    array2[np.isnan(array2)] = 0
    
    # Make sure these are floats for latter
    array1 = array1.astype('float')
    array2 = array2.astype('float')
    
    # get ranks of each array
    rank1 = rankdata(array1, method='dense')
    rank2 = rankdata(array2, method='dense')
    
    # Get rank sort order
    sort1 = np.argsort(rank1)
    sort2 = np.argsort(rank2)
    
    # Calculate avgerage value for each rank
    ## sort original values by rank order
    ## take mean for each position
    avg = np.mean([array1[sort1], array2[sort2]], axis=0)
    
    # replace original values with rank average
    array1[sort1] = avg
    array2[sort2] = avg
    
    # Return the difference
    return array1 - array2

# Compare Forward Strand

In [24]:
# Load Zhenxia's bigwigs
zbw = pyBigWig.open('../../output/312_sample_golden_set_2016-06-14.Sf.deeptools.merged.bw', 'r')

In [25]:
# Load Magic's bigwigs
mbw = pyBigWig.open('/data/MiegNCBI/bigwig/droso.magic.318_runs.strand_plus.unique_complete.bw', 'r')

In [6]:
# Create a bigwig for the difference
dbw = pyBigWig.open('../../output/diff_magic_zhenxia_318_golden_set_2016-06-14_plus.bw', 'w')

# add chromosome header to new bigwig file
chroms = sorted([(k, v) for k, v in mbw.chroms().items()], key=lambda x: x[0])
dbw.addHeader(chroms)

In [7]:
# Make sure we have the same chromosomes in Hisat2 and Magic
assert zbw.chroms() == mbw.chroms()

In [8]:
# Iterate over each chromosome, normalize, and take the difference
for ch, l in chroms:
    diffs = normalize(zbw.values(ch, 0, l), mbw.values(ch, 0, l))
    chrom = [ch, ] * len(diffs)
    start = np.array(range(0, len(diffs)))
    end = start + 1
    dbw.addEntries(chrom, start.tolist(), ends=end.tolist(), values=diffs.tolist())

In [9]:
# Close the diff bigwig file
dbw.close()

In [26]:
# Print basic stats for each file
print('Hisat2 Forward Strand Basic Stats')
print(zbw.header())

print('\nMagic Forward Strand Basic Stats')
print(mbw.header())

dbw = pyBigWig.open('../../output/diff_magic_zhenxia_318_golden_set_2016-06-14_plus.bw', 'r')
print('\nDifference Forward Strand Basic Stats')
print(dbw.header())

Hisat2 Forward Strand Basic Stats
{'minVal': 1, 'maxVal': 8829180, 'version': 4, 'sumData': 384498468270, 'sumSquared': 243219925029647520, 'nBasesCovered': 130771288, 'nLevels': 10}

Magic Forward Strand Basic Stats
{'minVal': 0, 'maxVal': 6660606, 'version': 4, 'sumData': 318486315892, 'sumSquared': 157557043153038208, 'nBasesCovered': 139294268, 'nLevels': 10}

Difference Forward Strand Basic Stats
{'minVal': -6263941, 'maxVal': 7067759, 'version': 4, 'sumData': 0, 'sumSquared': 29212293615965500, 'nBasesCovered': 137547960, 'nLevels': 8}


In [None]:
# Close all of the open BigWig Files
dbw.close()
zbw.close()
mbw.close()

# Compare Reverse Strand

In [20]:
# Load Zhenxia's bigwigs
zbw = pyBigWig.open('../../output/312_sample_golden_set_2016-06-14.Sr.deeptools.merged.bw', 'r')

In [21]:
# Load Magic's bigwigs
mbw = pyBigWig.open('/data/MiegNCBI/bigwig/droso.magic.318_runs.strand_minus.unique_complete.bw', 'r')

In [14]:
# Create a bigwig for the difference
dbw = pyBigWig.open('../../output/diff_magic_zhenxia_318_golden_set_2016-06-14_minus.bw', 'w')

# add chromosome header to new bigwig file
chroms = sorted([(k, v) for k, v in mbw.chroms().items()], key=lambda x: x[0])
dbw.addHeader(chroms)

In [15]:
# Make sure we have the same chromosomes in Hisat2 and Magic
assert zbw.chroms() == mbw.chroms()

In [16]:
# Iterate over each chromosome, normalize, and take the difference
for ch, l in chroms:
    diffs = normalize(zbw.values(ch, 0, l), mbw.values(ch, 0, l))
    chrom = [ch, ] * len(diffs)
    start = np.array(range(0, len(diffs)))
    end = start + 1
    dbw.addEntries(chrom, start.tolist(), ends=end.tolist(), values=diffs.tolist())

In [17]:
# Close the diff bigwig file
dbw.close()

In [23]:
# Print basic stats for each file
print('Hisat2 Forward Strand Basic Stats')
print(zbw.header())

print('\nMagic Forward Strand Basic Stats')
print(mbw.header())

dbw = pyBigWig.open('../../output/diff_magic_zhenxia_318_golden_set_2016-06-14_minus.bw', 'r')
print('\nDifference Forward Strand Basic Stats')
print(dbw.header())

Hisat2 Forward Strand Basic Stats
{'minVal': 1, 'maxVal': 10248600, 'version': 4, 'sumData': 381893625429, 'sumSquared': 274580790932070464, 'nBasesCovered': 129618008, 'nLevels': 10}

Magic Forward Strand Basic Stats
{'minVal': 0, 'maxVal': 7414655, 'version': 4, 'sumData': 321505643003, 'sumSquared': 158525168523731008, 'nBasesCovered': 139294397, 'nLevels': 10}

Difference Forward Strand Basic Stats
{'minVal': -4054536, 'maxVal': 6817810, 'version': 4, 'sumData': 0, 'sumSquared': 49636485695617072, 'nBasesCovered': 137547960, 'nLevels': 8}


In [19]:
# Close all of the open BigWig Files
dbw.close()
zbw.close()
mbw.close()