# Pseudorep JSD

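Compute the Jensen-Shannon distance (JSD) between two pseudoreplicates of the per-base counts at sampled peaks, and report the median per cluster.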

In [19]:
import pyBigWig
import pandas as pd
import numpy as np
import scipy.spatial

In [5]:
def get_cts(peaks_df, bw, width):
    """
    Fetch per-base values from a bigwig around each peak summit.

    "cts" = per base counts across a region

    Args:
        peaks_df: DataFrame with at least 'chr', 'start' and 'summit'
            columns. 'summit' is an offset relative to 'start'.
        bw: open bigwig handle exposing values(chrom, start, end).
        width: number of bases to fetch per peak. Each window begins
            width // 2 bases before the summit and spans exactly
            `width` bases, so odd widths are honoured too.

    Returns:
        Array with one row per peak holding the values returned by
        bw.values for that window, passed through np.nan_to_num
        (NaN becomes 0). An empty peaks_df yields shape (0, width).
    """
    if len(peaks_df) == 0:
        # np.array([]) would be 1-D; keep the result 2-D for callers.
        return np.empty((0, width))

    # Left edge of every window, computed once for the whole frame.
    window_starts = (peaks_df['start'] + peaks_df['summit'] - width // 2).to_numpy()

    # NOTE(review): windows that extend past a chromosome boundary are
    # passed to bw.values unchanged -- confirm how the handle treats them.
    vals = [
        np.nan_to_num(bw.values(chrom, int(lo), int(lo) + width))
        for chrom, lo in zip(peaks_df['chr'], window_starts)
    ]

    return np.array(vals)

In [6]:
def get_jsd(preds, cts, min_tot_cts=10):
    """
    Jensen-Shannon distance between paired rows of preds and cts.

    Args:
        preds: iterable of 1-D arrays (one per region).
        cts: iterable of 1-D arrays paired row-by-row with preds.
        min_tot_cts: a pair is kept only if its cts row sums to strictly
            more than this value.

    Returns:
        1-D array with one distance per retained pair, in input order.
        Pairs whose preds row sums to zero are skipped as well: the
        distance is undefined there and would otherwise come back as
        NaN, which propagates into summaries such as np.median.
    """
    return np.array([
        scipy.spatial.distance.jensenshannon(x, y)
        for x, y in zip(preds, cts)
        if y.sum() > min_tot_cts and np.sum(x) > 0
    ])

In [7]:
def get_pseudoreps(cts):
    """
    Draw two pseudoreplicates from an array of integer counts.

    cts: array of non-negative integer counts (used as B x 2000 here)

    Every position is thinned twice with a Binomial(count, 0.5) draw
    from the global NumPy random state. Returns the two thinned arrays
    as a tuple, each with the same shape as cts.
    """
    keep_prob = 0.5

    # Two separate draws on purpose: taking the second replicate as
    # `cts - first` would make the two pseudoreplicates dependent.
    first = np.random.binomial(cts, keep_prob)
    second = np.random.binomial(cts, keep_prob)
    return first, second

In [10]:
# def get_pseudoreps_exclusive(cts):
#     """
#     cts: B x 2000 matrix
    
#     subsample each position twice
    
#     """
#     pr1 = np.random.binomial(cts, 0.5)
#     pr2 = cts - pr1 ### THIS MAKES THE 2 PRs dependent 
#     return pr1, pr2

In [12]:
# Column names applied to the headerless, tab-separated peak files.
# Only "chr", "start" and "summit" are read by get_cts; the numbered
# names are placeholders for the columns in between.
NARROWPEAK_SCHEMA = [
    "chr", "start", "end",
    "1", "2", "3", "4", "5", "6",
    "summit",
]

In [21]:
# Median pseudoreplicate JSD per cluster, keyed by cluster index (1..15).
jsds = {}

for i in range(1,16):
    peaks_df = pd.read_csv("/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/bpnet/data/20210818_n62599/peaks/overlap_merged/cluster_idx{}.bed".format(i),
                       names=NARROWPEAK_SCHEMA, sep='\t')
    # Cap the sample size: DataFrame.sample raises ValueError when asked
    # for more rows than exist (sampling is without replacement).
    sample_peaks_df = peaks_df.sample(min(20000, len(peaks_df)))
    with pyBigWig.open("/oak/stanford/groups/akundaje/surag/projects/scATAC-reprog/clusters/20210714_n64913/bigwigs/cluster_idx{}.bw".format(i)) as bw:
        # Cast to integers so the values can serve as binomial trial
        # counts in get_pseudoreps.
        cts = get_cts(sample_peaks_df, bw, 2000).astype(np.int64)

    # Split the counts into two pseudoreplicates and summarise their
    # per-peak JSD by the median.
    jsds[i] = np.median(get_jsd(*get_pseudoreps(cts)))

    print(i, jsds[i])

1 0.38008345006127797
2 0.44081757743975425
3 0.5119384921254924
4 0.4431574950055273
5 0.518887105073189
6 0.4885494335006828
7 0.5205852675584086
8 0.39278052245478023
9 0.5300979356674591
10 0.5272961203990402
11 0.49774233186551353
12 0.4833026672229039
13 0.5197846249914942
14 0.4431115853788992
15 0.5233775458778936


In [26]:
# One row per cluster: its index and the median pseudoreplicate JSD.
df = pd.DataFrame({
    "cluster": [*jsds],
    "jsd": [*jsds.values()],
})
df

Unnamed: 0,cluster,jsd
0,1,0.380083
1,2,0.440818
2,3,0.511938
3,4,0.443157
4,5,0.518887
5,6,0.488549
6,7,0.520585
7,8,0.392781
8,9,0.530098
9,10,0.527296


In [None]:
# Write the per-cluster table as a tab-separated file without the row index.
df.to_csv("./pseudorep.tsv", index=False, sep='\t')