This notebook contains some test code to:
- annotate poly(A) sites (PAS) with gene IDs
- import the annotated BED files
- calculate PAS usage for every gene
- identify matching sites between the two data sets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gffutils
from tqdm import tqdm

In [21]:
# Fix chromosome IDs for this particular input file: remove the leading 'chr'
# prefix (e.g. 'chr1' -> '1') and write the result to HeLa.bed.
ttt = pd.read_csv('GSE66092_HeLa_No1No6_BC2_17652_GeneAssigned.bed', delimiter='\t', header=None, dtype={0: str})
# str.strip('chr') would remove any run of the characters 'c', 'h', 'r' from
# BOTH ends of the name, not the literal prefix, so names ending in one of
# those letters would be truncated. Anchor the removal to the start instead.
ttt[0] = ttt[0].str.replace(r'^chr', '', regex=True)
ttt.to_csv('HeLa.bed', sep='\t', header=None, index=False)

## annotate sites

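- intersect each PAS BED file with the annotation BED file that contains only gene features and their coordinates (`-s` keeps only same-strand overlaps; `-wo` writes both records plus the overlap length)
- then import the results using pandas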

```bash
bedtools intersect -wo -s -a HeLa.bed -b only_genes_hsap.bed > s1.bed

bedtools intersect -wo -s -a SRX351950.clusters.2.0.GRCh38.96.bed -b only_genes_hsap.bed > s2.bed
```

In [22]:
# Column labels given to the six fields kept from each intersect output.
_SITE_COLUMNS = ['chrom', 'chromStart', 'chromEnd', 'score', 'strand', 'name']


def _load_annotated_sites(path, gene_col):
    '''
    Read one tab-separated intersect output.

    Keeps input columns 0, 1, 2, 4, 5 and `gene_col`, labels them with
    _SITE_COLUMNS, and reads the chromosome column as strings.
    '''
    return pd.read_csv(
        path,
        delimiter='\t',
        header=None,
        usecols=[0, 1, 2, 4, 5, gene_col],
        names=list(_SITE_COLUMNS),
        dtype={0: str},
    )


# The column used as 'name' sits at a different index in each file.
s1 = _load_annotated_sites('s1.bed', gene_col=9)
s2 = _load_annotated_sites('s2.bed', gene_col=14)

In [59]:
def match(cset1, cset2, min_overlap):
    '''
    Report which sets of coordinates overlap between two lists containing sets of coordinates.

    Parameters
    ----------
    cset1, cset2 : sequence of array-like
        Each element holds the coordinate positions covered by one site.
    min_overlap : float
        Minimum overlap fraction a pair must reach to be reported.

    Returns
    -------
    list of tuple
        One ``(i, j, overlap)`` tuple per reported pair, where ``i`` indexes
        ``cset1``, ``j`` indexes ``cset2`` and ``overlap`` is the number of
        shared positions divided by the size of the smaller set (i.e. the
        larger of the two per-set fractions).
    '''
    intersect = []
    # go through all pairs of sites
    for i, site1 in enumerate(cset1):
        len1 = len(site1)
        for j, site2 in enumerate(cset2):
            len2 = len(site2)
            if len1 == 0 or len2 == 0:
                # an empty coordinate set cannot overlap anything; avoid
                # dividing by zero and treat the pair as non-overlapping
                percent_overlap = 0.0
            else:
                # calculate how many coordinate positions overlap
                n_shared = len(np.intersect1d(site1, site2))
                # the maximum of the two fractions is reported
                percent_overlap = max(n_shared / len1, n_shared / len2)
            if percent_overlap >= min_overlap:
                # output has indices of sites that match, fraction of overlap
                intersect.append((i, j, percent_overlap))
    return intersect

In [63]:
# Per-gene matching of sites between s1 and s2.
# For every gene name present in both tables this cell computes per-site usage
# (score / total score of the gene) in each table and calls match() on the
# sites' coordinate ranges. The loop currently stops early (see the break
# below), so the variables it leaves behind describe a single gene.

LEN = 0 # distance to extend PAS coordinates to help find overlap (applied on both sides of each site)
OLP = 0.8 # minimum overlap fraction needed to match sites (passed to match() as min_overlap)

# initialize PAS usage column (one value per s1 row, all ones)
# NOTE(review): `usage` is not read anywhere else in this cell.
usage = np.zeros(len(s1))+1

# TODO: pre-process dfs to contain only overlapping genes (not implemented yet)

# work on copies so the in-place drops below leave s1 and s2 untouched
s1_temp = s1.copy()
s2_temp = s2.copy()

# go through genes present in both tables
for gene in tqdm(np.intersect1d(s1['name'], s2['name'])):
    # get sites for the respective gene
    gene_mask_s1 = np.array(s1_temp['name']==gene)
    gene_mask_s2 = np.array(s2_temp['name']==gene)

    v_gene_s1 = s1_temp[gene_mask_s1]
    v_gene_s2 = s2_temp[gene_mask_s2]

    # drop the processed gene to speed up search
    # (the remaining tables shrink with every iteration)
    s1_temp.drop(s1_temp[gene_mask_s1].index, inplace=True)
    s2_temp.drop(s2_temp[gene_mask_s2].index, inplace=True)

    # if either data set has all zero TPM values for that gene, skip
    # (this also avoids dividing by zero in the normalization below)
    if np.sum(v_gene_s1['score']) == 0 or np.sum(v_gene_s2['score']) == 0:
        continue

    # calculate PAS usage by normalizing TPM per gene
    usage_s1 = np.array(v_gene_s1['score'])/np.sum(v_gene_s1['score'])
    usage_s2 = np.array(v_gene_s2['score'])/np.sum(v_gene_s2['score'])

    # one array of positions per site, from chromStart-LEN up to and
    # including chromEnd+LEN
    # NOTE(review): BED end coordinates are conventionally exclusive, so the
    # +1 may extend each site by one position - confirm the input convention.
    coords_s1 = [np.arange(i['chromStart']-LEN, i['chromEnd']+1+LEN) for _, i in v_gene_s1.iterrows()]
    coords_s2 = [np.arange(i['chromStart']-LEN, i['chromEnd']+1+LEN) for _, i in v_gene_s2.iterrows()]

    # match the sites
    matched_sites = match(coords_s1, coords_s2, OLP)
    # stop at the first gene with more than two matched pairs
    # NOTE(review): presumably a temporary stop for inspecting one example;
    # remove it to process every gene.
    if len(matched_sites) > 2:
        break


# TODO: compare the usage of matched sites (Jaccard is not defined in this cell)
#     sim.append(Jaccard(usage_s1[]))

    
    

  7%|▋         | 297/4532 [00:09<02:11, 32.13it/s]


In [64]:
# Matches for the last gene processed by the loop, as
# (index into coords_s1, index into coords_s2, overlap fraction) tuples.
matched_sites

[(0, 9, 1.0), (1, 12, 1.0), (2, 16, 1.0)]

In [65]:
# Per-site usage in s1 for the last gene processed by the loop
# (each site's score divided by the gene's total score).
usage_s1

array([0.2627866 , 0.45032334, 0.28689006])

In [66]:
# Per-site usage in s2 for the last gene processed by the loop
# (each site's score divided by the gene's total score).
usage_s2

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.0512825 ,
       0.        , 0.        , 0.74871772, 0.        , 0.        ,
       0.        , 0.19999978])

In [69]:
# Position arrays (one per site) built from s1 for the last gene processed by the loop.
coords_s1

[array([1435133, 1435134]),
 array([1435298, 1435299]),
 array([1435684, 1435685])]

In [68]:
# Position arrays (one per site) built from s2 for the last gene processed by the loop.
coords_s2

[array([1413822, 1413823]),
 array([1413829, 1413830, 1413831, 1413832, 1413833, 1413834, 1413835,
        1413836, 1413837, 1413838, 1413839, 1413840, 1413841, 1413842,
        1413843, 1413844, 1413845, 1413846, 1413847, 1413848, 1413849,
        1413850, 1413851, 1413852, 1413853, 1413854, 1413855, 1413856,
        1413857, 1413858]),
 array([1413884, 1413885, 1413886, 1413887, 1413888, 1413889, 1413890,
        1413891, 1413892, 1413893, 1413894, 1413895, 1413896, 1413897,
        1413898, 1413899, 1413900, 1413901, 1413902, 1413903, 1413904,
        1413905, 1413906, 1413907, 1413908, 1413909, 1413910, 1413911,
        1413912, 1413913, 1413914, 1413915, 1413916, 1413917]),
 array([1420459, 1420460]),
 array([1423689, 1423690, 1423691, 1423692, 1423693, 1423694, 1423695,
        1423696, 1423697, 1423698, 1423699, 1423700, 1423701, 1423702,
        1423703, 1423704, 1423705, 1423706, 1423707, 1423708, 1423709,
        1423710, 1423711, 1423712, 1423713, 1423714, 1423715, 1423716,
