In [1]:
import os
from pathlib import Path

import gffutils
import pandas as pd
import sqlite3

out_dir = "output"
Path(out_dir).mkdir(exist_ok=True)

## Create GFF databases and read in features

In [2]:
def create_db(path, db_filename, **kwargs):
    """
    Create/connect db for given gff file path
    """
    if not os.path.isfile(os.path.join(out_dir, db_filename)):
        db = gffutils.create_db(path, os.path.join(out_dir, db_filename), force=True, **kwargs)
    else:
        db = sqlite3.connect(os.path.join(out_dir, db_filename), check_same_thread=False)
        db = gffutils.FeatureDB(db)
    return db

# WormBase release WS283 canonical three_prime_UTR for C. elegans chromosome I
control_fn = "WormBase.I.three_prime_UTR.gff3"
db_control = create_db(control_fn, "control.db")
control = list(db_control.all_features())

# Output from GETUTR run with PAVA smoothing and s >= 0.95
test_fn = "getutr_output.gff3"
db_test = create_db(test_fn, "test_getutr.db")
test = list(db_test.all_features())

# WormBase release WS283 canonical mRNA transcripts for C. elegans chromosome I
genes_fn = "WormBase.I.mRNA.gff3"
db_genes = create_db(genes_fn, "genes.db")
genes = list(db_genes.all_features())

## Prepare control dataset

C. elegans contains alternative mRNA transcripts for each gene, and the control UTRs refer only to these mRNA transcript "names" in their attributes. GETUTR UTRs are represented by fragmented "sequence_features", each with an associated "s" score and referring only to these mRNA transcript "names" in their attributes. Therefore, it will be helpful to create a sequence name -> gene id mapping dataframe to allow comparisions between the two datasets.

In [3]:
gdf = pd.DataFrame([vars(c) for c in genes], columns=["featuretype", "start", "end", "strand", "attributes"])
gdf = gdf[gdf["featuretype"] == "mRNA"]
gdf["gene"] = gdf["attributes"].apply(lambda x: x.get("Parent")[0].split(":")[-1])
gdf["name"] = gdf["attributes"].apply(lambda x: x.get("Name")[0])
gdf = gdf[["name", "gene"]]
gdf

Unnamed: 0,name,gene
0,Y74C9A.3.1,WBGene00022277
1,Y74C9A.2a.3,WBGene00022276
2,Y74C9A.2a.1,WBGene00022276
3,Y74C9A.2a.2,WBGene00022276
4,Y74C9A.2b.1,WBGene00022276
...,...,...
4698,F31C3.1.1,WBGene00000881
4699,F31C3.3.1,WBGene00009285
4700,F31C3.4.1,WBGene00009286
4701,F31C3.5.1,WBGene00009287


Create a dataframe of canonical WormBase control UTRs with a "name" column consisting of the parent gene's name.

In [4]:
cdf = pd.DataFrame([vars(c) for c in control], columns=["start", "end", "source", "strand", "attributes"])
cdf = cdf[cdf.source == "WormBase"]
cdf['name'] = cdf["attributes"].apply(lambda x: x.get("Parent")[0].split(":")[-1])
cdf = cdf.drop(['attributes', "source"], axis=1)
cdf = pd.merge(cdf, gdf, on="name")
cdf

Unnamed: 0,start,end,strand,name,gene
0,4116,4220,-,Y74C9A.3.1,WBGene00022277
1,16586,16833,+,Y74C9A.2a.2,WBGene00022276
2,16586,16837,+,Y74C9A.2a.1,WBGene00022276
3,16702,16793,+,Y74C9A.2a.3,WBGene00022276
4,17484,17910,-,Y74C9A.4a.1,WBGene00022278
...,...,...,...,...,...
4192,15040474,15040608,-,F31C3.1.1,WBGene00000881
4193,15041433,15041884,-,F31C3.3.1,WBGene00009285
4194,15050797,15051145,-,F31C3.4.1,WBGene00009286
4195,15053608,15053793,-,F31C3.6a.1,WBGene00009288


Aggregate transcripts for minimum start base and maximum end base to capture full extent of the UTR for each gene.

In [5]:
control_utrs = cdf.groupby(["gene", "strand"]).agg({"start": min, "end": max}).reset_index("strand")
control_utrs

Unnamed: 0_level_0,strand,start,end
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
WBGene00000001,+,5109948,5110183
WBGene00000006,-,2571740,2571987
WBGene00000010,-,11388682,11389140
WBGene00000012,-,3886070,3886110
WBGene00000020,-,2255921,2256151
...,...,...,...
WBGene00304996,+,5341995,5342045
WBGene00305018,-,9588662,9588719
WBGene00306080,-,2939477,2939518
WBGene00306081,+,6872119,6872142


## Prepare test dataset

Create a dataframe of GETUTR test UTRs with a "gene" column consisting of the parent gene's name. We will aggregate the "sequence_features" for each gene name for minimum start base and maximum end base to capture the full extent of the UTR for that gene.

In [30]:
tdf = pd.DataFrame([vars(t) for t in test], columns=["start", "end", "strand", "attributes"])
tdf['name'] = tdf["attributes"].apply(lambda x: x.get("name")[0])
tdf = tdf.drop(['attributes'], axis=1)
tdf = pd.merge(tdf, gdf, on="name")
test_utrs = tdf.groupby(["strand", "gene"]).agg({"start": min, "end": max}).reset_index("strand")
test_utrs

Unnamed: 0_level_0,strand,start,end
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
WBGene00000001,+,5109949,5110316
WBGene00000023,+,12018641,12018859
WBGene00000079,+,7774293,7777525
WBGene00000084,+,7607752,7609660
WBGene00000096,+,9531100,9533701
...,...,...,...
WBGene00303032,-,4647835,4650729
WBGene00303056,-,7706598,7706960
WBGene00303057,-,5862563,5864973
WBGene00303058,-,14049858,14050372


# Compare the two datasets

Now that we have two comparable datasets, we can calculate some statistical differences. We determine length of each UTR in both dataframes, and determine differences of test UTR length to true UTR length.

In [32]:
control_utrs["len"] = abs(control_utrs["end"] - control_utrs["start"])
test_utrs["len"] = abs(test_utrs["end"] - test_utrs["start"])
test_utrs["difference"] = (test_utrs["len"] - control_utrs["len"])
test_utrs

Unnamed: 0_level_0,strand,start,end,len,difference
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WBGene00000001,+,5109949,5110316,367,132.0
WBGene00000023,+,12018641,12018859,218,174.0
WBGene00000079,+,7774293,7777525,3232,3047.0
WBGene00000084,+,7607752,7609660,1908,1414.0
WBGene00000096,+,9531100,9533701,2601,2568.0
...,...,...,...,...,...
WBGene00303032,-,4647835,4650729,2894,
WBGene00303056,-,7706598,7706960,362,
WBGene00303057,-,5862563,5864973,2410,
WBGene00303058,-,14049858,14050372,514,


## Supplementary Table 2 statistics

In [33]:
total_test = len(test_utrs)
total_control = len(control_utrs)

genes_missed_utr = control_utrs.index.difference(test_utrs.index)
num_missed_utrs = len(genes_missed_utr)
genes_no_utr = set(gdf["gene"]).difference(control_utrs.index)
novel_utr_genes = genes_no_utr.intersection(test_utrs.index)
num_novel_utrs = len(novel_utr_genes)

BP1 = 50
BP2 = 200
num_matched =  len(test_utrs[abs(test_utrs["difference"]) <= BP1])
num_reduced = len(test_utrs[test_utrs['difference'] < -BP1])
num_extended1 = len(test_utrs[(test_utrs['difference'] > BP1) & (test_utrs['difference'] <= BP2)])
num_extended2 = len(test_utrs[test_utrs['difference'] > BP2])

print("Canonical utrs that are missed by GETUTR: {} ({:.2f}%)".format(num_missed_utrs, 100*num_missed_utrs/total_control))
print("GETUTR utrs that match true utr to within {} bases: {} ({:.2f}%)".format(BP1, num_matched, 100*num_matched/total_control))
print("GETUTR utrs that reduce true utr by >{} bases: {} ({:.2f}%)".format(BP1, num_reduced, 100*num_reduced/total_control))
print("GETUTR utrs that extend true utr by between {} and {} bases: {} ({:.2f}%)".format(BP1, BP2, num_extended1, 100*num_extended1/total_control))
print("GETUTR utrs that extend true utr by >{} bases: {} ({:.2f}%)".format(BP2, num_extended2, 100*num_extended2/total_control))
print("GETUTR utrs called where no true utr existed previously: {}".format(num_novel_utrs))
print("Total GETUTR UTRs: {}".format(total_test))

Canonical utrs that are missed by GETUTR: 568 (21.91%)
GETUTR utrs that match true utr to within 50 bases: 155 (5.98%)
GETUTR utrs that reduce true utr by >50 bases: 513 (19.78%)
GETUTR utrs that extend true utr by between 50 and 200 bases: 133 (5.13%)
GETUTR utrs that extend true utr by >200 bases: 1224 (47.20%)
GETUTR utrs called where no true utr existed previously: 151
Total GETUTR UTRs: 2176


Save list of gene ids with novel 3' UTRs called by GETUTR

In [29]:
with open(os.path.join(out_dir, "novel_utr_genes_getutr.txt"), "w") as f:
    for g in sorted(list(novel_utr_genes)):
        print(g)
        f.write(g + "\n")

WBGene00000013
WBGene00000612
WBGene00000628
WBGene00000632
WBGene00001067
WBGene00001614
WBGene00001640
WBGene00001641
WBGene00001745
WBGene00001941
WBGene00003977
WBGene00004196
WBGene00005046
WBGene00005048
WBGene00005049
WBGene00005050
WBGene00005051
WBGene00005524
WBGene00005528
WBGene00005529
WBGene00006044
WBGene00006047
WBGene00006399
WBGene00007638
WBGene00007658
WBGene00007708
WBGene00007770
WBGene00007854
WBGene00007855
WBGene00007976
WBGene00008161
WBGene00008188
WBGene00008291
WBGene00008381
WBGene00008475
WBGene00008479
WBGene00008529
WBGene00008662
WBGene00008855
WBGene00008885
WBGene00009128
WBGene00009251
WBGene00009252
WBGene00009263
WBGene00009359
WBGene00009362
WBGene00009607
WBGene00009646
WBGene00009667
WBGene00009696
WBGene00009960
WBGene00009961
WBGene00010291
WBGene00010401
WBGene00010568
WBGene00010577
WBGene00010586
WBGene00010740
WBGene00010821
WBGene00010857
WBGene00011234
WBGene00011651
WBGene00011658
WBGene00011659
WBGene00011780
WBGene00011781
WBGene0001