Brian asked for:

> Hi,
>
> If you can easily query for all Dmel genes with one transcript and no introns and send that to Astrid, that would be great.  Cc me.
>
> B

In [1]:
# %load ../start.py
# Load useful extensions
import os
import sys

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 1

# Set up cashdir
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -v -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
sys.path.insert(0, '../../lcdb-wf/lib')
sys.path.insert(0, '../../lib/python')

# Set up references
import yaml
with open('../../config/config.yml') as fh:
    config = yaml.load(fh)

assembly = config['assembly']
tag = config['aligner']['tag']
REF = os.path.join(os.environ['REFERENCES_DIR'], assembly, tag)


last updated: 2017-09-20 

CPython 3.5.2
IPython 6.1.0
Git hash: 6b3b5c6070b965c364a696bb9e37f7c8a5693542


In [2]:
# Imports
import gffutils
import pandas as pd

In [3]:
# Open the pre-built gffutils database for the FlyBase r6.16 annotation
GFF_DB_PATH = '../../output/dmel-all-no-analysis-r6.16.gff.db'
db = gffutils.FeatureDB(GFF_DB_PATH)

In [6]:
# Alphabetical list of every feature type present in the database
sorted(db.featuretypes())

['BAC_cloned_genomic_insert',
 'CDS',
 'DNA_motif',
 'RNAi_reagent',
 'TF_binding_site',
 'breakpoint',
 'chromosome',
 'chromosome_band',
 'complex_substitution',
 'deletion',
 'enhancer',
 'exon',
 'exon_junction',
 'five_prime_UTR',
 'gene',
 'golden_path_region',
 'insertion_site',
 'insulator',
 'intron',
 'mRNA',
 'mature_peptide',
 'miRNA',
 'modified_RNA_base_feature',
 'ncRNA',
 'oligonucleotide',
 'origin_of_replication',
 'orthologous_region',
 'orthologous_to',
 'pcr_product',
 'point_mutation',
 'polyA_site',
 'pre_miRNA',
 'protein',
 'protein_binding_site',
 'pseudogene',
 'rRNA',
 'region',
 'regulatory_region',
 'repeat_region',
 'rescue_fragment',
 'sequence_variant',
 'silencer',
 'snRNA',
 'snoRNA',
 'syntenic_region',
 'tRNA',
 'tandem_repeat',
 'three_prime_UTR',
 'transcription_start_site',
 'transposable_element',
 'transposable_element_insertion_site',
 'uncharacterized_change_in_nucleotide_sequence']

In [11]:
# Materialize every gene, mRNA and ncRNA feature so they can be counted
genes, mRNAs, ncRNAs = (
    list(db.features_of_type(featuretype))
    for featuretype in ('gene', 'mRNA', 'ncRNA')
)

In [12]:
# Report the size of each feature list with thousands separators
count_template = """
# genes:  {:,}
# mRNAs:  {:,}
# ncRNAs: {:,}
"""
print(count_template.format(*(len(features) for features in (genes, mRNAs, ncRNAs))))


# genes:  17,730
# mRNAs:  30,490
# ncRNAs: 2,927



In [40]:
# Collect the IDs of genes that have exactly one mRNA child and whose mRNA
# has no intron children.
# NOTE(review): only 'mRNA' children are inspected, so a gene is judged on its
# mRNA count alone -- confirm genes never mix mRNAs with other transcript types.
intronless = set()
for gene in db.features_of_type('gene'):
    transcripts = list(db.children(gene, featuretype='mRNA'))
    if len(transcripts) != 1:
        continue  # zero or several mRNAs: not a single-transcript gene
    # Only the presence of an intron matters, so stop at the first one found.
    has_intron = any(
        True for _ in db.children(transcripts[0], featuretype='intron')
    )
    if not has_intron:
        intronless.add(gene.id)

# The set holds gene IDs, so the number reported is a count of genes.
print('In FlyBase r6.16 there are {:,} genes that have a '
      'single mRNA transcript with no introns.'.format(len(intronless)))

In FlyBase r6.16 there are 1,803 mRNAs that have a single transcript with no introns.


In [42]:
# Collect the IDs of genes that have exactly one ncRNA child and whose ncRNA
# has no intron children.
# NOTE(review): only 'ncRNA' children are inspected; genes whose sole
# transcript is another type (e.g. tRNA, snoRNA, snRNA, rRNA, miRNA) are not
# picked up here -- confirm that is intended for this request.
intronless_nc = set()
for gene in db.features_of_type('gene'):
    transcripts = list(db.children(gene, featuretype='ncRNA'))
    if len(transcripts) != 1:
        continue  # zero or several ncRNAs: not a single-transcript gene
    # Only the presence of an intron matters, so stop at the first one found.
    has_intron = any(
        True for _ in db.children(transcripts[0], featuretype='intron')
    )
    if not has_intron:
        intronless_nc.add(gene.id)

# The set holds gene IDs, so the number reported is a count of genes.
print('In FlyBase r6.16 there are {:,} genes that have a '
      'single ncRNA transcript with no introns.'.format(len(intronless_nc)))

In FlyBase r6.16 there are 1,517 ncRNAs that have a single transcript with no introns.


In [37]:
# Read the tab-separated FlyBase annotation table for this assembly/tag
annotation_file = 'fb_annotation/{}_{}.fb_annotation'.format(assembly, tag)
anno = pd.read_csv(os.path.join(REF, annotation_file), sep='\t')

In [55]:
# Index the annotation table by its 'primary_FBgn' column so it can later be
# merged on the index. Guarded so that re-running this cell is a no-op rather
# than a KeyError (after the first run 'primary_FBgn' is the index and no
# longer a column).
if anno.index.name != 'primary_FBgn':
    anno = anno.set_index('primary_FBgn')

In [43]:
# One row per intronless single-mRNA gene, labelled with its transcript type.
# The IDs are sorted because set iteration order is arbitrary, which would
# make the row order of the exported table differ between runs.
dfM = pd.DataFrame({'type': 'mRNA'}, index=sorted(intronless))

In [44]:
# One row per intronless single-ncRNA gene, labelled with its transcript type.
# The IDs are sorted because set iteration order is arbitrary, which would
# make the row order of the exported table differ between runs.
dfN = pd.DataFrame({'type': 'ncRNA'}, index=sorted(intronless_nc))

In [79]:
# Stack the mRNA and ncRNA gene tables into a single table (mRNA rows first)
type_frames = [dfM, dfN]
df = pd.concat(type_frames)

In [76]:
# Sanity check: the row count should equal len(intronless) + len(intronless_nc)
df.shape

(3320, 1)

In [80]:
# Attach the annotation columns to each gene; a left join on the index keeps
# every gene in df even when it has no matching annotation row.
dd = pd.merge(df, anno, how='left', left_index=True, right_index=True)

In [81]:
# Name the index and add a per-gene link built from the gene ID
dd.index.name = 'FBgn'
linkout_template = 'http://flybase.org/cgi-bin/gbrowse2/dmel/?Search=1;name={0}'
dd['linkout'] = [linkout_template.format(fbgn) for fbgn in dd.index]

In [82]:
# Write the final table as TSV, with the FBgn index promoted to a column
astrid_table = dd.reset_index()
astrid_table.to_csv('../../output/astrid_list.tsv', sep='\t', index=False)