In [1]:
import os
import pandas as pd
import gffutils

In [41]:
# Length, in bases, of the estimated 3' UTR extension added to every annotated
# gene; each extension is written to the annotation as an extra exon whose
# source is '3_prime_UTR_estimate'.
# Alternative values noted by the author: 200, 400.
N_ext = 300

In [42]:
# Where the reference annotation lives and what it is called.
#   genome_assy_dir : directory holding the annotation file
#   kl_ref_base     : file-name stem of the annotation (no extension)
#   kl_ref_fn       : full path of the reference GTF
# NOTE(review): genome_assy_dir is an absolute path on one specific machine.
genome_assy_dir = os.path.normpath(
    'C:\\Users\\BMH_work\\Google Drive\\UCSF\\ElSamad_Lab\\PKA\\Bioinformatics\\genome_assembly\\klac\\'
)
# Alternative stem noted by the author: 'scer_ref_test'
kl_ref_base = 'Kluyveromyces_lactis.ASM251v1.32'
kl_ref_fn = os.path.join(genome_assy_dir, kl_ref_base + '.gtf')


In [43]:
# Database file for this run.  N_ext is part of the file name, so each
# extension length gets its own database.
dbfn = os.path.join(genome_assy_dir, '{}_{}.db'.format(kl_ref_base, N_ext))

# Build a fresh database from the reference GTF.  force=True overwrites an
# existing file of the same name, and gene/transcript inference is disabled.
# Options the author flagged as worth revisiting:
#   - merge_strategy could be "warning" or "merge" instead of "error"
#   - sort_attribute_values might not be wanted
gtf_db = gffutils.create_db(
    kl_ref_fn,
    dbfn=dbfn,
    force=True,
    keep_order=True,
    merge_strategy='error',
    sort_attribute_values=True,
    disable_infer_transcripts=True,
    disable_infer_genes=True,
)

# To reuse a database built earlier instead of rebuilding it:
# gtf_db = gffutils.FeatureDB(dbfn)

In [44]:


# Build one synthetic exon per annotated gene.  Each new exon spans the gene
# itself plus N_ext extra bases on its 3' side: past `end` for '+' strand
# genes, before `start` for '-' strand genes.  The features are only collected
# in feature_list here; nothing is written to the database in this cell.
#
# exon_number has to be a string, so every added exon is labelled 'ext' rather
# than being numbered after the gene's existing exons.  If that turns out to
# be a problem, the existing exons could be counted and the extension numbered
# as one more.
#
# Limitation: '+' strand extensions are not checked against chromosome length
# (not available in this cell), so `end` may run past the end of a chromosome.
# '-' strand extensions are clamped so `start` never drops below 1.

# Column values shared by every added exon.
score = '.'
frame = '.'
source = '3_prime_UTR_estimate'
featuretype = 'exon'

feature_list = []

# Ask the database for gene features only instead of scanning every feature.
for gene in gtf_db.features_of_type('gene'):
    chromosome = gene.seqid
    strand = gene.strand

    if strand == '-':
        # GTF coordinates are 1-based: a gene closer than N_ext bases to the
        # chromosome start must not get a zero or negative start.
        start = max(1, gene.start - N_ext)
        end = gene.end
    elif strand == '+':
        start = gene.start
        end = gene.end + N_ext
    else:
        raise ValueError('expected either + or -, got ' + str(strand))

    # When a gene has several transcript children, the last one returned by
    # the database is the one used.
    transcripts = [child for child in gtf_db.children(gene.id)
                   if child.featuretype == 'transcript']
    transcript = transcripts[-1] if transcripts else None

    new_attributes_dict = {
        'exon_number': ['ext'],
        'gene_id': [gene.id],
        'gene_source': [gene.source],
        'gene_biotype': [gene['gene_biotype'][0]],
    }

    if transcript is not None:
        new_attributes_dict['transcript_source'] = [transcript.source]
        new_attributes_dict['transcript_biotype'] = [transcript['transcript_biotype'][0]]
        new_attributes_dict['exon_id'] = [transcript.id + '-ext']
    else:
        # Fall back to the gene id so the added exon still gets a unique exon_id.
        print("No transcript for gene " + gene.id)
        new_attributes_dict['exon_id'] = [gene.id + '-ext']

    new_feature = gffutils.Feature(
        seqid=chromosome,
        source=source,
        featuretype=featuretype,
        start=start,
        end=end,
        score=score,
        strand=strand,
        frame=frame,
        attributes=new_attributes_dict,
    )
    feature_list.append(new_feature)



In [45]:
# Sanity check before touching the database: the number of extension exons
# built, followed by one line per feature.
report_lines = [str(len(feature_list))]
report_lines.extend(str(new_exon) for new_exon in feature_list)
print('\n'.join(report_lines))

5534
F	3_prime_UTR_estimate	exon	2473	3819	.	+	.	exon_number=ext;gene_id=KLLA0_F00132g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAG97792-ext
F	3_prime_UTR_estimate	exon	4320	6143	.	+	.	exon_number=ext;gene_id=KLLA0_F00154g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAG97793-ext
F	3_prime_UTR_estimate	exon	6230	6748	.	-	.	exon_number=ext;gene_id=KLLA0_F00176g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAG97794-ext
F	3_prime_UTR_estimate	exon	7833	8573	.	+	.	exon_number=ext;gene_id=KLLA0_F00220g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAG97796-ext
F	3_prime_UTR_estimate	exon	8992	10560	.	+	.	exon_number=ext;gene_id=KLLA0_F00242g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;

E	3_prime_UTR_estimate	exon	891885	893201	.	+	.	exon_number=ext;gene_id=KLLA0_E10077g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAG99491-ext
E	3_prime_UTR_estimate	exon	892971	893972	.	-	.	exon_number=ext;gene_id=KLLA0_E10099g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAG99492-ext
E	3_prime_UTR_estimate	exon	894620	896809	.	+	.	exon_number=ext;gene_id=KLLA0_E10121g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAG99493-ext
E	3_prime_UTR_estimate	exon	896299	899229	.	-	.	exon_number=ext;gene_id=KLLA0_E10143g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAG99494-ext
E	3_prime_UTR_estimate	exon	899451	900149	.	-	.	exon_number=ext;gene_id=KLLA0_E10165g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=p

C	3_prime_UTR_estimate	exon	1590436	1591629	.	-	.	exon_number=ext;gene_id=KLLA0_C17864g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAH01849-ext
C	3_prime_UTR_estimate	exon	1591833	1592204	.	-	.	exon_number=ext;gene_id=KLLA0_C17886r;gene_source=ena;gene_biotype=tRNA;transcript_source=ena;transcript_biotype=tRNA;exon_id=KLLA0_C17886r-1-ext
C	3_prime_UTR_estimate	exon	1592133	1592504	.	+	.	exon_number=ext;gene_id=EBG00000981836;gene_source=Rfam;gene_biotype=tRNA;transcript_source=Rfam;transcript_biotype=tRNA;exon_id=EBT00001977631-ext
C	3_prime_UTR_estimate	exon	1592316	1592689	.	-	.	exon_number=ext;gene_id=KLLA0_C17908r;gene_source=ena;gene_biotype=tRNA;transcript_source=ena;transcript_biotype=tRNA;exon_id=KLLA0_C17908r-1-ext
C	3_prime_UTR_estimate	exon	1592949	1594123	.	+	.	exon_number=ext;gene_id=KLLA0_C17930g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAH01

A	3_prime_UTR_estimate	exon	99292	100521	.	+	.	exon_number=ext;gene_id=KLLA0_A01045g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAH02633-ext
A	3_prime_UTR_estimate	exon	99959	100867	.	-	.	exon_number=ext;gene_id=KLLA0_A01067g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAH02634-ext
A	3_prime_UTR_estimate	exon	102036	103037	.	+	.	exon_number=ext;gene_id=KLLA0_A01089g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAH02635-ext
A	3_prime_UTR_estimate	exon	102502	103431	.	-	.	exon_number=ext;gene_id=KLLA0_A01111g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=protein_coding;exon_id=CAH02636-ext
A	3_prime_UTR_estimate	exon	103746	104879	.	+	.	exon_number=ext;gene_id=KLLA0_A01133g;gene_source=ena;gene_biotype=protein_coding;transcript_source=ena;transcript_biotype=pro

In [39]:
# Development aid, disabled so the notebook runs top to bottom without opening
# the source pager.  Uncomment to view the source of gffutils' database creator:
# gffutils.create._GFFDBCreator??

In [46]:
# Commit the extension exons to the database once the printed feature list
# has been checked.
#
# Statement order matters: the open connection is closed, update() adds the
# features in feature_list, and the database is then reopened from dbfn so
# that gtf_db is usable again in the cells that follow.
# NOTE(review): closing the connection first presumably releases the database
# file before update() works on it - confirm.
#
# This step is very slow.  A direct SQL INSERT might be a faster way to add
# rows, e.g. (untested sketch):
# insert_cmd = "INSERT INTO features (id, seqid, source, featuretype, start, end, score, strand, frame) values ('3_prime_UTR_estimate_2','A', '3_prime_UTR_estimate', 'exon', 100,200, '.', '-', '.')"
# gtf_db.conn.execute(insert_cmd)

gtf_db.conn.close()
gtf_db.update(feature_list)

gtf_db = gffutils.FeatureDB(dbfn)

21915 of 22136 (99%)

In [47]:
# Debug aid (disabled): loop over gtf_db.all_features() and print feature.id,
# feature.featuretype and the feature itself to inspect the updated database.

# Write every feature in the updated database to a new GTF file, preceded by
# the genome-build header lines and a note describing the added extensions.
out_gtf_fn = os.path.join(genome_assy_dir, kl_ref_base + '_UTR_' + str(N_ext) + '.gtf')

header_lines = [
    '#!genome-build ASM251v1\n',
    '#!genome-version ASM251v1\n',
    '#!genome-date 2015-02\n',
    '#!genome-build-accession GCA_000002515.1\n',
    '#!genebuild-last-updated 2015-02\n',
    '# Added ' + str(N_ext) + ' bp extension exons for each gene as 3_prime_UTR_estimates. BMH 20181125\n',
]

with open(out_gtf_fn, 'w') as outfile:
    outfile.write(''.join(header_lines))
    for feature in gtf_db.all_features():
        # One feature per line, using the feature's own string form.
        outfile.write(str(feature) + '\n')
            

In [48]:
# Finished with the database: close its underlying connection.
gtf_db.conn.close()

In [15]:
#some useful commands to access / change data in gffutils. 

#db_id = 'YEL058W'
#cursor = merged_db.execute('select * from features where id ="%s"' % db_id)
#row = cursor.fetchone()
#row['end']

#feat = list(merged_db.features_of_type('three_prime_UTR'))[0]


#pd.read_sql('select * from features;', merged_db.conn)


In [2]:
# Attribute-rewriting callbacks: after a parent/child relationship has been
# assigned in a gffutils database, these change the text stored in the
# attributes column of the features involved.
def parent_func(parent, child):
    """Record ``child`` on ``parent`` and hand the parent back.

    Stores ``child.id`` under the ``'child'`` key of ``parent.attributes``,
    replacing any value already stored there.  ``child`` is not modified.

    Returns the same ``parent`` object that was passed in.
    """
    child_id = child.id
    parent.attributes['child'] = child_id
    return parent
    
def child_func(parent, child):
    """Record ``parent`` on ``child`` and hand the child back.

    Stores ``parent.id`` under the ``'Parent'`` key of ``child.attributes``,
    replacing any value already stored there.  ``parent`` is not modified.

    Returns the same ``child`` object that was passed in.
    """
    parent_id = parent.id
    child.attributes['Parent'] = parent_id
    return child