## Reannotates the S. cerevisiae GFF using empirically discovered 3' UTRs from Nagalakshmi et al. (2008)

In [28]:
import os
import pandas as pd
import gffutils

In [29]:
#these functions change the text in the attributes column after assigning a parent/child
#relationship in a gffutils database. 
def parent_func(parent, child):
    """Tag the parent feature with the id of its child.

    Stores ``child.id`` under the ``'child'`` key of ``parent.attributes``
    and returns the (now modified) parent.
    """
    child_identifier = child.id
    parent.attributes['child'] = child_identifier
    return parent
    
def child_func(parent, child):
    """Tag the child feature with the id of its parent.

    Stores ``parent.id`` under the ``'Parent'`` key of ``child.attributes``
    and returns the (now modified) child.
    """
    parent_identifier = parent.id
    child.attributes['Parent'] = parent_identifier
    return child

In [30]:
#Locations of the input files; sc_ref_base is the stem shared by the files derived from the gff.
#Alternative directory used on a Windows machine:
#os.path.normpath('C:\\Users\\BMH_work\\Google Drive\\UCSF\\ElSamad_Lab\\PKA\\Bioinformatics\\genome_assembly')
genome_assy_dir = os.path.normpath("/home/heineike/genomes/scer_20181114")

sc_ref_base = 'saccharomyces_cerevisiae_R64-2-1_20150113'  #test value: 'scer_ref_test'

#Original gff and the 3'UTR gff (test value for the latter: ' Nag_gff_test')
sc_ref_fn = os.path.join(genome_assy_dir, sc_ref_base + '.gff')
utr3p_fn = os.path.join(genome_assy_dir, 'Nagalakshmi_2008_3UTRs_V64.gff3')




## Change chromosome names in the original S. cerevisiae annotation (gff) file



In [32]:
#Builds a gffutils database from the original gff.
#When troubleshooting, an open database may need to be closed before it is remade:
#merged_db.conn.close()

orig_gff_db_fn = os.path.join(genome_assy_dir, sc_ref_base + '.gff')
orig_gff_db_path = os.path.join(genome_assy_dir, sc_ref_base + '_orig.db')

orig_gff_db = gffutils.create_db(
    orig_gff_db_fn,
    dbfn=orig_gff_db_path,
    force=True,
    keep_order=True,
    merge_strategy='merge',
    sort_attribute_values=True
)

In [33]:
#Maps every 'chr'-prefixed chromosome name to its bare roman numeral (and chrmt to Mito)
#so that seqids match the names that the SAM files from lexogen use.
roman_numerals = ['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI']
chromosome_rename_dict = dict(('chr' + num, num) for num in roman_numerals)
chromosome_rename_dict['chrmt'] = 'Mito'

rename_sql = "update features set seqid='{}' where seqid='{}'"
for old_chr in chromosome_rename_dict:
    new_chr = chromosome_rename_dict[old_chr]
    print('Old Chromosome name: {}.  New Chromosome name: {}'.format(old_chr, new_chr))
    orig_gff_db.execute(rename_sql.format(new_chr, old_chr))

Old Chromosome name: chrmt.  New Chromosome name: Mito
Old Chromosome name: chrIV.  New Chromosome name: IV
Old Chromosome name: chrXIV.  New Chromosome name: XIV
Old Chromosome name: chrXIII.  New Chromosome name: XIII
Old Chromosome name: chrVIII.  New Chromosome name: VIII
Old Chromosome name: chrXI.  New Chromosome name: XI
Old Chromosome name: chrXV.  New Chromosome name: XV
Old Chromosome name: chrX.  New Chromosome name: X
Old Chromosome name: chrI.  New Chromosome name: I
Old Chromosome name: chrIII.  New Chromosome name: III
Old Chromosome name: chrXVI.  New Chromosome name: XVI
Old Chromosome name: chrVII.  New Chromosome name: VII
Old Chromosome name: chrXII.  New Chromosome name: XII
Old Chromosome name: chrV.  New Chromosome name: V
Old Chromosome name: chrVI.  New Chromosome name: VI
Old Chromosome name: chrII.  New Chromosome name: II
Old Chromosome name: chrIX.  New Chromosome name: IX


In [34]:
#Print every feature of the renamed database to a new gff file.
#The version header must end with a newline; without it the first feature is
#appended to the '##gff-version 3' line instead of starting on its own line.
chr_rename_gff_fn = genome_assy_dir + os.sep + sc_ref_base + '_chr_rename.gff'
with open(chr_rename_gff_fn, 'w') as outfile:
    outfile.write('##gff-version 3\n')
    for feature in orig_gff_db.all_features():
        print(feature, file=outfile)

In [35]:
#after the file was printed manually removed an open quote that was contained in line 18642:
#line = 'XIV	SGD	gene	555048	556832	.	+	.	ID=YNL039W;dbxref=SGD:S000004984;Name=YNL039W;Note=Essential subunit of RNA polymerase III transcription factor (TFIIIB)%3B TFIIIB is involved in transcription of genes encoding tRNAs%2C 5S rRNA%2C U6 snRNA%2C and other small RNAs;display=Essential subunit of RNA polymerase III transcription factor (TFIIIB);Ontology_term=GO:0000126,GO:0001026,GO:0001112,GO:0001156,GO:0070896,GO:0070898;orf_classification=Verified;gene=BDP1;Alias=B",BDP1,TFC5,TFC7,TFIIIB90,transcription factor TFIIIB subunit BDP1'
#YNL039W, BDP1 changed B" alias to Bprimeprime



#Rewrites the renamed gff in place, replacing the B" alias (which contains a bare
#double quote) with Bprimeprime.
#str.replace keeps the whole line intact however often the marker occurs; splitting
#on the marker and re-joining only the first two pieces would silently drop the
#remainder of the line (including its newline) after a second occurrence.
chr_rename_gff_fn = genome_assy_dir + os.sep + sc_ref_base + '_chr_rename.gff'

with open(chr_rename_gff_fn, 'r') as f:
    lines = f.readlines()

with open(chr_rename_gff_fn, 'w') as f:
    for line in lines:
        f.write(line.replace('Alias=B"', 'Alias=Bprimeprime'))

In [36]:
#If the output looked good, commit the pending changes made through orig_gff_db
#(the seqid updates) on its underlying database connection.
orig_gff_db.conn.commit()

## Add the Nagalakshmi et al. 3'UTR annotations to the SGD annotation file

In [6]:
#Concatenates the features of the reference gff and of the 3'UTR gff into one new file.

nagdata_gff_fn = os.path.join(genome_assy_dir, sc_ref_base + '_nagdata.gff')
with open(nagdata_gff_fn, 'w') as outfile:
    with open(sc_ref_fn) as sc_ref_file:
        #the very first line is copied over unchanged
        first_line = sc_ref_file.readline()
        outfile.write(first_line)
        #the following 17 lines are treated as header lines and discarded
        for skip_idx in range(17):
            sc_ref_file.readline()
        #copy lines up to (but not including) the first one that does not begin with 'chr'
        for ref_line in sc_ref_file:
            if not ref_line.startswith('chr'):
                break
            outfile.write(ref_line)
    #append the UTR data
    with open(utr3p_fn) as utr3p_file:
        #the first 42 lines are treated as header lines and discarded
        for skip_idx in range(42):
            utr3p_file.readline()
        outfile.writelines(utr3p_file)

In [7]:
#Builds a gffutils database from the merged gff.
#When troubleshooting, an open database may need to be closed before it is remade:
#merged_db.conn.close()

merged_fn = os.path.join(genome_assy_dir, sc_ref_base + '_nagdata.gff')
merged_db_path = os.path.join(genome_assy_dir, sc_ref_base + '_nagdata.db')

merged_db = gffutils.create_db(
    merged_fn,
    dbfn=merged_db_path,
    force=True,
    keep_order=True,
    merge_strategy='merge',
    sort_attribute_values=True
)

In [None]:
#When troubleshooting, an open database may need to be closed before it is remade:
#merged_db.conn.close()

#Relates every 3'UTR to the feature whose id is the part of the UTR id before the
#first '_': that id is passed as the parent and the UTR as the child, and the two
#callbacks write the 'child' / 'Parent' attributes accordingly.
for utr_3p in merged_db.features_of_type('three_prime_UTR'):
    gene_id = utr_3p.id.partition('_')[0]

    try:
        merged_db.add_relation(gene_id, utr_3p, 1, parent_func=parent_func, child_func=child_func)
    except gffutils.FeatureNotFoundError:
        #no feature with that id exists; report it and move on to the next UTR
        print('There is no matching orf for the 3prime UTR ' + gene_id)

In [9]:
#Print every feature of the merged database (now carrying child/Parent attributes) to file.
#The version header must end with a newline; without it the first feature is
#appended to the '##gff-version 3' line instead of starting on its own line.
utr_children_gff_fn = genome_assy_dir + os.sep + sc_ref_base + '_nagdata_UTRchildren.gff'
with open(utr_children_gff_fn, 'w') as outfile:
    outfile.write('##gff-version 3\n')
    for feature in merged_db.all_features():
        print(feature, file=outfile)

In [12]:
#Uses merge_Nag_scerR64.sh to
#sort the combined UTR and annotation file and then merge the coordinates of the UTR and previous gene
#to get new coordinates for the gene, with bedtools.
#The file prefix handed to the script is built from genome_assy_dir and sc_ref_base so that it
#always matches the locations the other cells read from and write to.
merge_cmd = ['/home/heineike/github/UTR_annotation/UTR_annotation/merge_Nag_scerR64.sh',
             genome_assy_dir + os.sep + sc_ref_base]

merge_exit_status = os.system(' '.join(merge_cmd))

#os.system does not raise when the command fails; stop here so that later cells
#do not go on to read missing or stale output files.
if merge_exit_status != 0:
    raise RuntimeError('merge_Nag_scerR64.sh failed with exit status {}'.format(merge_exit_status))

merge_exit_status


0

In [13]:
#Builds a dict, keyed by gene id, of the coordinates that need to be changed.
#Columns of the merged table are addressed by position: 1 = start, 2 = end, 3 = strand, 5 = attributes.
merged_table_fn = os.path.join(genome_assy_dir, sc_ref_base + '_nagdata_UTRchildren_merged')
merge_table = pd.read_table(merged_table_fn, header=None)

coord_change_dict = {}

for row_label, merged_row in merge_table.iterrows():
    attribute_fields = merged_row[5].split(";")
    annotation_ids = [field.split("=")[1] for field in attribute_fields if field.split("=")[0] == "ID"]
    for ann_id in annotation_ids:
        id_parts = ann_id.split('_')
        #only ids whose second '_'-separated piece is '3UTR' trigger a coordinate change
        if len(id_parts) < 2 or id_parts[1] != '3UTR':
            continue

        strand = merged_row[3]
        coord_change = {}
        #the start coordinate of merged items on the + strand came out of bedtools
        #with one subtracted, so it is added back for that strand only
        #NOTE(review): if the merged table uses BED-style 0-based starts, '-' strand
        #rows may need the same +1 - confirm against the merge script output
        if strand == '+':
            coord_change['start'] = merged_row[1] + 1
        elif strand == '-':
            coord_change['start'] = merged_row[1]
        coord_change['end'] = merged_row[2]
        coord_change['UTR_id'] = ann_id

        coord_change_dict[id_parts[0]] = coord_change


#coord_change_dict

In [14]:
#Builds a gffutils database from the sorted gff produced by the bedtools step.
#When troubleshooting, an open database may need to be closed before reloading:
#merged_sorted_db.conn.close()

merged_sorted_fn = os.path.join(genome_assy_dir, sc_ref_base + '_nagdata_UTRchildren_sorted.gff')
merged_sorted_db_path = os.path.join(genome_assy_dir, sc_ref_base + '_nagdata_UTRchildren_sorted.db')

merged_sorted_db = gffutils.create_db(
    merged_sorted_fn,
    dbfn=merged_sorted_db_path,
    force=True,
    keep_order=True,
    merge_strategy='merge',
    sort_attribute_values=True
)

# merged_sorted_db.schema()
# cursor = merged_sorted_db.execute("select id from features where seqid = 'I'")
# row = cursor.fetchone()

In [15]:
#Maps every 'chr'-prefixed chromosome name to its bare roman numeral (and chrmt to Mito)
#so that seqids match the names that the SAM files from lexogen use.
roman_numerals = ['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI']
chromosome_rename_dict = dict(('chr' + num, num) for num in roman_numerals)
chromosome_rename_dict['chrmt'] = 'Mito'

rename_sql = "update features set seqid='{}' where seqid='{}'"
for old_chr in chromosome_rename_dict:
    new_chr = chromosome_rename_dict[old_chr]
    print('Old Chromosome name: {}.  New Chromosome name: {}'.format(old_chr, new_chr))
    merged_sorted_db.execute(rename_sql.format(new_chr, old_chr))

Old Chromosome name: chrmt.  New Chromosome name: Mito
Old Chromosome name: chrIV.  New Chromosome name: IV
Old Chromosome name: chrXIV.  New Chromosome name: XIV
Old Chromosome name: chrXIII.  New Chromosome name: XIII
Old Chromosome name: chrVIII.  New Chromosome name: VIII
Old Chromosome name: chrXI.  New Chromosome name: XI
Old Chromosome name: chrXV.  New Chromosome name: XV
Old Chromosome name: chrX.  New Chromosome name: X
Old Chromosome name: chrI.  New Chromosome name: I
Old Chromosome name: chrIII.  New Chromosome name: III
Old Chromosome name: chrXVI.  New Chromosome name: XVI
Old Chromosome name: chrVII.  New Chromosome name: VII
Old Chromosome name: chrXII.  New Chromosome name: XII
Old Chromosome name: chrV.  New Chromosome name: V
Old Chromosome name: chrVI.  New Chromosome name: VI
Old Chromosome name: chrII.  New Chromosome name: II
Old Chromosome name: chrIX.  New Chromosome name: IX


In [16]:
#Writes the new start and end coordinates of every gene in coord_change_dict into the database.
start_sql = "update features set start={} where id = '{}'"
end_sql = "update features set end={} where id = '{}'"

for n_done, gene_id in enumerate(coord_change_dict, 1):
    coord_change = coord_change_dict[gene_id]
    new_start = coord_change['start']
    new_end = coord_change['end']
    end_statement = end_sql.format(new_end, gene_id)
    #echo the end-update statement on every 1000th gene as a progress indicator
    if n_done % 1000 == 0:
        print(end_statement)
    merged_sorted_db.execute(start_sql.format(new_start, gene_id))
    merged_sorted_db.execute(end_statement)

update features set end=68527 where id = 'YNL299W'
update features set end=509280 where id = 'YMR120C'
update features set end=818438 where id = 'YOR262W'
update features set end=744305 where id = 'YOR212W'
update features set end=51002 where id = 'YBL089W'


In [17]:
#Write every feature of the updated database to the final gff, below a version header.
utrs_gff_fn = os.path.join(genome_assy_dir, sc_ref_base + '_UTRs.gff')
with open(utrs_gff_fn, 'w') as outfile:
    outfile.write('##gff-version 3\n')
    for feature in merged_sorted_db.all_features():
        outfile.write(str(feature) + '\n')

#In the backup file the child tag was at the end of many of the lines.  This most recent time I ran it, 
#the child tag was in the middle. Also the size was slightly different because the backup had windows CR LF instead of unix LF

In [25]:
#after the file was printed manually removed an open quote that was contained in line 22795:
#line = 'XIV	SGD	gene	555048	556886	.	+	.	ID=YNL039W;dbxref=SGD:S000004984;Name=YNL039W;Note=Essential subunit of RNA polymerase III transcription factor (TFIIIB)%3B TFIIIB is involved in transcription of genes encoding tRNAs%2C 5S rRNA%2C U6 snRNA%2C and other small RNAs;display=Essential subunit of RNA polymerase III transcription factor (TFIIIB);Ontology_term=GO:0000126,GO:0001026,GO:0001112,GO:0001156,GO:0070896,GO:0070898;orf_classification=Verified;child=YNL039W_3UTR;gene=BDP1;Alias=B",BDP1,TFC5,TFC7,TFIIIB90,transcription factor TFIIIB subunit BDP1'
#YNL039W, BDP1 changed B" alias to Bprimeprime

#Rewrites the final gff in place, replacing the B" alias (which contains a bare
#double quote) with Bprimeprime.
#str.replace keeps the whole line intact however often the marker occurs; splitting
#on the marker and re-joining only the first two pieces would silently drop the
#remainder of the line (including its newline) after a second occurrence.
utrs_gff_fn = genome_assy_dir + os.sep + sc_ref_base + '_UTRs.gff'

with open(utrs_gff_fn, 'r') as f:
    lines = f.readlines()

with open(utrs_gff_fn, 'w') as f:
    for line in lines:
        f.write(line.replace('Alias=B"', 'Alias=Bprimeprime'))


    

In [18]:
#If the output looked good, commit the pending changes made through merged_sorted_db
#on its underlying database connection. (This cell only commits; it writes no file.)
merged_sorted_db.conn.commit()

In [26]:
#some useful commands to access / change data in gffutils. 

#db_id = 'YEL058W'
#cursor = merged_db.execute('select * from features where id ="%s"' % db_id)
#row = cursor.fetchone()
#row['end']

#feat = list(merged_db.features_of_type('three_prime_UTR'))[0]


#pd.read_sql('select * from features;', merged_db.conn)
