In [7]:
require 'net/http'
require 'bio'

# Define the Arabidopsis gene IDs
gene_ids = ['AT4g27030', 'AT5g54270', 'AT1g21400', 'AT2G46340']

# Loop over each gene ID
gene_ids.each do |gene_id|
  # Create the URI for fetching the gene from Ensembl
  address = URI("http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ensemblgenomesgene&format=embl&id=#{gene_id}")

  # Fetch the gene record from Ensembl
  response = Net::HTTP.get_response(address)
  record = response.body

  # Show a portion of the record
  puts record[1..100]

  # create a local file with this data
File.open('mygenes.embl', 'w') do |myfile|  # w makes it writable
  myfile.puts record
end

  puts "Trying Method 1a - directly create a Bio::EMBL object from string"
  puts " "
  # Create a Bio::EMBL object from the fetched record
  entry = Bio::EMBL.new(record)
  puts entry.class
  puts "The record is #{entry.definition}"
  
  # Your further processing logic here...
end


D   4    standard; DNA; HTG; 1258 BP.
XX
AC   chromosome:TAIR10:4:13571813:13573070:1
XX
SV   chromo
Trying Method 1a - directly create a Bio::EMBL object from string
 
Bio::EMBL
The record is Arabidopsis thaliana chromosome 4 TAIR10 partial sequence 13571813..13573070 annotated by Araport11
D   5    standard; DNA; HTG; 1404 BP.
XX
AC   chromosome:TAIR10:5:22038165:22039568:1
XX
SV   chromo
Trying Method 1a - directly create a Bio::EMBL object from string
 
Bio::EMBL
The record is Arabidopsis thaliana chromosome 5 TAIR10 partial sequence 22038165..22039568 annotated by Araport11
D   1    standard; DNA; HTG; 3529 BP.
XX
AC   chromosome:TAIR10:1:7493213:7496741:1
XX
SV   chromoso
Trying Method 1a - directly create a Bio::EMBL object from string
 
Bio::EMBL
The record is Arabidopsis thaliana chromosome 1 TAIR10 partial sequence 7493213..7496741 annotated by Araport11
D   2    standard; DNA; HTG; 5375 BP.
XX
AC   chromosome:TAIR10:2:19022154:19027528:1
XX
SV   chromo
Trying Method 1a - dir

["AT4g27030", "AT5g54270", "AT1g21400", "AT2G46340"]

In [43]:
require 'net/http'
require 'bio'
require 'fileutils'

# Define the Arabidopsis gene IDs
gene_ids = ['AT4g27030', 'AT5g54270', 'AT1g21400', 'AT2G46340']

# Create a directory to store the EMBL files
FileUtils.mkdir_p('genes_embl')

# Loop over each gene ID
gene_ids.each do |gene_id|
  # Create the URI for fetching the gene from Ensembl
  address = URI("http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ensemblgenomesgene&format=embl&id=#{gene_id}")

  # Fetch the gene record from Ensembl
  response = Net::HTTP.get_response(address)
  record = response.body

  # Create a local file with the Ensembl record
  File.open("genes_embl/#{gene_id}.embl", 'w') do |myfile|
    myfile.puts record
  end

  puts "EMBL file created for #{gene_id}: genes_embl/#{gene_id}.embl"
  
  
end

EMBL file created for AT4g27030: genes_embl/AT4g27030.embl
EMBL file created for AT5g54270: genes_embl/AT5g54270.embl
EMBL file created for AT1g21400: genes_embl/AT1g21400.embl
EMBL file created for AT2G46340: genes_embl/AT2G46340.embl


["AT4g27030", "AT5g54270", "AT1g21400", "AT2G46340"]

In [44]:
require 'bio'
require 'fileutils'

# Directory where EMBL files are stored
embl_directory = 'genes_embl'

# Iterate over each file in the directory
Dir.foreach(embl_directory) do |filename|
  next if filename == '.' || filename == '..'

  # Full path to the EMBL file
  embl_file_path = File.join(embl_directory, filename)

  # Read the contents of the EMBL file
  embl_contents = File.read(embl_file_path)

  # Create a Bio::EMBL object from the file contents
  entry = Bio::EMBL.new(embl_contents)

  # Print some information about the entry
  puts "#{entry.accession}"
  puts "Definition: #{entry.definition}"

  # Iterate over each feature in the entry
  entry.features.each do |feature|
    next unless feature.feature == 'exon' # Filter for exon features

    position = feature.position
    puts "\n\n\n\nPOSITION = #{position}"
    qual = feature.assoc

    # Get gene information and join it into a string
    gene_info = [
      qual['gene'], qual['product'], qual['note'], qual['function']
    ].compact.join(', ')
    puts "Exon Info: #{gene_info}"

    # Show nucleic acid sequence of the exon
    exon_sequence = entry.naseq.splice(position)
    puts "\n\n>NA splicing('#{position}') : #{gene_info}"
    puts exon_sequence

    # Scan for "cttctt" sequence
    if exon_sequence.match?('cttctt')
      puts "cttctt sequence found in #{gene_info}"
    else
      puts "cttctt sequence not found in #{gene_info}"
    end

    # Scan for "aagaag" sequence
    if exon_sequence.match?('aagaag')
      puts "aagaag sequence found in #{gene_info}"
    else
      puts "aagaag sequence not found in #{gene_info}"
    end
  end

  puts "\n\nNumber of features #{entry.features.length}"
end



chromosome:TAIR10:2:19022154:19027528:1
Definition: Arabidopsis thaliana chromosome 2 TAIR10 partial sequence 19022154..19027528 annotated by Araport11




POSITION = complement(1743..2553)
Exon Info: exon_id=AT2G46340.2.exon4


>NA splicing('complement(1743..2553)') : exon_id=AT2G46340.2.exon4
agatatactaaagtcagagttgatatgcgaggatgattcagttaaatcgactgctgctgctgaggagatatctgagctgttacttcattttttgtcttcattagaagtgcagaagaagaaaaaagcatctaaacttttgcaagatatccagaccttagaggatgatattaaggaggccgagcgaagatattcttcaaatgtatctctggtgagatctcatggagctattgaaaaaagggtgcaatcatctcccctagacgagcactgtacaacttccagtgccttgtttgtaccaactgctaatacagacaggctgatgagtaatatccgtcaacttgaagatgcatattttttcatgcgatcacaaatcaacttatcgagttctgctgctactgcccgttctgacaaaacactaaaggacagggacagatgctctgaaaaccaaaatgaaaatcaggatatgagtaccaaaggaaaatcttcagatcaactcgaagtgtttttcgaggggttgtgcaaatttgctcggtatagcaagttcgaaacctgtgggacaataagaagtggggaccttttaaactctgcaagtgtggtctgctcgttgagttttgaccctgatgaagaacacatagcagcagctgggatatcaaagaagatcaagatttttgacttcaatgcatttatgaatgaatctgtcggtgttc

TypeError: no implicit conversion of nil into String

In [45]:
require 'bio'
require 'fileutils'

# Directory where EMBL files are stored
embl_directory = 'genes_embl'

# Iterate over each file in the directory
Dir.foreach(embl_directory) do |filename|
  next if filename == '.' || filename == '..'

  # Full path to the EMBL file
  embl_file_path = File.join(embl_directory, filename)

  # Read the contents of the EMBL file
  embl_contents = File.read(embl_file_path)

  # Create a Bio::EMBL object from the file contents
  entry = Bio::EMBL.new(embl_contents)

  # Print some information about the entry
  puts "#{entry.accession}"
  puts "Definition: #{entry.definition}"

  # Iterate over each feature in the entry
  entry.features.each do |feature|
    next unless feature.feature == 'exon' # Filter for exon features

    position = feature.position

    # Check if position is valid
    next unless position =~ /(\d+)\.\.(\d+)/
    start_pos = Regexp.last_match[1].to_i
    end_pos = Regexp.last_match[2].to_i

    # Get gene information and join it into a string
    gene_info = [
      feature.assoc['gene'], feature.assoc['product'],
      feature.assoc['note'], feature.assoc['function']
    ].compact.join(', ')
    puts "Exon Info: #{gene_info}"

    # Show nucleic acid sequence of the exon
    exon_sequence = entry.naseq[start_pos..end_pos]
    puts "\n\n>NA splicing('#{position}') : #{gene_info}"
    puts exon_sequence

    # Scan for "cttctt" sequence
    if exon_sequence&.match?('cttctt')
      puts "cttctt sequence found in #{gene_info}"
    else
      puts "cttctt sequence not found in #{gene_info}"
    end

    # Scan for "aagaag" sequence
    if exon_sequence&.match?('aagaag')
      puts "aagaag sequence found in #{gene_info}"
    else
      puts "aagaag sequence not found in #{gene_info}"
    end
  end

  puts "\n\nNumber of features #{entry.features.length}"
end


chromosome:TAIR10:2:19022154:19027528:1
Definition: Arabidopsis thaliana chromosome 2 TAIR10 partial sequence 19022154..19027528 annotated by Araport11
Exon Info: exon_id=AT2G46340.2.exon4


>NA splicing('complement(1743..2553)') : exon_id=AT2G46340.2.exon4
tgaactacaccatcgtagtcagtcgaggccaagtagtttttgatgtagctattccagcaaacacagcttaatttagatttgttgaccatctctactagtggataatgaacaccgacagattcattcataaatgcattgaagtcaaaaatcttgatcttctttgatatcccagctgctgctatgtgttcttcatcagggtcaaaactcaacgagcagaccacacttgcagagtttaaaaggtccccacttcttattgtcccacaggtttcgaacttgctataccgagcaaatttgcacaacccctcgaaaaacacttcgagttgatctgaagattttcctttggtactcatatcctgattttcattttggttttcagagcatctgtccctgtcctttagtgttttgtcagaacgggcagtagcagcagaactcgataagttgatttgtgatcgcatgaaaaaatatgcatcttcaagttgacggatattactcatcagcctgtctgtattagcagttggtacaaacaaggcactggaagttgtacagtgctcgtctaggggagatgattgcacccttttttcaatagctccatgagatctcaccagagatacatttgaagaatatcttcgctcggcctccttaatatcatcctctaaggtctggatatcttgcaaaagtttagatgcttttttcttcttctgcacttctaatgaagacaaaaaatgaagtaacagctcagata

Exon Info: exon_id=AT5G54280.1.exon4


>NA splicing('complement(MDK4_5:46979..47129)') : exon_id=AT5G54280.1.exon4

cttctt sequence not found in exon_id=AT5G54280.1.exon4
aagaag sequence not found in exon_id=AT5G54280.1.exon4
Exon Info: exon_id=AT5G54280.1.exon9


>NA splicing('complement(MDK4_5:46032..46116)') : exon_id=AT5G54280.1.exon9

cttctt sequence not found in exon_id=AT5G54280.1.exon9
aagaag sequence not found in exon_id=AT5G54280.1.exon9
Exon Info: exon_id=AT5G54280.1.exon14


>NA splicing('complement(MDK4_5:44913..45002)') : exon_id=AT5G54280.1.exon14

cttctt sequence not found in exon_id=AT5G54280.1.exon14
aagaag sequence not found in exon_id=AT5G54280.1.exon14
Exon Info: exon_id=AT5G54280.1.exon6


>NA splicing('complement(MDK4_5:46551..46719)') : exon_id=AT5G54280.1.exon6

cttctt sequence not found in exon_id=AT5G54280.1.exon6
aagaag sequence not found in exon_id=AT5G54280.1.exon6
Exon Info: exon_id=AT5G54280.2.exon6


>NA splicing('complement(MDK4_5:46841..46868)') : exo

cttctt sequence found in exon_id=AT1G21400.2.exon1
aagaag sequence found in exon_id=AT1G21400.2.exon1
Exon Info: exon_id=AT1G21400.2.exon3


>NA splicing('1451..1633') : exon_id=AT1G21400.2.exon3
tgagtgagaaactcgctgttagaatgtacgaacaaatggcgacgctacaagtaatggatcacatcttctacgaagctcaacgtcaaggaagaatatctttttatcttacttccgtcggagaagaagccattaacatcgcttcagcagctgctctcagtcctgacgacgtcgttttacctcagg
cttctt sequence not found in exon_id=AT1G21400.2.exon3
aagaag sequence found in exon_id=AT1G21400.2.exon3
Exon Info: exon_id=AT1G21400.5.exon1


>NA splicing('1..513') : exon_id=AT1G21400.5.exon1
atttttttaatttttcaatagtatttgtccactataattagtgtaaaaactaaaaatcccaaaaagagaggaagataaataatcattttaagaaaagaaaaagaagttggagatggatacggatgacatgtcaagtccgacaaggaaatacatgcacttgtcgtccttctaatcctttgcccttaaacatcgttaaaaacgacaacttctctcttcttcttctctactcttctccattttcttctcttcttctcttctttgtccattttttgacagatatggcgatctggtttgctagatccaaaacccttgtttctagcttgagacataatctaaatttgtcgacaattctcataaaacgtgattactctcatcgtcccatcttctatacaacttctcagttatcttcaacggcgtatttgag



>NA splicing('2589..2774') : exon_id=AT1G21400.6.exon7
tgatgacatatagagtaggacatcattctacatcagatgattcaactaagtacagggcggcggatgaaatccagtactggaaaatgtcgagaaaccctgtgaatagatttcggaaatgggtcgaagataacggatggtggagtgaggaagatgaatccaagctaagatctaacgcaagaaaacagg
cttctt sequence not found in exon_id=AT1G21400.6.exon7
aagaag sequence not found in exon_id=AT1G21400.6.exon7
Exon Info: exon_id=AT1G21400.3.exon2


>NA splicing('1232..1369') : exon_id=AT1G21400.3.exon2
aactggatttcccaggaggcaaagtcggttacacatcggagatgaaattcataccggaatcatcttcaaggaggattccatgttaccgggttcttgacgaagacggacgaatcatccccgatagcgattttattccgg
cttctt sequence not found in exon_id=AT1G21400.3.exon2
aagaag sequence not found in exon_id=AT1G21400.3.exon2
Exon Info: exon_id=AT1G21400.2.exon2


>NA splicing('1232..1369') : exon_id=AT1G21400.2.exon2
aactggatttcccaggaggcaaagtcggttacacatcggagatgaaattcataccggaatcatcttcaaggaggattccatgttaccgggttcttgacgaagacggacgaatcatccccgatagcgattttattccgg
cttctt sequence not found in exon_id=AT1G21400.2.exon2
aagaag sequence

Exon Info: exon_id=AT4G27040.2.exon3


>NA splicing('complement(F10M23_4:122538..122685)') : exon_id=AT4G27040.2.exon3

cttctt sequence not found in exon_id=AT4G27040.2.exon3
aagaag sequence not found in exon_id=AT4G27040.2.exon3
Exon Info: exon_id=AT4G27040.4.exon4


>NA splicing('complement(F10M23_4:122754..122864)') : exon_id=AT4G27040.4.exon4

cttctt sequence not found in exon_id=AT4G27040.4.exon4
aagaag sequence not found in exon_id=AT4G27040.4.exon4


Number of features 117


 EMPIEZO A MAQUINEAR CON GFF

In [50]:
require 'bio'
require 'fileutils'

# Directory where EMBL files are stored
embl_directory = 'genes_embl'

# Create an array to store CTTCTT and AAGAAG features
cttctt_features = []
aagaag_features = []

# Create a Bio::GFF::GFF3 object to store features
gff3 = Bio::GFF::GFF3.new

# Iterate over each file in the directory
Dir.foreach(embl_directory) do |filename|
  next if filename == '.' || filename == '..'

  # Full path to the EMBL file
  embl_file_path = File.join(embl_directory, filename)

  # Read the contents of the EMBL file
  embl_contents = File.read(embl_file_path)

  # Create a Bio::EMBL object from the file contents
  entry = Bio::EMBL.new(embl_contents)

  # Print some information about the entry
  puts "#{entry.accession}"
  puts "Definition: #{entry.definition}"

  # Iterate over each feature in the entry
  entry.features.each do |feature|
    next unless feature.feature == 'exon' # Filter for exon features

    position = feature.position

    # Check if position is valid
    next unless position =~ /(\d+)\.\.(\d+)/
    start_pos = Regexp.last_match[1].to_i
    end_pos = Regexp.last_match[2].to_i

    # Get gene information and join it into a string
    gene_info = [
      feature.assoc['gene'], feature.assoc['product'],
      feature.assoc['note'], feature.assoc['function']
    ].compact.join(', ')
    puts "Exon Info: #{gene_info}"

    # Show nucleic acid sequence of the exon
    exon_sequence = entry.naseq[start_pos..end_pos]
    puts "\n\n>NA splicing('#{position}') : #{gene_info}"
    puts exon_sequence

    # Scan for "cttctt" sequence
    cttctt_match = exon_sequence&.match(/cttctt/)
    if cttctt_match
      puts "cttctt sequence found in #{gene_info}"
      start_coord = start_pos + cttctt_match.begin(0)
      end_coord = start_pos + cttctt_match.end(0) - 1
      cttctt_features << Bio::Feature.new(
        'cttctt_repeat',
        "#{start_coord}..#{end_coord}",
        'cttctt_repeat'
      )
    else
      puts "cttctt sequence not found in #{gene_info}"
    end

    # Scan for "aagaag" sequence
    aagaag_match = exon_sequence&.match(/aagaag/)
    if aagaag_match
      puts "aagaag sequence found in #{gene_info}"
      start_coord = start_pos + aagaag_match.begin(0)
      end_coord = start_pos + aagaag_match.end(0) - 1
      aagaag_features << Bio::Feature.new(
        'aagaag_repeat',
        "#{start_coord}..#{end_coord}",
        'aagaag_repeat'
      )
    else
      puts "aagaag sequence not found in #{gene_info}"
    end
  end

  # Add the new features to the GFF3 object
  cttctt_features.each { |feature| gff3.records << feature.to_gff3 }
  aagaag_features.each { |feature| gff3.records << feature.to_gff3 }

  puts "\n\nNumber of features #{entry.features.length}"
end

# Output GFF3-formatted content
puts gff3.to_s

# Output report for genes without CTTCTT repeats
genes_without_cttctt = entry.features.select { |feature| feature.feature == 'gene' && !feature.subfeatures.any? { |sub| sub.feature == 'cttctt_repeat' } }
puts "\nGenes without CTTCTT repeats:"
genes_without_cttctt.each do |gene|
  puts gene.assoc['gene']
end



chromosome:TAIR10:2:19022154:19027528:1
Definition: Arabidopsis thaliana chromosome 2 TAIR10 partial sequence 19022154..19027528 annotated by Araport11
Exon Info: exon_id=AT2G46340.2.exon4


>NA splicing('complement(1743..2553)') : exon_id=AT2G46340.2.exon4
tgaactacaccatcgtagtcagtcgaggccaagtagtttttgatgtagctattccagcaaacacagcttaatttagatttgttgaccatctctactagtggataatgaacaccgacagattcattcataaatgcattgaagtcaaaaatcttgatcttctttgatatcccagctgctgctatgtgttcttcatcagggtcaaaactcaacgagcagaccacacttgcagagtttaaaaggtccccacttcttattgtcccacaggtttcgaacttgctataccgagcaaatttgcacaacccctcgaaaaacacttcgagttgatctgaagattttcctttggtactcatatcctgattttcattttggttttcagagcatctgtccctgtcctttagtgttttgtcagaacgggcagtagcagcagaactcgataagttgatttgtgatcgcatgaaaaaatatgcatcttcaagttgacggatattactcatcagcctgtctgtattagcagttggtacaaacaaggcactggaagttgtacagtgctcgtctaggggagatgattgcacccttttttcaatagctccatgagatctcaccagagatacatttgaagaatatcttcgctcggcctccttaatatcatcctctaaggtctggatatcttgcaaaagtttagatgcttttttcttcttctgcacttctaatgaagacaaaaaatgaagtaacagctcagata

NoMethodError: undefined method `to_gff3' for #<Bio::Feature:0x000055e2771cd130 @feature="cttctt_repeat", @position="1898..1903", @qualifiers="cttctt_repeat">

In [22]:
require 'bio'
require 'fileutils'

# Directory where EMBL files are stored
embl_directory = 'genes_embl'

# Create an array to store CTTCTT and AAGAAG features
cttctt_features = []
aagaag_features = []

# Create a Bio::GFF::GFF3 object to store features
gff3 = Bio::GFF::GFF3.new

# Iterate over each file in the directory
Dir.foreach(embl_directory) do |filename|
  next if filename == '.' || filename == '..'

  # Full path to the EMBL file
  embl_file_path = File.join(embl_directory, filename)

  # Read the contents of the EMBL file
  embl_contents = File.read(embl_file_path)

  # Create a Bio::EMBL object from the file contents
  entry = Bio::EMBL.new(embl_contents)

  # Print some information about the entry
  puts "#{entry.accession}"
  puts "Definition: #{entry.definition}"

  # Iterate over each feature in the entry
  entry.features.each do |feature|
    next unless feature.feature == 'exon' # Filter for exon features

    position = feature.position

    # Check if position is valid
    next unless position =~ /(\d+)\.\.(\d+)/
    start_pos = Regexp.last_match[1].to_i
    end_pos = Regexp.last_match[2].to_i

    # Get gene information and join it into a string
    gene_info = [
      feature.assoc['gene'], feature.assoc['product'],
      feature.assoc['note'], feature.assoc['function']
    ].compact.join(', ')
    puts "Exon Info: #{gene_info}"

    # Show nucleic acid sequence of the exon
    exon_sequence = entry.naseq[start_pos..end_pos]
    puts "\n\n>NA splicing('#{position}') : #{gene_info}"
    puts exon_sequence

    # Scan for "cttctt" sequence
    cttctt_match = exon_sequence&.match(/cttctt/)
    if cttctt_match
      puts "cttctt sequence found in #{gene_info}"
      start_coord = start_pos + cttctt_match.begin(0)
      end_coord = start_pos + cttctt_match.end(0)
      cttctt_features << "#{entry.accession}\tcustom\tcttctt_repeat\t#{start_coord}\t#{end_coord}\t.\t+\t.\tID=cttctt_repeat_#{start_coord}_#{end_coord}"
    else
      puts "cttctt sequence not found in #{gene_info}"
    end

    # Scan for "aagaag" sequence
    aagaag_match = exon_sequence&.match(/aagaag/)
    if aagaag_match
      puts "aagaag sequence found in #{gene_info}"
      start_coord = start_pos + aagaag_match.begin(0)
      end_coord = start_pos + aagaag_match.end(0) 
      aagaag_features << "#{entry.accession}\tcustom\taagaag_repeat\t#{start_coord}\t#{end_coord}\t.\t+\t.\tID=aagaag_repeat_#{start_coord}_#{end_coord}"
    else
      puts "aagaag sequence not found in #{gene_info}"
    end
  end

  puts "\n\nNumber of features #{entry.features.length}"
end

# Output GFF3-formatted content
puts "##gff-version 3"
puts cttctt_features
puts aagaag_features

# Output report for genes without CTTCTT repeats
genes_without_cttctt = entry.features.select { |feature| feature.feature == 'gene' && !feature.subfeatures.any? { |sub| sub.feature == 'cttctt_repeat' } }
puts "\nGenes without CTTCTT repeats:"
genes_without_cttctt.each do |gene|
  puts gene.assoc['gene']
end


chromosome:TAIR10:2:19022154:19027528:1
Definition: Arabidopsis thaliana chromosome 2 TAIR10 partial sequence 19022154..19027528 annotated by Araport11
Exon Info: exon_id=AT2G46340.2.exon4


>NA splicing('complement(1743..2553)') : exon_id=AT2G46340.2.exon4
tgaactacaccatcgtagtcagtcgaggccaagtagtttttgatgtagctattccagcaaacacagcttaatttagatttgttgaccatctctactagtggataatgaacaccgacagattcattcataaatgcattgaagtcaaaaatcttgatcttctttgatatcccagctgctgctatgtgttcttcatcagggtcaaaactcaacgagcagaccacacttgcagagtttaaaaggtccccacttcttattgtcccacaggtttcgaacttgctataccgagcaaatttgcacaacccctcgaaaaacacttcgagttgatctgaagattttcctttggtactcatatcctgattttcattttggttttcagagcatctgtccctgtcctttagtgttttgtcagaacgggcagtagcagcagaactcgataagttgatttgtgatcgcatgaaaaaatatgcatcttcaagttgacggatattactcatcagcctgtctgtattagcagttggtacaaacaaggcactggaagttgtacagtgctcgtctaggggagatgattgcacccttttttcaatagctccatgagatctcaccagagatacatttgaagaatatcttcgctcggcctccttaatatcatcctctaaggtctggatatcttgcaaaagtttagatgcttttttcttcttctgcacttctaatgaagacaaaaaatgaagtaacagctcagata

Exon Info: exon_id=AT5G54280.1.exon4


>NA splicing('complement(MDK4_5:46979..47129)') : exon_id=AT5G54280.1.exon4

cttctt sequence not found in exon_id=AT5G54280.1.exon4
aagaag sequence not found in exon_id=AT5G54280.1.exon4
Exon Info: exon_id=AT5G54280.1.exon9


>NA splicing('complement(MDK4_5:46032..46116)') : exon_id=AT5G54280.1.exon9

cttctt sequence not found in exon_id=AT5G54280.1.exon9
aagaag sequence not found in exon_id=AT5G54280.1.exon9
Exon Info: exon_id=AT5G54280.1.exon14


>NA splicing('complement(MDK4_5:44913..45002)') : exon_id=AT5G54280.1.exon14

cttctt sequence not found in exon_id=AT5G54280.1.exon14
aagaag sequence not found in exon_id=AT5G54280.1.exon14
Exon Info: exon_id=AT5G54280.1.exon6


>NA splicing('complement(MDK4_5:46551..46719)') : exon_id=AT5G54280.1.exon6

cttctt sequence not found in exon_id=AT5G54280.1.exon6
aagaag sequence not found in exon_id=AT5G54280.1.exon6
Exon Info: exon_id=AT5G54280.2.exon6


>NA splicing('complement(MDK4_5:46841..46868)') : exo

cttctt sequence found in exon_id=AT1G21400.2.exon1
aagaag sequence found in exon_id=AT1G21400.2.exon1
Exon Info: exon_id=AT1G21400.2.exon3


>NA splicing('1451..1633') : exon_id=AT1G21400.2.exon3
tgagtgagaaactcgctgttagaatgtacgaacaaatggcgacgctacaagtaatggatcacatcttctacgaagctcaacgtcaaggaagaatatctttttatcttacttccgtcggagaagaagccattaacatcgcttcagcagctgctctcagtcctgacgacgtcgttttacctcagg
cttctt sequence not found in exon_id=AT1G21400.2.exon3
aagaag sequence found in exon_id=AT1G21400.2.exon3
Exon Info: exon_id=AT1G21400.5.exon1


>NA splicing('1..513') : exon_id=AT1G21400.5.exon1
atttttttaatttttcaatagtatttgtccactataattagtgtaaaaactaaaaatcccaaaaagagaggaagataaataatcattttaagaaaagaaaaagaagttggagatggatacggatgacatgtcaagtccgacaaggaaatacatgcacttgtcgtccttctaatcctttgcccttaaacatcgttaaaaacgacaacttctctcttcttcttctctactcttctccattttcttctcttcttctcttctttgtccattttttgacagatatggcgatctggtttgctagatccaaaacccttgtttctagcttgagacataatctaaatttgtcgacaattctcataaaacgtgattactctcatcgtcccatcttctatacaacttctcagttatcttcaacggcgtatttgag



>NA splicing('2589..2774') : exon_id=AT1G21400.6.exon7
tgatgacatatagagtaggacatcattctacatcagatgattcaactaagtacagggcggcggatgaaatccagtactggaaaatgtcgagaaaccctgtgaatagatttcggaaatgggtcgaagataacggatggtggagtgaggaagatgaatccaagctaagatctaacgcaagaaaacagg
cttctt sequence not found in exon_id=AT1G21400.6.exon7
aagaag sequence not found in exon_id=AT1G21400.6.exon7
Exon Info: exon_id=AT1G21400.3.exon2


>NA splicing('1232..1369') : exon_id=AT1G21400.3.exon2
aactggatttcccaggaggcaaagtcggttacacatcggagatgaaattcataccggaatcatcttcaaggaggattccatgttaccgggttcttgacgaagacggacgaatcatccccgatagcgattttattccgg
cttctt sequence not found in exon_id=AT1G21400.3.exon2
aagaag sequence not found in exon_id=AT1G21400.3.exon2
Exon Info: exon_id=AT1G21400.2.exon2


>NA splicing('1232..1369') : exon_id=AT1G21400.2.exon2
aactggatttcccaggaggcaaagtcggttacacatcggagatgaaattcataccggaatcatcttcaaggaggattccatgttaccgggttcttgacgaagacggacgaatcatccccgatagcgattttattccgg
cttctt sequence not found in exon_id=AT1G21400.2.exon2
aagaag sequence

Exon Info: exon_id=AT4G27040.2.exon3


>NA splicing('complement(F10M23_4:122538..122685)') : exon_id=AT4G27040.2.exon3

cttctt sequence not found in exon_id=AT4G27040.2.exon3
aagaag sequence not found in exon_id=AT4G27040.2.exon3
Exon Info: exon_id=AT4G27040.4.exon4


>NA splicing('complement(F10M23_4:122754..122864)') : exon_id=AT4G27040.4.exon4

cttctt sequence not found in exon_id=AT4G27040.4.exon4
aagaag sequence not found in exon_id=AT4G27040.4.exon4


Number of features 117
##gff-version 3
chromosome:TAIR10:2:19022154:19027528:1	custom	cttctt_repeat	1898	1904	.	+	.	ID=cttctt_repeat_1898_1904
chromosome:TAIR10:2:19022154:19027528:1	custom	cttctt_repeat	257	263	.	+	.	ID=cttctt_repeat_257_263
chromosome:TAIR10:2:19022154:19027528:1	custom	cttctt_repeat	257	263	.	+	.	ID=cttctt_repeat_257_263
chromosome:TAIR10:5:22038165:22039568:1	custom	cttctt_repeat	1098	1104	.	+	.	ID=cttctt_repeat_1098_1104
chromosome:TAIR10:1:7493213:7496741:1	custom	cttctt_repeat	213	219	.	+	.	ID=cttctt_repeat_2

NoMethodError: undefined method `subfeatures' for #<Bio::Feature:0x000055a680cee588 @feature="gene", @position="1..1258", @qualifiers=[#<Bio::Feature::Qualifier:0x000055a680ced5e8 @qualifier="gene", @value="AT4G27030">, #<Bio::Feature::Qualifier:0x000055a680ced3b8 @qualifier="locus_tag", @value="FADA">, #<Bio::Feature::Qualifier:0x000055a680ced098 @qualifier="note", @value="fatty acid desaturase A [Source:NCBI gene (formerly Entrezgene);Acc:828811]">]>

AQUÍ NO SE GENERA ARCHIVO. GFF

In [1]:
require 'bio'
require 'fileutils'

# Directory where EMBL files are stored
embl_directory = 'genes_embl'

# Create an array to store CTTCTT and AAGAAG features
cttctt_features = []
aagaag_features = []

# Create a Bio::GFF::GFF3 object to store features
gff3 = Bio::GFF::GFF3.new

# Define entry outside the loop
entry = nil

# Iterate over each file in the directory
Dir.foreach(embl_directory) do |filename|
  next if filename == '.' || filename == '..'

  # Full path to the EMBL file
  embl_file_path = File.join(embl_directory, filename)

  # Read the contents of the EMBL file
  embl_contents = File.read(embl_file_path)

  # Create a Bio::EMBL object from the file contents
  entry = Bio::EMBL.new(embl_contents)

  # Print some information about the entry
  puts "#{entry.accession}"
  puts "Definition: #{entry.definition}"

  # Iterate over each feature in the entry
  entry.features.each do |feature|
    next unless feature.feature == 'exon' # Filter for exon features

    position = feature.position

    # Check if position is valid
    next unless position =~ /(\d+)\.\.(\d+)/
    start_pos = Regexp.last_match[1].to_i
    end_pos = Regexp.last_match[2].to_i

    # Get gene information and join it into a string
    gene_info = [
      feature.assoc['gene'], feature.assoc['product'],
      feature.assoc['note'], feature.assoc['function']
    ].compact.join(', ')
    puts "Exon Info: #{gene_info}"

    # Show nucleic acid sequence of the exon
    exon_sequence = entry.naseq[start_pos..end_pos]
    puts "\n\n>NA splicing('#{position}') : #{gene_info}"
    puts exon_sequence

    # Scan for "cttctt" sequence
    cttctt_match = exon_sequence&.match(/cttctt/)
    if cttctt_match
      puts "cttctt sequence found in #{gene_info}"
      start_coord = start_pos + cttctt_match.begin(0)
      end_coord = start_pos + cttctt_match.end(0) - 1
      cttctt_features << "#{entry.accession}\tcustom\tcttctt_repeat\t#{start_coord}\t#{end_coord}\t.\t+\t.\tID=cttctt_repeat_#{start_coord}_#{end_coord}"
    else
      puts "cttctt sequence not found in #{gene_info}"
    end

    # Scan for "aagaag" sequence
    aagaag_match = exon_sequence&.match(/aagaag/)
    if aagaag_match
      puts "aagaag sequence found in #{gene_info}"
      start_coord = start_pos + aagaag_match.begin(0)
      end_coord = start_pos + aagaag_match.end(0) - 1
      aagaag_features << "#{entry.accession}\tcustom\taagaag_repeat\t#{start_coord}\t#{end_coord}\t.\t+\t.\tID=aagaag_repeat_#{start_coord}_#{end_coord}"
    else
      puts "aagaag sequence not found in #{gene_info}"
    end
  end

  puts "\n\nNumber of features #{entry.features.length}"
end

# Output GFF3-formatted content
puts "##gff-version 3"
puts cttctt_features
puts aagaag_features

# Output report for genes without CTTCTT repeats
genes_without_cttctt = entry.features.reject { |feature| feature.feature == 'cttctt_repeat' }
puts "\nGenes without CTTCTT repeats:"
genes_without_cttctt.each do |gene|
  puts gene.assoc['gene']
end


chromosome:TAIR10:2:19022154:19027528:1
Definition: Arabidopsis thaliana chromosome 2 TAIR10 partial sequence 19022154..19027528 annotated by Araport11
Exon Info: exon_id=AT2G46340.2.exon4


>NA splicing('complement(1743..2553)') : exon_id=AT2G46340.2.exon4
tgaactacaccatcgtagtcagtcgaggccaagtagtttttgatgtagctattccagcaaacacagcttaatttagatttgttgaccatctctactagtggataatgaacaccgacagattcattcataaatgcattgaagtcaaaaatcttgatcttctttgatatcccagctgctgctatgtgttcttcatcagggtcaaaactcaacgagcagaccacacttgcagagtttaaaaggtccccacttcttattgtcccacaggtttcgaacttgctataccgagcaaatttgcacaacccctcgaaaaacacttcgagttgatctgaagattttcctttggtactcatatcctgattttcattttggttttcagagcatctgtccctgtcctttagtgttttgtcagaacgggcagtagcagcagaactcgataagttgatttgtgatcgcatgaaaaaatatgcatcttcaagttgacggatattactcatcagcctgtctgtattagcagttggtacaaacaaggcactggaagttgtacagtgctcgtctaggggagatgattgcacccttttttcaatagctccatgagatctcaccagagatacatttgaagaatatcttcgctcggcctccttaatatcatcctctaaggtctggatatcttgcaaaagtttagatgcttttttcttcttctgcacttctaatgaagacaaaaaatgaagtaacagctcagata

Exon Info: exon_id=AT5G54280.1.exon4


>NA splicing('complement(MDK4_5:46979..47129)') : exon_id=AT5G54280.1.exon4

cttctt sequence not found in exon_id=AT5G54280.1.exon4
aagaag sequence not found in exon_id=AT5G54280.1.exon4
Exon Info: exon_id=AT5G54280.1.exon9


>NA splicing('complement(MDK4_5:46032..46116)') : exon_id=AT5G54280.1.exon9

cttctt sequence not found in exon_id=AT5G54280.1.exon9
aagaag sequence not found in exon_id=AT5G54280.1.exon9
Exon Info: exon_id=AT5G54280.1.exon14


>NA splicing('complement(MDK4_5:44913..45002)') : exon_id=AT5G54280.1.exon14

cttctt sequence not found in exon_id=AT5G54280.1.exon14
aagaag sequence not found in exon_id=AT5G54280.1.exon14
Exon Info: exon_id=AT5G54280.1.exon6


>NA splicing('complement(MDK4_5:46551..46719)') : exon_id=AT5G54280.1.exon6

cttctt sequence not found in exon_id=AT5G54280.1.exon6
aagaag sequence not found in exon_id=AT5G54280.1.exon6
Exon Info: exon_id=AT5G54280.2.exon6


>NA splicing('complement(MDK4_5:46841..46868)') : exo

cttctt sequence found in exon_id=AT1G21400.2.exon1
aagaag sequence found in exon_id=AT1G21400.2.exon1
Exon Info: exon_id=AT1G21400.2.exon3


>NA splicing('1451..1633') : exon_id=AT1G21400.2.exon3
tgagtgagaaactcgctgttagaatgtacgaacaaatggcgacgctacaagtaatggatcacatcttctacgaagctcaacgtcaaggaagaatatctttttatcttacttccgtcggagaagaagccattaacatcgcttcagcagctgctctcagtcctgacgacgtcgttttacctcagg
cttctt sequence not found in exon_id=AT1G21400.2.exon3
aagaag sequence found in exon_id=AT1G21400.2.exon3
Exon Info: exon_id=AT1G21400.5.exon1


>NA splicing('1..513') : exon_id=AT1G21400.5.exon1
atttttttaatttttcaatagtatttgtccactataattagtgtaaaaactaaaaatcccaaaaagagaggaagataaataatcattttaagaaaagaaaaagaagttggagatggatacggatgacatgtcaagtccgacaaggaaatacatgcacttgtcgtccttctaatcctttgcccttaaacatcgttaaaaacgacaacttctctcttcttcttctctactcttctccattttcttctcttcttctcttctttgtccattttttgacagatatggcgatctggtttgctagatccaaaacccttgtttctagcttgagacataatctaaatttgtcgacaattctcataaaacgtgattactctcatcgtcccatcttctatacaacttctcagttatcttcaacggcgtatttgag



>NA splicing('2589..2774') : exon_id=AT1G21400.6.exon7
tgatgacatatagagtaggacatcattctacatcagatgattcaactaagtacagggcggcggatgaaatccagtactggaaaatgtcgagaaaccctgtgaatagatttcggaaatgggtcgaagataacggatggtggagtgaggaagatgaatccaagctaagatctaacgcaagaaaacagg
cttctt sequence not found in exon_id=AT1G21400.6.exon7
aagaag sequence not found in exon_id=AT1G21400.6.exon7
Exon Info: exon_id=AT1G21400.3.exon2


>NA splicing('1232..1369') : exon_id=AT1G21400.3.exon2
aactggatttcccaggaggcaaagtcggttacacatcggagatgaaattcataccggaatcatcttcaaggaggattccatgttaccgggttcttgacgaagacggacgaatcatccccgatagcgattttattccgg
cttctt sequence not found in exon_id=AT1G21400.3.exon2
aagaag sequence not found in exon_id=AT1G21400.3.exon2
Exon Info: exon_id=AT1G21400.2.exon2


>NA splicing('1232..1369') : exon_id=AT1G21400.2.exon2
aactggatttcccaggaggcaaagtcggttacacatcggagatgaaattcataccggaatcatcttcaaggaggattccatgttaccgggttcttgacgaagacggacgaatcatccccgatagcgattttattccgg
cttctt sequence not found in exon_id=AT1G21400.2.exon2
aagaag sequence

Exon Info: exon_id=AT4G27040.2.exon3


>NA splicing('complement(F10M23_4:122538..122685)') : exon_id=AT4G27040.2.exon3

cttctt sequence not found in exon_id=AT4G27040.2.exon3
aagaag sequence not found in exon_id=AT4G27040.2.exon3
Exon Info: exon_id=AT4G27040.4.exon4


>NA splicing('complement(F10M23_4:122754..122864)') : exon_id=AT4G27040.4.exon4

cttctt sequence not found in exon_id=AT4G27040.4.exon4
aagaag sequence not found in exon_id=AT4G27040.4.exon4


Number of features 117
##gff-version 3
chromosome:TAIR10:2:19022154:19027528:1	custom	cttctt_repeat	1898	1903	.	+	.	ID=cttctt_repeat_1898_1903
chromosome:TAIR10:2:19022154:19027528:1	custom	cttctt_repeat	257	262	.	+	.	ID=cttctt_repeat_257_262
chromosome:TAIR10:2:19022154:19027528:1	custom	cttctt_repeat	257	262	.	+	.	ID=cttctt_repeat_257_262
chromosome:TAIR10:5:22038165:22039568:1	custom	cttctt_repeat	1098	1103	.	+	.	ID=cttctt_repeat_1098_1103
chromosome:TAIR10:1:7493213:7496741:1	custom	cttctt_repeat	213	218	.	+	.	ID=cttctt_repeat_2

[#<Bio::Feature:0x0000559a645e4a08 @feature="source", @position="1..1258", @qualifiers=[#<Bio::Feature::Qualifier:0x0000559a645e47b0 @qualifier="organism", @value="Arabidopsis thaliana">, #<Bio::Feature::Qualifier:0x0000559a645e46e8 @qualifier="db_xref", @value="taxon:3702">]>, #<Bio::Feature:0x0000559a645e4698 @feature="gene", @position="1..1258", @qualifiers=[#<Bio::Feature::Qualifier:0x0000559a645e4418 @qualifier="gene", @value="AT4G27030">, #<Bio::Feature::Qualifier:0x0000559a645e4328 @qualifier="locus_tag", @value="FADA">, #<Bio::Feature::Qualifier:0x0000559a645e4260 @qualifier="note", @value="fatty acid desaturase A [Source:NCBI gene (formerly Entrezgene);Acc:828811]">]>, #<Bio::Feature:0x0000559a645e4210 @feature="mRNA", @position="1..1258", @qualifiers=[#<Bio::Feature::Qualifier:0x0000559a6465fdc0 @qualifier="gene", @value="AT4G27030">, #<Bio::Feature::Qualifier:0x0000559a6465f2d0 @qualifier="standard_name", @value="AT4G27030.1">]>, #<Bio::Feature:0x0000559a6465f230 @feature="C

AQUÍ EL ARCHIVO SE GENERA, ANTES NO

AHORA INTENTO HACER QUE EL .GFF GENERADO META LAS CTTCTT EN LA MISMA STRAND DEL GEN, Y LAS AAGAAG EN EL STRAND REVERSO DEL GEN.

In [70]:
require 'bio'
require 'fileutils'

# Directory where EMBL files are stored
embl_directory = 'genes_embl'

# Create an array to store cttctt_repeat features
cttctt_features = []

# Create a Bio::GFF::GFF3 object to store features
gff3 = Bio::GFF::GFF3.new

# Define entry outside the loop
entry = nil

# Track processed features to avoid duplicates
processed_features = Set.new

# Iterate over each file in the directory
Dir.foreach(embl_directory) do |filename|
  next if filename == '.' || filename == '..'

  # Full path to the EMBL file
  embl_file_path = File.join(embl_directory, filename)

  # Read the contents of the EMBL file
  embl_contents = File.read(embl_file_path)

  # Create a Bio::EMBL object from the file contents
  entry = Bio::EMBL.new(embl_contents)

  # Print some information about the entry
  puts "#{entry.accession}"
  puts "Definition: #{entry.definition}"

  # Iterate over each feature in the entry
  entry.features.each do |feature|
    next unless feature.feature == 'exon' # Filter for exon features

    position = feature.position

    # Check if position is valid
    next unless position =~ /(\d+)\.\.(\d+)/
    start_pos = Regexp.last_match[1].to_i 
    end_pos = Regexp.last_match[2].to_i 

    # Get gene information and join it into a string
    gene_info = [
      feature.assoc['gene'], feature.assoc['product'],
      feature.assoc['note'], feature.assoc['function']
    ].compact.join(', ')
    puts " "
    puts "Exon Info: #{gene_info}"

    # Show nucleic acid sequence of the exon
    exon_sequence = entry.naseq[(start_pos-1)..end_pos]
    puts "\n\n>NA splicing('#{position}') : #{gene_info}"
    puts exon_sequence

    # Scan for "cttctt" sequence
    cttctt_match = exon_sequence&.match(/cttctt/)
    if cttctt_match
      puts "cttctt sequence found in #{gene_info}"
      start_coord = start_pos + cttctt_match.begin(0) 
      end_coord = start_pos + cttctt_match.end(0) - 1
      feature_id = "cttctt_repeat_#{start_coord}_#{end_coord}"
      unless processed_features.include?(feature_id)
        cttctt_features << "#{entry.accession}\tcustom\tcttctt_repeat\t#{start_coord}\t#{end_coord}\t.\t+\t.\tID=#{feature_id}"
        processed_features.add(feature_id)
      end
    else
      puts "cttctt sequence not found in #{gene_info}"
    end

    # Scan for "aagaag" sequence and transform it into cttctt_repeat on complementary strand
    aagaag_match = exon_sequence&.match(/aagaag/)
    if aagaag_match
      puts "aagaag sequence found in #{gene_info}"

      # Adjust positions for cttctt_repeat features on the complementary strand
      start_coord = start_pos  + aagaag_match.begin(0)
      end_coord = start_pos  + aagaag_match.end(0) -1 
      puts "#######"
      puts start_pos
      puts "########"
      puts aagaag_match.begin(0)
      feature_id = "cttctt_repeat_#{start_coord}_#{end_coord}"
      unless processed_features.include?(feature_id)
        cttctt_features << "#{entry.accession}\tcustom\tcttctt_repeat\t#{start_coord}\t#{end_coord}\t.\t-\t.\tID=#{feature_id}"
        processed_features.add(feature_id)
      end
    else
      puts "aagaag sequence not found in #{gene_info}"
    end
  end

  puts "\n\nNumber of features #{entry.features.length}"
end

# Output GFF3-formatted content to a file
output_filename = 'output.gff3'
File.open(output_filename, 'w') do |file|
  file.puts "##gff-version 3"
  cttctt_features.each { |line| file.puts line }
end

# Output report for genes without CTTCTT repeats
genes_without_cttctt = entry.features.reject { |feature| feature.feature == 'cttctt_repeat' }
puts "\nGenes without CTTCTT repeats:"
genes_without_cttctt.each do |gene|
  puts gene.assoc['gene']
end

puts "\nGFF3 file saved as #{output_filename}"



chromosome:TAIR10:2:19022154:19027528:1
Definition: Arabidopsis thaliana chromosome 2 TAIR10 partial sequence 19022154..19027528 annotated by Araport11
 
Exon Info: exon_id=AT2G46340.2.exon4


>NA splicing('complement(1743..2553)') : exon_id=AT2G46340.2.exon4
ctgaactacaccatcgtagtcagtcgaggccaagtagtttttgatgtagctattccagcaaacacagcttaatttagatttgttgaccatctctactagtggataatgaacaccgacagattcattcataaatgcattgaagtcaaaaatcttgatcttctttgatatcccagctgctgctatgtgttcttcatcagggtcaaaactcaacgagcagaccacacttgcagagtttaaaaggtccccacttcttattgtcccacaggtttcgaacttgctataccgagcaaatttgcacaacccctcgaaaaacacttcgagttgatctgaagattttcctttggtactcatatcctgattttcattttggttttcagagcatctgtccctgtcctttagtgttttgtcagaacgggcagtagcagcagaactcgataagttgatttgtgatcgcatgaaaaaatatgcatcttcaagttgacggatattactcatcagcctgtctgtattagcagttggtacaaacaaggcactggaagttgtacagtgctcgtctaggggagatgattgcacccttttttcaatagctccatgagatctcaccagagatacatttgaagaatatcttcgctcggcctccttaatatcatcctctaaggtctggatatcttgcaaaagtttagatgcttttttcttcttctgcacttctaatgaagacaaaaaatgaagtaacagctcag

Errno::EISDIR: Is a directory @ io_fread - genes_embl/.ipynb_checkpoints

NOW LOOKING FOR SEVERAL MATCHES

In [66]:
# Read existing GFF3 file
input_filename = 'output.gff3'
gff3_lines = File.readlines(input_filename)

# Output GFF3-formatted content to a file
output_filename = 'output_corrected.gff3'
File.open(output_filename, 'w') do |file|
  file.puts "##gff-version 3"

  # Process each line in the input GFF3 file
  gff3_lines.each do |line|
    next if line.start_with?('#')  # Skip comment lines

    # Split the line into fields
    fields = line.chomp.split("\t")

    # Extract relevant fields
    chromosome = fields[0]
    feature = fields[2]
    start_pos_repeat = fields[3].to_i
    end_pos_repeat = fields[4].to_i

    # Extract the gene start position from the first column
    gene_start_pos = chromosome.split(':')[3].to_i

    # Calculate new start and end coordinates
    start_coord_repeat = gene_start_pos + start_pos_repeat - 1
    end_coord_repeat = gene_start_pos + end_pos_repeat - 1

    
    # Extract the chromosome number from the first column
    chromosome_number = chromosome.split(':')[2].to_i

    # Update the fields and write to the output file
    fields[0] = chromosome_number  # Extract chromosome number
    fields[3] = start_coord_repeat.to_s
    fields[4] = end_coord_repeat.to_s

    file.puts fields.join("\t")
  end
end

puts "\nGFF3 file saved as #{output_filename}"




GFF3 file saved as output_corrected.gff3


THIS ONE IS TO COUNT:(no hecho)