In [2]:
require 'bio'
require 'fileutils'

# Directory where EMBL files are stored
embl_directory = 'genes_embl'

# Create an array to store cttctt_repeat features
cttctt_features = []

# Create a Bio::GFF::GFF3 object to store features
gff3 = Bio::GFF::GFF3.new

# Define entry outside the loop
entry = nil

# Track processed features to avoid duplicates
processed_features = Set.new

# Iterate over each file in the directory
Dir.foreach(embl_directory) do |filename|
  next if filename == '.' || filename == '..'

  # Full path to the EMBL file
  embl_file_path = File.join(embl_directory, filename)

  # Read the contents of the EMBL file
  embl_contents = File.read(embl_file_path)

  # Create a Bio::EMBL object from the file contents
  entry = Bio::EMBL.new(embl_contents)

  # Print some information about the entry
  puts "#{entry.accession}"
  puts "Definition: #{entry.definition}"

  # Iterate over each feature in the entry
  entry.features.each do |feature|
    next unless feature.feature == 'exon' # Filter for exon features

    position = feature.position

    # Check if position is valid
    next unless position =~ /(\d+)\.\.(\d+)/
    start_pos = Regexp.last_match[1].to_i 
    end_pos = Regexp.last_match[2].to_i 

    # Get gene information and join it into a string
    gene_info = [
      feature.assoc['gene'], feature.assoc['product'],
      feature.assoc['note'], feature.assoc['function']
    ].compact.join(', ')
    puts " "
    puts "Exon Info: #{gene_info}"

    # Show nucleic acid sequence of the exon
    exon_sequence = entry.naseq[(start_pos-1)..end_pos]
    puts "\n\n>NA splicing('#{position}') : #{gene_info}"
    puts exon_sequence

    # Scan for "cttctt" sequence
    cttctt_match = exon_sequence&.match(/cttctt/)
    if cttctt_match
      puts "cttctt sequence found in #{gene_info}"
      start_coord = start_pos + cttctt_match.begin(0) 
      end_coord = start_pos + cttctt_match.end(0) - 1
      feature_id = "cttctt_repeat_#{start_coord}_#{end_coord}"
      unless processed_features.include?(feature_id)
        cttctt_features << "#{entry.accession}\tcustom\tcttctt_repeat\t#{start_coord}\t#{end_coord}\t.\t+\t.\tID=#{feature_id}"
        processed_features.add(feature_id)
      end
    else
      puts "cttctt sequence not found in #{gene_info}"
    end

    # Scan for "aagaag" sequence and transform it into cttctt_repeat on complementary strand
    aagaag_match = exon_sequence&.match(/aagaag/)
    if aagaag_match
      puts "aagaag sequence found in #{gene_info}"

      # Adjust positions for cttctt_repeat features on the complementary strand
      start_coord = start_pos  + aagaag_match.begin(0)
      end_coord = start_pos  + aagaag_match.end(0) -1 

      feature_id = "cttctt_repeat_#{start_coord}_#{end_coord}"
      unless processed_features.include?(feature_id)
        cttctt_features << "#{entry.accession}\tcustom\tcttctt_repeat\t#{start_coord}\t#{end_coord}\t.\t-\t.\tID=#{feature_id}"
        processed_features.add(feature_id)
      end
    else
      puts "aagaag sequence not found in #{gene_info}"
    end
  end

  puts "\n\nNumber of features #{entry.features.length}"
end

# Output GFF3-formatted content to a file
output_filename = 'output.gff3'
File.open(output_filename, 'w') do |file|
  file.puts "##gff-version 3"
  cttctt_features.each { |line| file.puts line }
end

# Output report for genes without CTTCTT repeats
genes_without_cttctt = entry.features.reject { |feature| feature.feature == 'cttctt_repeat' }
puts "\nGenes without CTTCTT repeats:"
genes_without_cttctt.each do |gene|
  puts gene.assoc['gene']
end

puts "\nGFF3 file saved as #{output_filename}"

chromosome:TAIR10:2:19022154:19027528:1
Definition: Arabidopsis thaliana chromosome 2 TAIR10 partial sequence 19022154..19027528 annotated by Araport11
 
Exon Info: exon_id=AT2G46340.2.exon4


>NA splicing('complement(1743..2553)') : exon_id=AT2G46340.2.exon4
ctgaactacaccatcgtagtcagtcgaggccaagtagtttttgatgtagctattccagcaaacacagcttaatttagatttgttgaccatctctactagtggataatgaacaccgacagattcattcataaatgcattgaagtcaaaaatcttgatcttctttgatatcccagctgctgctatgtgttcttcatcagggtcaaaactcaacgagcagaccacacttgcagagtttaaaaggtccccacttcttattgtcccacaggtttcgaacttgctataccgagcaaatttgcacaacccctcgaaaaacacttcgagttgatctgaagattttcctttggtactcatatcctgattttcattttggttttcagagcatctgtccctgtcctttagtgttttgtcagaacgggcagtagcagcagaactcgataagttgatttgtgatcgcatgaaaaaatatgcatcttcaagttgacggatattactcatcagcctgtctgtattagcagttggtacaaacaaggcactggaagttgtacagtgctcgtctaggggagatgattgcacccttttttcaatagctccatgagatctcaccagagatacatttgaagaatatcttcgctcggcctccttaatatcatcctctaaggtctggatatcttgcaaaagtttagatgcttttttcttcttctgcacttctaatgaagacaaaaaatgaagtaacagctcag

aagaag sequence not found in exon_id=AT5G54280.1.exon18
 
Exon Info: exon_id=AT5G54280.1.exon4


>NA splicing('complement(MDK4_5:46979..47129)') : exon_id=AT5G54280.1.exon4

cttctt sequence not found in exon_id=AT5G54280.1.exon4
aagaag sequence not found in exon_id=AT5G54280.1.exon4
 
Exon Info: exon_id=AT5G54280.1.exon9


>NA splicing('complement(MDK4_5:46032..46116)') : exon_id=AT5G54280.1.exon9

cttctt sequence not found in exon_id=AT5G54280.1.exon9
aagaag sequence not found in exon_id=AT5G54280.1.exon9
 
Exon Info: exon_id=AT5G54280.1.exon14


>NA splicing('complement(MDK4_5:44913..45002)') : exon_id=AT5G54280.1.exon14

cttctt sequence not found in exon_id=AT5G54280.1.exon14
aagaag sequence not found in exon_id=AT5G54280.1.exon14
 
Exon Info: exon_id=AT5G54280.1.exon6


>NA splicing('complement(MDK4_5:46551..46719)') : exon_id=AT5G54280.1.exon6

cttctt sequence not found in exon_id=AT5G54280.1.exon6
aagaag sequence not found in exon_id=AT5G54280.1.exon6
 
Exon Info: exon_id=AT5G542

cttctt sequence found in exon_id=AT1G21400.2.exon1
aagaag sequence found in exon_id=AT1G21400.2.exon1
 
Exon Info: exon_id=AT1G21400.2.exon3


>NA splicing('1451..1633') : exon_id=AT1G21400.2.exon3
gtgagtgagaaactcgctgttagaatgtacgaacaaatggcgacgctacaagtaatggatcacatcttctacgaagctcaacgtcaaggaagaatatctttttatcttacttccgtcggagaagaagccattaacatcgcttcagcagctgctctcagtcctgacgacgtcgttttacctcagg
cttctt sequence not found in exon_id=AT1G21400.2.exon3
aagaag sequence found in exon_id=AT1G21400.2.exon3
 
Exon Info: exon_id=AT1G21400.5.exon1


>NA splicing('1..513') : exon_id=AT1G21400.5.exon1
gatttttttaatttttcaatagtatttgtccactataattagtgtaaaaactaaaaatcccaaaaagagaggaagataaataatcattttaagaaaagaaaaagaagttggagatggatacggatgacatgtcaagtccgacaaggaaatacatgcacttgtcgtccttctaatcctttgcccttaaacatcgttaaaaacgacaacttctctcttcttcttctctactcttctccattttcttctcttcttctcttctttgtccattttttgacagatatggcgatctggtttgctagatccaaaacccttgtttctagcttgagacataatctaaatttgtcgacaattctcataaaacgtgattactctcatcgtcccatcttctatacaacttctcagttatcttcaacggcgta

 
Exon Info: exon_id=AT1G21400.6.exon7


>NA splicing('2589..2774') : exon_id=AT1G21400.6.exon7
atgatgacatatagagtaggacatcattctacatcagatgattcaactaagtacagggcggcggatgaaatccagtactggaaaatgtcgagaaaccctgtgaatagatttcggaaatgggtcgaagataacggatggtggagtgaggaagatgaatccaagctaagatctaacgcaagaaaacagg
cttctt sequence not found in exon_id=AT1G21400.6.exon7
aagaag sequence not found in exon_id=AT1G21400.6.exon7
 
Exon Info: exon_id=AT1G21400.3.exon2


>NA splicing('1232..1369') : exon_id=AT1G21400.3.exon2
gaactggatttcccaggaggcaaagtcggttacacatcggagatgaaattcataccggaatcatcttcaaggaggattccatgttaccgggttcttgacgaagacggacgaatcatccccgatagcgattttattccgg
cttctt sequence not found in exon_id=AT1G21400.3.exon2
aagaag sequence not found in exon_id=AT1G21400.3.exon2
 
Exon Info: exon_id=AT1G21400.2.exon2


>NA splicing('1232..1369') : exon_id=AT1G21400.2.exon2
gaactggatttcccaggaggcaaagtcggttacacatcggagatgaaattcataccggaatcatcttcaaggaggattccatgttaccgggttcttgacgaagacggacgaatcatccccgatagcgattttattccgg
cttctt sequence not foun


cttctt sequence not found in exon_id=AT4G27040.3.exon1
aagaag sequence not found in exon_id=AT4G27040.3.exon1
 
Exon Info: exon_id=AT4G27040.2.exon3


>NA splicing('complement(F10M23_4:122538..122685)') : exon_id=AT4G27040.2.exon3

cttctt sequence not found in exon_id=AT4G27040.2.exon3
aagaag sequence not found in exon_id=AT4G27040.2.exon3
 
Exon Info: exon_id=AT4G27040.4.exon4


>NA splicing('complement(F10M23_4:122754..122864)') : exon_id=AT4G27040.4.exon4

cttctt sequence not found in exon_id=AT4G27040.4.exon4
aagaag sequence not found in exon_id=AT4G27040.4.exon4


Number of features 117

Genes without CTTCTT repeats:

AT4G27030
AT4G27030
AT4G27030
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040
AT4G27040





































































































GFF3 file saved as output.gff3


In [9]:
# Read existing GFF3 file
input_filename = 'prueba2.gff3'
gff3_lines = File.readlines(input_filename)

# Output GFF3-formatted content to a file
output_filename = 'prueba_chromosome.gff3'
File.open(output_filename, 'w') do |file|
  file.puts "##gff-version 3"

  # Process each line in the input GFF3 file
  gff3_lines.each do |line|
    next if line.start_with?('#')  # Skip comment lines

    # Split the line into fields
    fields = line.chomp.split("\t")

    # Extract relevant fields
    chromosome = fields[0]
    feature = fields[2]
    start_pos_repeat = fields[3].to_i
    end_pos_repeat = fields[4].to_i

    # Extract the gene start position from the first column
    gene_start_pos = chromosome.split(':')[3].to_i

    # Calculate new start and end coordinates
    start_coord_repeat = gene_start_pos + start_pos_repeat - 1
    end_coord_repeat = gene_start_pos + end_pos_repeat - 1

    
    # Extract the chromosome number from the first column
    chromosome_number = chromosome.split(':')[2].to_i

    # Update the fields and write to the output file
    fields[0] = chromosome_number  # Extract chromosome number
    fields[3] = start_coord_repeat.to_s
    fields[4] = end_coord_repeat.to_s

    file.puts fields.join("\t")
  end
end

puts "\nGFF3 file saved as #{output_filename}"



GFF3 file saved as prueba_chromosome.gff3
