Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
120 lines (115 sloc) 7.05 KB
#!/usr/bin/perl
use warnings;
use strict;
# Nucleotide sequence analysed by this script, embedded as a single
# multi-line string literal. The line breaks inside the quotes are part of
# the string value; the script removes them with s/\s+//g before doing any
# position arithmetic on the sequence.
my $seq ="AGACAAGTCGGACGTTTCATCTGAGGGTTCTTCTGCCTCCGCACTTGGTGCACATCAGACAAGGCAATCA
TGGGGGACGCTCAGATGGCAGAGTTTGGAGCAGCAGCTTCTTACCTGCGAAAGTCAGATCGAGAGCGTCT
GGAAGCACAAACCCGTCCCTTTGATATGAAAAAGGAGTGTTTTGTGCCTGATCCAGATGAAGAGTATGTA
AAAGCTTCAATCGTCAGTCGTGAAGGTGACAAAGTCACTGTACAGACTGAGAAAAGAAAGACTGTAACTG
TAAAGGAAGCTGACATTCACCCCCAGAACCCTCCAAAGTTTGATAAAATTGAAGACATGGCAATGTTCAC
CTTCCTTCATGAGCCAGCCGTGCTGTTCAACCTCAAAGAGCGCTATGCAGCATGGATGATCTATACCTAC
TCAGGACTGTTTTGTGTCACTGTCAACCCCTACAAGTGGCTGCCGGTGTACAATCAGGAGGTGGTTGTAG
CCTATAGAGGGAAAAAGAGGAGTGAAGCTCCTCCCCACATCTTTTCCATCTCTGATAACGCCTATCAGTA
CATGCTAACAGACAGGGAAAATCAGTCAATTCTGATCACTGGAGAATCGGGTGCAGGAAAGACTGTGAAC
ACAAAGAGAGTCATTCAGTATTTTGCAAGTATTGCAGCTGGTGGCTCAGCAAAGAAAGAAGGGGCAGAAA
AAAAGGGAACCCTGGAGGATCAAATCATCCAGGCTAACCCTGCTTTGGAAGCTTTTGGCAATGCAAAGAC
CATCAGAAATGACAACTCCTCAAGATTTGGAAAATTCATCCGAATTCATTTTGGTGCCAGTGGAAAGTTG
GCATCAGCTGACATTGAAACATATCTTCTGGAGAAGTCAAGGGTGACTTTTCAGCTTAAAGCTGAAAGGG
ACTATCATATTTTCTACCAAATCCTCTCTCAGCGGAAACCTGAGCTGCTGGAGATGCTGCTCATCACCAA
CAACCCCTATGACTACGCTTACATCTCCCAAGGAGAAACAACCGTAGCTTCAATTAATGATGGTGAAGAA
TTGCTTGCCACTGATGAAGCGTTTGATGTACTAGGCTTCACTCAGGAGGAGAAAAATGGCATCTATAAAT
TGATTGGAGCCATTATGCACTTTGGCAATATGAAGTTCAAACAGAAGCAGAGGGAGGAGCAAGCAGAGGC
TGATGGAACTGAAGATGGAGACAAAGTCGCATATCTGATGGGCCTAAACTCTGCTGACCTTATCAAGGGT
CTGTGCCACCCAAGAGTCAAAGTAGGAAATGAGTGGGTCACCAAAGGACAAAATGTCCAGCAGGTGTACT
ACGCTATTGGTGCACTAGCCAAGTCAGTGTATGAAAAGATGTTCCTTTGGATGGTTGTAAGAATCAATCA
GTCTCTAGACACCAAACAACCACGCCAATACTTCATTGGAGTGCTGGACATTGCTGGCTTTGAGATCTTT
GATTTTAACACCTTTGAGCAACTCTGCATCAACTTTACCAATGAGAAACTGCAACAGTTTTTCAACCACC
ACATGTTTGTGCTGGAACAAGAGGAGTACAAGAAAGAAGGGATTGAATGGGAGTTTATTGACTTTGGCAT
GGACTTGCAGGCCTGCATCGATCTCATTGAGAAACCTATGGGCATCATGTCCATCCTTGAAGAGGAGTGC
ATGTTTCCCAAAGCAAGTGATTCAACCTTTAAAGCAAAGCTTTATGACAACCATCTTGGGAAATCAAATA
ACTTCCAGAAACCAAGAGCAATCAAAGGGAAGCCAGAGTCTCATTTTTCTCTGGTCCACTATGCTGGTAC
AGTTGACTATAATATCAACAACTGGCTGGTGAAGAACAAAGACCCATTAAATGAGACTGTGGTGGGACTC
TTTCAAAAGTCTACAGTCAAACTTCTGTCAATGCTCTTTGCTAACTATGCAGGGACAGAATCAGATAATG
GTAAGGGAGGTAAAGGAGGTGGAAGTAAGAAGAAGGGCTCCTCCTTCCAGACTGTGTCTGCACTCCACAG
GGAAAACTTAAATAAGTTAATGACAAACCTAAGGTCAACTCACCCTCATTTTGTGCGCTGCATCATTCCT
AATGAGACAAAGACTCCTGGTGCAATGGAGAATCCTTTGGTCATGCATCAGCTGCGCTGTAATGGTGTGC
TGGAGGGCATCAGGATTTGCAGGAAGGGCTTCCCCAACAGAATCCTCTATGGGGACTTCAAACAGAGGTA
CCGAATCCTAAATCCTGCAGCCATACCTGAAGGTCAGTTCATAGACAGCAGGAAAGGAGCAGAGAAACTG
TTGGGTTCACTGGATATTGATCACAATCAATATAAATTTGGACACACAAAGGTGTTCTTCAAGGCTGGTT
TACTCGGTCAGCTTGAAGAAATGAGAGATGACAGACTATCTCTAATTATTTCTGGAATTCAGGCAAGATC
CAGAGGACTTCTTGCAAGGGTTGAGTTCCAAAAGATAGTTGAAAGAAGGGATGCCCTACTGGTTATCCAG
TGGAATGTCCGTGCCTTCATGGGGGTGAAAAATTGGCCCTGGATGAAGCTTTTCTTCAAGATAAAACCTC
TTCTCAAGTCAGCAGAAGCAGAGAAAGAGATGGCAAATATGAAAGATGAATTTGCCAAGCTCAAAGAGGC
TTATGCTAAATCCGAAGCGAGAAGGAAAGAGCTAGAAGAAAAAATGGTGTCTCTTCTCCAAGAGAAGAAT
GACCTACAACTTCAAGTTCAAGCGGAGCAAGACAATCTCTGTGATGCAGAGGAACGATGTGACCAGCTCA
TCAAAAACAAGATTCAGCTTGAGGCTAAAGCCAAAGAGCTCACCGAGCGACTTGAGGATGAGGAGGAGAT
GAATGCAGAGCTGACAGCTAAGAAGAGAAAGCTGGAGGACGAATGCTCTGAGCTGAAGAAGGATATTGAT
GATCTGGAGCTCACTCTGGCTAAAGTCGAGAAAGAGAAGCATGCTACTGAGAACAAGGTAAAGAACCTGA
CAGAAGAAATGGCAGCTTTGGACGACATAATCGCAAAGCTGACCAAAGAGAAGAAAGCCTTGCAAGAAGC
TCATCAGCAGACACTGGATGACCTGCAGAGTGAGGAGGACAAAGTCAACACCCTCACCAAGGCAAAAGCA
AAGCTAGAGCAACAAGTAGATGATCTGGAAGGATCTCTTGAGCAAGAAAAGAAGCTCCGCATGGATCTAG
AAAGAGCCAAGAGGAAACTAGAAGGAGACTTGAAATTAACCCAGGAAAGCCTAATGGACCTGGAAAATGA
CAAGCAGCAGTTAGAGGAGCGTCTAAAAAAGAAAGACTTTGAAATCAGTCAGCTCAATGGGAAAATCGAA
GACGAACAAACTATTTGCATTCAGCTGCAGAAAAAACTGAAGGAACTTCAGGCACGTATTGAGGAGCTGG
AGGAAGAGCTTGAGGCAGAAAGAGCTGCTAGAGCCAAAGTGGAGAAACAGAGAGCAGATTTAGCCAGAGA
GCTGGAGGAGATCAGCGAGAGACTGGAGGAGGCTGGAGGAGCTACAGCTGCTCAGATTGAGATGAATAAG
AAACGAGAGGCAGAGTTTCAGAAGCTCCGCAGAGACCTTGAAGAGGCCACTCTGCAGCATGAGGCCACTG
CCGCCACACTCAGGAAAAAACAAGCCGACAGTGTGGCTGAACTTGGAGAGCAGATAGACAATCTGCAGAG
GGTCAAGCAAAAACTGGAGAAGGAGAAAAGTGAACTTAGGCTGGAGTTGGATGATGTAGTCTCAAACATG
GAACATGTTGTAAAGACAAAGGCAAATCTTGAGAAGATGACCAGATCTTTAGAAGACCAAATGAATGAAT
ATAAAACAAAATATGAGGAAGGTCAGCGCTGCATTAATGACTTCACAATGCAGAAATCTAAACTACAATC
TGAAAATGGTGAACTTTCAAGACAGCTGGAGGAAAAGGACTCTCTTGTCTCCCAGCTAACCAGAAGCAAG
ATGTCTTACACTCAGCAAATTGAAGATCTTAAAAGACAACTGGAGGAGGAAACAAAGGCAAAAAGCGCTC
TCGCCCATGCTGTACAGTCAGCCCGTCATGACACAGATCTGCTTAGAGAGCAGTATGAGGAGGAGCAGGA
AGCTAAAGCAGAGCTACAGCGAGGCATGTCCAAAGCTAATTCTGAGGTGGCACAGTGGAGAACCAAGTAC
GAAACTGATGCCATCCAGAGAACTGAAGAACTGGAGGAAGCCAAAAAGAAACTGGCTCAACGCTTACAGG
AAACCGAAGAAGCTGTTGAAGCAGTAAATGCAAAGTGTTCATCTCTTGAAAAGACCAAACACAGACTCCA
AAATGAGATTGAAGATCTTATGGTGGACCTGGAGAGGTCTAATGCGGCTGCTGCAGCCTTAGACAAAAAG
CAAAGAAACTTTGATAAGGTACTGTCTGAGTGGAAGCAGAAGTTTGAAGAGTCGCAAGCCGAGTTAGAGA
GCTCTCAGAAAGAAGCAAGATGTCTTAGCACTGAACTTTTCAAGCTGAAGAACTCCTATGAGGAAGCTTT
AGATCACCTGGAGACCATGAAGAGAGAAAATAAAAATCTCCAAGAGGAGATTTCTGATCTCACCGAGCAA
CTTGGTGAGGGAGGAAAGAGCATCCATGAGCTGGAGAAAATGAGGAAACAGTTGGAGCAAGAAAAAAGTG
AGATTCAATCTGCTCTGGAAGAGGCAGAGGCATCACTGGAGCACGAGGAGGGTAAGATTCTGCGAGCCCA
GCTGGAGTTCAGCCAAATTAAGGCTGATATCGAGCGCAAACTAGCCGAGAAGGATGAAGAGATGGAGCAG
AGCAAACGCAATTTGCAGAGGACCATTGACACTCTGCAAAGCTCCTTGGAGTCAGAAACCAGAAGCAGAA
ATGAGGCCCTCAGAATAAAAAAGAAGATGGAGGGCGACCTGAATGAGATGGAGATCCAGCTTAGTCAGGC
AAACCGACAAGCAGCAGAGGCCCAAAAACAACTTAAGAGTGTGCATGCACATATGAAAGATGCTCAGCTT
CAGCTGGACGACTCCCTGAGAACAAATGAAGATCTTAAGGAGAACACAGCCATTGTAGAGAGACGCAACA
ACCTTCTGCAGGCTGAACTAGAGGAACTCAGAGCAGCTCTTGAGCAAACCGAAAGAGGCCGTAAGCTTGC
TGAGCAGGAGCTTCTGGATACCAGTGAAAGAGTACAGCTGCTGCACTCCCAGAACACAAGCCTGTTAAAT
CAGAAGAAGAAGCTGGAGACGGATATATCCCAGCTTCAGACAGAAGTGGAAGAGGCAGTGCAAGAATGCA
GGAATGCTGAGGAAAAAGCCAAGAAGGCCATCACTGATGCTGCCATGATGGCGGAGGAGCTGAAGAAGGA
GCAGGATACAAGTGCTCACCTGGAGAGGATGAAGAAGAACATGGAGCAGACCATTAAAGACCTGCAGCAT
CGCCTGGATGAAGCAGAACAAATCGCTATGAAGGGAGGCAAGAAACAAGTCCAGAAACTGGAGGCCAGGG
TGAGGGAGCTGGAGAGTGAAGTTGAATCAGAACAGAAGAAGAGCAGTGAGGCGGTGAAAGGAATCCGCAA
ATATGAGAGACGTATAAAAGAACTGACGTATCAGACTGAGGAGGACCGCAAGAATCTGGCTCGTCTGCAA
GATCTGGTTGACAAGCTTCAGCTAAAGGTTAAAGCTTACAAGAGGGCTGCAGAGGAAGCTGAGGAACAAG
CCAACACTAATCTTAGCAAGTTCCGGAAAATCCAGCATGAGCTTGATGAGGCAGAAGAAAGAGCTGACAT
TGCAGAGTCACAAGTCAATAAGCTACGTGCCAAAAGTCGTGATGTCAGTTCTAAGAAGGGACATGATCAA
GAGTAAAGCTCAAGTGGATTTTCTGTGTCTCCGTTATGCTGAATTAGTTTTGTTTTCAGCCTATCTTGCA
TTTCTTCGGTCACTTAGTAGAATAAAGTTGAATTGCATTAA";
# Locate the coding sequence (CDS) inside $seq and report its layout.
#
# Steps: strip whitespace from the sequence, find the first ATG, walk
# forward codon by codon in that reading frame until a stop codon
# (TAA, TAG or TGA) is met, then print the 5'UTR, CDS and 3'UTR sizes.
# All offsets held in variables are 0-based; printed "seq coordinates"
# are 1-based.
#
# Dies with a message if the sequence has no ATG, or if no in-frame stop
# codon follows it, instead of printing coordinates built from -1/undef.

print "Length of sequence before whitespace removed ", length($seq), "\n";

# The sequence literal spans many lines; remove the embedded newlines so
# that string offsets correspond to base positions.
$seq =~ s/\s+//g;
my $len = length($seq); # length of sequence
print "Length of our sequence is ", $len, "\n";

# Intentionally naive figure: it divides the whole sequence by 3, UTRs
# included, which is why the message itself says "(wrong)".
print "Number of codons (wrong) is ", $len / 3, "\n";

# index() returns the 0-based offset of the first ATG, or -1 if none.
my $start_codon = index($seq, 'ATG');
die "No ATG start codon found in sequence\n" if $start_codon < 0;

print "ATG (Start) is at $start_codon (in seq coordinates ",$start_codon+1,")\n";
# Everything upstream of the start codon is the 5' untranslated region;
# its length equals the 0-based offset of the ATG.
print "The 5'UTR is ", $start_codon, " bases long\n";

# Scan in-frame, one full codon at a time, starting just after the ATG.
# The bound ($i + 3 <= $len) skips any trailing partial codon.
my %is_stop = map { $_ => 1 } qw(TAA TAG TGA);
my $stop_codon;
for (my $i = $start_codon + 3; $i + 3 <= $len; $i += 3) {
    if ($is_stop{ substr($seq, $i, 3) }) {
        # This is a stop codon
        $stop_codon = $i;
        last;
    }
}
die "No in-frame stop codon found after the ATG at $start_codon\n"
    unless defined $stop_codon;

# Length from the first base of the ATG up to (not including) the stop codon.
my $cds_length = $stop_codon - $start_codon;

# Sanity check: the scan advances in steps of 3, so this should always hold.
if ($cds_length % 3 == 0) {
    print "This is definitely div by 3!\n";
}
else {
    print "Something is wrong, $cds_length is not div by 3!\n";
}

# The printed range is 1-based and inclusive of the stop codon's last base.
print "CDS is from ",$start_codon+1, "..", $stop_codon+3,"\n";
print "CDS length is ", $cds_length, " and the number of codons is ", $cds_length / 3, "\n";
# Bases remaining after the last base of the stop codon.
print "3'UTR length is ", $len - ($stop_codon + 3), "\n";