Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

some analyses - may get move elsewhere - focused on polyA site

  • Loading branch information...
commit 94587add5025d7fb6034e9751191b62cdcbcac25 1 parent c890833
@hyphaltip authored
View
22 analyses/examine_trimmed.pl
@@ -0,0 +1,22 @@
+#!/usr/bin/perl -w
+use strict;
+use Bio::SeqIO;
+my $min_diff = 4;
+my $orig = Bio::SeqIO->new(-format => 'fasta',
+ -file => shift @ARGV);
+
+
+my $trim = Bio::SeqIO->new(-format => 'fasta',
+ -file => shift @ARGV);
+
+my %len;
+while(my $seq = $orig->next_seq ) {
+ $len{$seq->id} = $seq->length;
+}
+
+while(my $seq = $trim->next_seq ) {
+ if( abs($seq->length - $len{$seq->id}) > $min_diff ) {
+ print join("\t", $seq->id, $len{$seq->id}, $seq->length), "\n";
+ }
+}
+
View
92 analyses/find_polyA_from_exonerate.pl
@@ -0,0 +1,92 @@
+#!/usr/bin/perl -w
+use strict;
+use Bio::SeqIO;
+use Bio::SearchIO;
+use Bio::DB::Fasta;
+use Bio::Perl qw(revcom_as_string);
+
+my $window = 50;
+
+my $trimseq = shift || 'Ccin_hyphaltip_RNAseq.trinity.trimpoly.seqs';
+my $alnfolder = shift || 'alns';
+my $genome = shift || 'coprinopsis_cinerea_okayama7_genome.fasta';
+my $db = Bio::DB::Fasta->new($genome);
+
+my $in = Bio::SeqIO->new(-format => 'fasta',
+ -file => $trimseq);
+my $out = Bio::SeqIO->new(-format => 'fasta',
+ -file => ">exonerate_based_polyA.fas");
+
+my $out_neg = Bio::SeqIO->new(-format => 'fasta',
+ -file => ">exonerate_based_polyA_negative.fas");
+while( my $seq = $in->next_seq ) {
+ my $id = $seq->display_id;
+ my $desc = $seq->desc;
+ if( -f "$alnfolder/$id.exonerate" ) {
+ my $searchio = Bio::SearchIO->new(-format => 'exonerate',
+ -file => "$alnfolder/$id.exonerate");
+ while( my $r = $searchio->next_result ) {
+ while( my $h = $r->next_hit ) {
+ print join("\t",$r->query_name,
+ $desc,
+ $h->start('query'),
+ $h->end('query'),
+ $h->name,
+ $h->start('hit'),
+ $h->end('hit')),"\n";
+
+ if( $desc eq '5prime' ) {
+ my $target_window = $db->seq($h->name,
+ $h->start('hit') - $window =>
+ $h->start('hit') + $window -1);
+ next if ( $target_window =~ /T{5,}/ ||
+ $target_window =~ /A{5,}/ );
+
+ $out->write_seq(Bio::Seq->new(-seq => $target_window,
+ -id => $id,
+ -desc => sprintf("%s:%s",
+ $h->name,
+ $h->start('hit'))));
+
+ my $up = $db->seq($h->name,
+ $h->start('hit') - 4 * $window =>
+ ($h->start('hit') - 3 * $window - 1) );
+
+ $out_neg->write_seq(Bio::Seq->new(-id => "UP_".$id,
+ -seq => $up));
+
+ $up = $db->seq($h->name,
+ $h->start('hit') + 3 * $window =>
+ ($h->start('hit') + 4 * $window -1) );
+
+ $out_neg->write_seq(Bio::Seq->new(-id => "UP2_".$id,
+ -seq => $up));
+
+ } else {
+ my $target_window = $db->seq($h->name,
+ $h->end('hit') - $window =>
+ $h->end('hit') + $window -1);
+ next if ( $target_window =~ /T{5,}/ ||
+ $target_window =~ /A{5,}/ );
+
+ $out->write_seq(Bio::Seq->new(-seq => $target_window,
+ -id => $id,
+ -desc => sprintf("%s:%s",
+ $h->name,
+ $h->end('hit'))));
+ my $down = $db->seq($h->name,
+ $h->end('hit') + 3*$window =>
+ $h->end('hit') + 4*$window -1);
+ $out_neg->write_seq(Bio::Seq->new(-id => "DN_".$id,
+ -seq => $down));
+ $down = $db->seq($h->name,
+ $h->end('hit') - 4*$window =>
+ $h->end('hit') - 3*$window -1);
+ $out_neg->write_seq(Bio::Seq->new(-id => "DN2_".$id,
+ -seq => $down));
+ }
+ }
+ }
+ }
+}
+
View
147 analyses/polyA_site_analysis.pl
@@ -0,0 +1,147 @@
+#!/usr/bin/perl -w
+use strict;
+use Bio::DB::Fasta;
+use Bio::SeqIO;
+use Getopt::Long;
+
+my $posfile = 'positive_polyA.fa';
+my $negfile = 'negative_polyA.fa';
+my $min_len_UTR = 100;
+my $gff;
+my $polyAfasta;
+my $db;
+my $polyA_window_width = 50;
+my $upstream = 200;
+my $downstream = 150;
+
+GetOptions('p|pos:s' => \$posfile,
+ 'n|neg:s' => \$negfile,
+ 'g|gff:s' => \$gff,
+ 'f|pasa|polyA:s' => \$polyAfasta,
+ 'db:s' => \$db,
+ 'm|min:i' => \$min_len_UTR,
+ 'w|window:i' => \$polyA_window_width,
+ 'u|upstream:i' => \$upstream,
+ 'd|downstream:i' => \$downstream,
+ );
+
+# arguments
+# genome
+# GFF file with three_prime_utr features
+
+
+if( ! defined $db ) {
+ die("no DB provided\n");
+}
+
+# --[negative]--------AAAAA-----------[negative]
+# -200-150 (-50 -IN- +50) +150-200
+
+my $dbh = Bio::DB::Fasta->new($db);
+my $out_positive = Bio::SeqIO->new(-format => 'fasta',
+ -file => ">$posfile");
+
+my $out_negative = Bio::SeqIO->new(-format => 'fasta',
+ -file => ">$negfile");
+my @sites;
+if( $gff ) {
+ open(my $in => $gff) || die $!;
+ my %count;
+ while(<$in>) {
+ next if /^\#/;
+ next if /^\s+/;
+ chomp;
+ my ($seqid,$src,$type,$start,$end, $score, $strand,$frame,$group) =
+ split(/\t/,$_);
+ next unless $type eq 'three_prime_utr';
+ next unless abs($end - $start) > $min_len_UTR;
+ my %group = map { split (/=/,$_) } split(';',$group);
+ my $id = $group{'Parent'};
+ unless (defined $id ) { die "no Parent tag for feature $_\n"}
+ if( $count{$id}++ ){
+ $id .= $count{$id};
+ }
+ push @sites, [$seqid,$start, $end, $strand, $id];
+ }
+} elsif( $polyAfasta ) {
+ my $seqin = Bio::SeqIO->new(-format => 'fasta', -file => $polyAfasta);
+ while( my $seq = $seqin->next_seq ) {
+ my $id = $seq->id;
+ my ($seqid,$start,$strand);
+ if( $id =~ /^(\w+)\-(\d+)\_([\+\-])/ ) {
+ ($seqid,$start,$strand) = ($1,$2,$3);
+ } else {
+ warn("cannot match id $id\n");
+ next;
+ }
+ my $desc = $seq->desc;
+ my @l = split(/\s+/,$desc);
+ push @sites, [$seqid,$start,$start,$strand,pop @l];
+ }
+}
+
+for my $site ( @sites ) {
+ my ($seqid,$start,$end,$strand,$id) = @$site;
+ my $seqlen = $dbh->length($seqid);
+
+ my ($posseq,@negseqs);
+ if( $strand eq '-' ) {
+ my ($left,$right) = ( $start - $polyA_window_width,
+ $start + $polyA_window_width-1);
+ $right = $seqlen if $right > $seqlen;
+ $left = 1 if $left <= 0;
+
+ my $posseqstr = $dbh->seq($seqid, $left => $right);
+ $posseq = Bio::Seq->new(-seq => $posseqstr,
+ -id => "$id.polyA",
+ -desc => sprintf("%s:%d..%d strand=rev",
+ $seqid,$right, $left));
+ $posseq = $posseq->revcom;
+ $right = $start - $downstream;
+ $left = $right - 2*$polyA_window_width+1;
+ my $negseqstr = $dbh->seq($seqid, $left => $right);
+ my $negseq = Bio::Seq->new(-seq => $negseqstr,
+ -id => "negDN_$id.polyA",
+ -desc => sprintf("%s:%d..%d strand=rev",
+ $seqid,$right, $left));
+ push @negseqs, $negseq->revcom;
+
+ $left = $start + $upstream;
+ $right = $left + 2*$polyA_window_width-1;
+ $negseqstr = $dbh->seq($seqid, $left => $right);
+ $negseq = Bio::Seq->new(-seq => $negseqstr,
+ -id => "negUP_$id.polyA",
+ -desc => sprintf("%s:%d..%d strand=rev",
+ $seqid,$right, $left));
+ push @negseqs, $negseq->revcom;
+ } else {
+ my ($left,$right) = ( $end - $polyA_window_width,
+ $end + $polyA_window_width-1);
+
+ my $posseqstr = $dbh->seq($seqid, $left => $right);
+ $posseq = Bio::Seq->new(-seq => $posseqstr,
+ -id => "$id.polyA",
+ -desc => sprintf("%s:%d..%d strand=fwd",
+ $seqid,$left,$right));
+
+ $left = $start - $upstream;
+ $right = $left + 2*$polyA_window_width-1;
+ my $negseqstr = $dbh->seq($seqid, $left => $right);
+ my $negseq = Bio::Seq->new(-seq => $negseqstr,
+ -id => "negUP_$id.polyA",
+ -desc => sprintf("%s:%d..%d strand=fwd",
+ $seqid, $left,$right));
+ push @negseqs, $negseq;
+
+ $left = $start + $downstream;
+ $right = $left + 2*$polyA_window_width-1;
+ $negseqstr = $dbh->seq($seqid, $left => $right);
+ $negseq = Bio::Seq->new(-seq => $negseqstr,
+ -id => "negDN_$id.polyA",
+ -desc => sprintf("%s:%d..%d strand=fwd",
+ $seqid, $left,$right));
+ push @negseqs, $negseq;
+ }
+ $out_positive->write_seq($posseq);
+ $out_negative->write_seq(@negseqs);
+}
View
73 analyses/trimpoly_predict_polysites.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+use strict;
+use Bio::DB::Fasta;
+use Bio::SeqIO;
+
+my %est;
+
+my ($trimpoly,$gff_aln,$db,$genome) = @ARGV;
+my ($queryfolder,$hitfolder,$alnfolder) = ('query','target','alns');
+
+mkdir($queryfolder);
+mkdir($hitfolder);
+mkdir($alnfolder);
+my $genome_dbh = Bio::DB::Fasta->new($genome);
+my $dbh = Bio::DB::Fasta->new($db);
+
+open(my $in => $trimpoly) || die $!;
+my $out = Bio::SeqIO->new(-format => 'fasta',
+ -file => ">$trimpoly.seqs");
+while(<$in>) {
+ my ($seqid,$perc_N,$end5,$end3,$initial_length) = split;
+ my ($len) = abs($end3 - $end5) + 1;
+ if( $initial_length != $len ) {
+ $est{$seqid} = [$end5,$end3,$initial_length];
+# print ("$seqid $end5 $end3 $initial_length\n");
+ my @polyA;
+ if( $end5 > 1 ) {
+ push @polyA, "5prime";
+ }
+ if( $end3 < $initial_length ) {
+ push @polyA, "3prime";
+ }
+ my $seqstr = $dbh->seq($seqid,$end5 => $end3);
+ my $s = Bio::Seq->new(-seq => $seqstr,
+ -id => $seqid,
+ -desc => join(",",@polyA));
+ $out->write_seq($s);
+ Bio::SeqIO->new(-format => 'fasta',
+ -file => ">$queryfolder/$seqid")->write_seq($s);
+ Bio::SeqIO->new(-format => 'fasta',
+ -file => ">$queryfolder/$seqid.untrim")->write_seq(
+ $dbh->get_Seq_by_acc($seqid));
+ }
+}
+
+open($in => $gff_aln ) || die $!;
+open(my $jobfh => ">run_exonerate.sh");
+print $jobfh "module load exonerate\n";
+my %seen;
+while(<$in>) {
+ next if /^\s+$/ || /^\#/;
+ chomp;
+ my ($seqid,$source,$type,$start,$end,$score,$strand,$frame,$group) = split(/\t/,$_);
+ my %group = map { split(/=/,$_) } split(/;/,$group);
+ my $target_str = $group{'Target'} || die "no target in feature\n";
+ my ($target,$tstart,$tend,$tstrand) = split(/\s+/,$target_str);
+ if( $est{$target} ) {
+ printf "%s -> %d..%d (%d bp) to aln parts %d..%d (%s) on genome %s:%d..%d (%s)\n",
+ $target,@{$est{$target}},
+ $tstart,$tend,$tstrand,
+ $seqid,$start,$end,$strand;
+ if( ! -f "$hitfolder/$seqid" ) {
+ my $outquery = Bio::SeqIO->new(-format => 'fasta',
+ -file => ">$hitfolder/$seqid");
+ $outquery->write_seq($genome_dbh->get_Seq_by_acc($seqid));
+ }
+ if( ! $seen{$target}++ ) {
+ #only need to request the alignment once
+ print $jobfh "exonerate -m est2genome --refine region --bestn 1 -q $queryfolder/$target -t $hitfolder/$seqid --showtargetgff > $alnfolder/$target.exonerate\n";
+ }
+ }
+
+}
Please sign in to comment.
Something went wrong with that request. Please try again.