Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 279 lines (235 sloc) 9.52 KB
# run this script in a small xterm and paste urls
# into it, enter grep commands, then press ^D to
# clean up and repair urls and load them into
# firefox.
# summary: helper for firefox
# copyright: (c)2007-2009 GPL v3
# archive:
# created: 20070211
# last change: 20090620
use strict;
use vars;
# Bugs:
# - actually this should be a part of the firefox wrapper,
# as we may be called by it and do call it ourselves
# rather inefficiently...
# - maybe cache the extracted bookmarks in reloadable format
# --> freeze/thaw/storable... unless updated
# [even then, I'd better only update at most once/day
# unless requested by deletion]
# - firefox grep should be included not executed, and in
# turn split into extraction and selection, so multiple
# greps require only ONE read-in of all the bookmarks
# alternately: combine the kw/regexes? with or without
# retaining their ordering?
# Defaults and command line option parsing.
# NOTE(review): the loop header that fetches each argument into $_ (and its
# closing brace) is not part of the visible source; the statements below only
# make sense inside such a loop, since they test $_ and use next/last.
$runlength=40; # max no of urls to pass before waiting for RETURN
# for o_pat: recognize URLs in arbitrary text [also skips GREP lines]
# NOTE(review): the pattern announced by the line above (presumably $pat, which
# argv2url uses) is not defined anywhere in the visible source -- confirm.
# generic pattern to recognize protocols or servernames at start of url
# used in the split-loop invoking firefox at the end
my $servermatch='([a-zA-Z0-9\-]+(\.[A-Za-z0-9\-]+)+(?:[:\./#]|$))';
# use with cat FOLDER | formail -m4 -c -ds metamail -d | firefoxstdin -x
# Each option line: if $_ matches, record the setting and continue with the
# next argument.
do{$o_pat=1,next} if /^-(x|extract)$/; # removing also GREPs, etc
do{$o_verbose=1,next} if /^-(v|verbose)$/;
# list only: never run firefox, and raise the page size to 999
do{$o_norun=1,$runlength=999;next} if /^-(n|l|list|norun)$/;
# the page size is taken from the argument following the option
do{$runlength=shift, next} if /^-(p|runlength|pagesize)$/;
do{$runlength=999, next} if /^-np$/;
# print the usage text, then terminate via die
do{&help; die} if /^-?-(h|help)$/;
do{push @firefoxopt, $_; next} if /^-(3|sm)$/; # collect and forward to firefox later on
# explicit end of options
last if /^--$/;
# short cuts: url collections, bookmark grepping,
# bookmark keyword/shortcut url expansion
# Each shortcut appends one or more GREP lines to $urls.
# -k REGEX QUERY: backslash-escape single quotes in both following arguments,
# append a "GREP -k" line and consume both arguments.
do{$ARGV[0]=~s/'/\\'/g; $ARGV[1]=~s/'/\\'/g;
$urls.="GREP -k '::$ARGV[0]::' '$ARGV[1]'\n",shift,shift,next}
if /^-(k|kw|key|keyword)$/;
# -grep REGEX: append a "GREP -i" line and consume the argument.
# NOTE(review): the opening "do{" of this statement is missing from the
# visible source; only its tail is present.
$urls.="GREP -i '$ARGV[0]'\n",shift,next}
if /^-grep$/;
# predefined GREPs for bookmarks tagged 'NAME ::'
do{$urls.="GREP 'daily ::'\n",next} if /^-daily$/;
do{$urls.="GREP 'nightly ::'\n",next} if /^-nightly$/;
do{$urls.="GREP 'weekly ::'\n",next} if /^-weekly$/;
do{$urls.="GREP 'monthly ::'\n",next} if /^-monthly$/;
do{$urls.="GREP 'monthly1 ::'\n",next} if /^-monthly1$/;
do{$urls.="GREP 'monthly2 ::'\n",next} if /^-monthly2$/;
# -monthly0 combines the three monthly GREPs
do{$urls.="GREP 'monthly ::'\nGREP 'monthly2 ::'\nGREP 'monthly1 ::'\n",next} if /^-monthly0$/;
# note that news.daily is impl. directly for speed
# The two patterns below require the ".daily" suffix; a bare -news or -jobs
# does not match.
do{$urls.="GREP 'news.daily ::'\n",next} if /^-news(\.daily)$/;
do{$urls.="GREP 'jobs.daily ::'\n",next} if /^-jobs(\.daily)$/;
# not a recognized option: put the argument back at the front of @ARGV
unshift @ARGV,$_;
# bugs
# - might be useful to use a more lenient $pat allowing for GREP
# short notations like as a sanity filter on auto-expanded
# urls [i.e. lines containing quoting, =3A, etc, pp]
# help: print the usage text with a plain print, i.e. to the currently
# selected output handle. Takes no arguments. The -h / -help / --help option
# handler calls it and then terminates the script with die.
# The heredoc is interpolating: $0 and $runlength are expanded, while the
# backslash-escaped \$0 and \$pat are printed literally.
# NOTE(review): the heredoc terminator (EOF) and the closing brace of the sub
# are not part of the visible source.
sub help {
print <<EOF;
cat - | $0 [OPTIONS] [URL ...]
\$0 accepts URLs or GREP strings as arguments or from stdin. It expands
GREP strings into matching URLs from firefox bookmarks and tries to
correct URLs that have look like being pasted from email or text. This
included truncated http: prefix, trailing dots or
quoted-printable-damage. In case of doubt, multiple URLs are
generated. Finally, all URLs are fed to firefox, one per invocation,
with an option to pause every n URLs (default: $runlength).
- / -- last option
-h / -help / --help
-n / -list / -norun list URLs, but do not feed them to firefox
-np no pause (actually -p 999)
-p / -runlength / -pagesize RUNLENGTH
pause for user input every RUNLENGTH URLs
-v / -verbose
-x / -extract: extract URLs from arbitrary text, ignoring
GREPs and incomplete URLs
GREP bookmark options (REGEX/QUERY is single-quoted):
-grep REGEX GREP -i within the firefox bookmarks
-k / -kw / -key / -keyword REGEX QUERY
GREP for a shortcut URL with a keyword
fully matching REGEX and substitute %s with
the QUERY string
[may fail for POST]
Predefined GREPs:
-daily / -weekly / -monthly / -news.daily / -jobs.daily
GREP for 'OPTIONNAME ::' in the bookmarks
- the option -kw google "my google query for o'reilly" is translated to
GREP -k '::google::' 'my google query for o\\'reilly', which also is
the format to enter this query on stdin. Both the :: and the -k merely
restrict the fields the GREP is looking at. The :: denote the keyword
field, so any google in the url won't match. To just list the matching
URLs, do use the shell command firefoxgrep with the arguments
-k ::google:: "my google query for o'reilly".
- as seen above, GREP lines from stdin need to obey shell-quoting rules
[at the moment, GREPPing is done by the external script firefoxgrep]
- non-url-ish lines are ignored, e.g. lines containing only a single word.
use -x for more severe testing (requiring sane hostnames and/or protocols)
acc. to the contents of the \$pat expression.
# Slurp mode: with $/ undefined, a read returns all remaining input as one
# string instead of one line at a time.
undef $/;
# Decide where the URL text comes from, depending on whether arguments are
# left in @ARGV and whether option shortcuts already put GREP lines in $urls.
# NOTE(review): the statements that actually read the input, and the closing
# braces of these branches, are not part of the visible source.
if (@ARGV or not $urls) {
if (not @ARGV and not $urls) {
# Show the input hint only when stdin is a terminal. -t tests a filehandle,
# not a path: given the string "/dev/stdin" it finds no handle of that name
# and yields undef, so the hint could never appear. Test STDIN itself.
print main::STDERR "# firefoxstdin bookmark support: GREP [-k EXPR] EXPR (shell quoting!)\n\n" if -t STDIN;
print main::STDERR "\n# processing list\n"
} else {
#print main::STDERR "urls:\n$urls\n\n";
# argv2url: turn the text in $_ into a newline-separated list of cleaned-up
# URLs, working on $_ in place.
# Steps visible here: try to open $_ as a file and read it in 100-byte
# chunks, otherwise treat $_ itself as URL text; normalize line endings;
# repair URLs split across lines by mail software; strip <...> brackets and a
# leading "firefox" word; with -x ($o_pat) keep only what matches $pat; run
# firefoxgrep through backticks; finally strip bullets, quotes, comments and
# trailing punctuation.
# NOTE(review): large parts of this sub are not in the visible source (several
# substitutions with a do{ replacement lack their closing part, and the end of
# the sub is not shown), so its return value and exact flow cannot be
# confirmed from here.
sub argv2url {
if (open(FH, "<",$_)) {
while(read FH, $tmp, 100) {
close FH;
} else {
# if it doesn't exist in the filesystem, it's not a file containing urls but an url itself
s@[\r\n]+@\n@go; # sanitize crlf
# try correcting mail-provided multi-line urls
$org=$tmp=$1; $tmp=~s%=\n%%go;
# ditto with \n+ type splits
s@(^firefox (.*\n\+)+.*\n)@do{
$org=$tmp=$1; $tmp=~s%\n\+% %go;
# the user should take more care in pasting...
$org=$tmp1=$tmp2=$1; $tmp1=~s%\n\+%%go; $tmp2=~s%\n\+% %go;
# infamous punctuation at EOL; ordinarily it could be
# stripped, but some servers return other pages ...
$org=$tmp=$1; $tmp=~s%[\.,:;](?=\n)%%go;
# <url>
s@^\s*<(.*)>(\s.*$|$)@$1 $2@mgo;
# if prefixed with firefox, strip it and make each subsequent url
# into a single line
s@^[\t ]*firefox[\t ]+(.*)@do{
$tmp=~s! !\n!go;
# extra text filtering if input is just a pasted de-MIMEd email
# e.g. from cat FOLDER | formail -m4 -c -ds metamail -d | $0
# with suitable mimedecode, metamail, ... options
# and formail -c to make all headers just one line - i.e. get
# filtered out with the option below
if ($o_pat) {
# because of country codes, we cannot further require .org, .com, .net...
# mailto, etc is ignored, as it doesn't make much sense in browse all urls context
# note that mere browsing can also result in actions like confirm
# email opt in...
s@^(?!$pat).*@@mgoi; # it must match $pat
s@($pat).*@$1@mgoi; # and anything else in the line is removed
# expand bookmarks [user input must handle shell quoting]
# BUG: really invoke the grep independently each time!?
# NOTE(review): $2 is interpolated unquoted into a shell command line below;
# the capture that sets it is not in the visible source.
#warn "firefoxgrep $2\n";
`firefoxgrep $2`
# cleaning urls
s@^\s*[\+\-\*\.]@@mgo; # list-bullet char followed by url
s@["'"'"'>\s]+#.*$@@go; # trailing synsugar with eol comments
s@["'"'"'>\s]*$@@go; # just trailing synsugar
s@^["'"'"'<\s]*@@go; # leading synsugar
s@\s*\n\s*@\n@go; # compress empty lines
s@[,\.\;\:\s]*$@@go; # drop trailing punctuation
# Main loop: split the cleaned-up text in $_ into one candidate per line and
# hand each acceptable URL to firefox.
# NOTE(review): the closing braces of this loop and any statement that
# increments $i are not part of the visible source.
@urls=split /\n/, $_;
foreach(@urls) {
# Once $i has reached $runlength, wait for RETURN on the terminal and reset
# the counter; never pauses in list mode.
do{system "bash -c 'echo press return to continue; (read a)</dev/tty'"; $i=0} if $runlength==$i and not $o_norun;
# Skip a line if it is a comment, or if it neither starts with an
# (optionally protocol-prefixed) server name nor contains an @.
# Precedence: "and" binds tighter than "or", so this reads
# comment or (no server match and no @).
if (/^\s*#/ or not m@^(?:[a-zA-Z]+?:/?/?)?$servermatch@ and not /@/) {
print "skipping $_\n" if $o_norun or $o_verbose;
} else {
# As written, the sleep happens when $i is non-zero and NOT a multiple of 5.
sleep 2 if $i and $i % 5 and not $o_norun; # short sleeps
# might reduce number of x communication errors
print "running " if $o_verbose;
print "$_\n" if $o_norun or $o_verbose;
# List-form system: the URL is passed as one argument without a shell.
system "firefox", @firefoxopt, "$_" if not $o_norun; # which include pinging, remoting and sleeping to avoid probs
some test data:
1 extraction via firefoxstdin -x -n of
a b
thuis dfhyy f6y gfugf.
gdhjutju kl;rmtdm r
should return
running www.leo