Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Branch: master
Fetching contributors…

Cannot retrieve contributors at this time

executable file 301 lines (273 sloc) 8.637 kB
#!/usr/bin/perl
# This script takes two arguments:
# 1. A two-letter language code.
# 2. A directive number NNNN/NN/EC (or local variant) or an URL.
use strict;
use utf8;
use Tratten::Misc;
use Digest::MD5 qw(md5_hex);
use Digest::SHA qw(sha1_hex sha256_hex);
binmode STDOUT, ":utf8";
our %vocab = (
EN => {
EC => "EC",
source => "Source",
basis => "BASIS",
recitals => "RECITALS",
articles => "ARTICLES",
chapter => "CHAPTER",
annex => "ANNEX",
part => "Part",
recital => "Recital",
article1 => undef,
article => "Article",
references => "References",
},
SV => {
EC => "EG",
source => "Källa",
basis => "GRUND",
recitals => "SKÄL",
articles => "ARTIKLAR",
chapter => "KAPITEL",
annex => "BILAGA",
part => "Del",
recital => "Skäl",
article1 => undef,
article => "Artikel",
references => "Referenser",
},
DA => {
EC => "EF",
source => "Kilde",
basis => "GRUNDLAG",
recitals => "BETRAGTNINGER",
articles => "ARTIKLER",
chapter => "KAPITEL",
annex => "BILAG",
part => "Del",
recital => "Betragtning",
article1 => undef,
article => "Artikel",
references => "Referencer",
},
IT => {
EC => "CE",
source => "Fonte",
basis => "BASE",
recitals => "CONSIDERANDI",
articles => "ARTICOLI",
chapter => "CAPO",
annex => "ALLEGATO",
part => "Parte",
recital => "Considerando",
article1 => undef,
article => "Articolo",
references => "Riferimenti",
},
FR => {
EC => "CE",
source => "Source",
basis => "BASE",
recitals => "CONSIDÉRANTS",
articles => "ARTICLES",
chapter => "CHAPITRE",
annex => "ANNEXE",
part => "Partie",
recital => "Considérant",
article1 => "Article premier",
article => "Article",
references => "Références",
},
EL => {
EC => "ΕΚ",
source => "Πηγή",
basis => "ΒΑΣΗ",
recitals => "ΑΙΤΙΟΛΟΓΙΚΟ",
articles => "ΑΡΘΡΑ",
chapter => "ΚΕΦΑΛΑΙΟ",
annex => "ΠΑΡΑΡΤΗΜΑ",
part => "Μέρος",
recital => "Αιτιολογική",
article1 => undef,
article => "Άρθρο",
references => "Αναφορές",
},
RO => {
EC => "CE",
source => "SURSA",
basis => "BASE",
recitals => "RECITAL",
articles => "ARTICOLUL",
chapter => "CAPITOLUL",
annex => "ANEXA",
part => "Parte",
recital => "Recital",
article1 => undef,
article => "Articolul",
references => "Referinţe",
},
BG => {
EC => "ЕО",
source => "Източник",
basis => "основа",
recitals => "съображения",
articles => "членове",
chapter => "глава",
annex => "приложение",
part => "част",
recital => "съображения",
article1 => undef,
article => "Член",
references => "Справки",
},
);
my $version_string = &get_version_string;
our ($lang, $year, $num, $url);
our ($md5, $sha1, $sha256);
while (@ARGV) {
utf8::decode($_ = shift);
if (/^[a-zA-Z]{2}$/) { $lang = "\U$_\E" }
elsif (/^(\d+)\/(\d+)\/\p{IsUpper}{2}$/) { ($year, $num) = ($1,$2) }
elsif (/^(http:.*)$/) { $url = $1; }
else { die "Unrecognized argument: $_"}
}
die "Missing argument" unless $lang and (($year and $num) or $url);
die "No vocabulary defined for language $lang" unless $vocab{$lang};
print join ' ', map { $_ eq $lang ? "" : "[[\L$_\E:$year/$num/" . $vocab{$_}->{"EC"} . "]]" } keys %vocab;
print "\n";
%vocab = %{$vocab{$lang}};
$year = "19$year" if $year < 100;
$num = sprintf "%04d",$num;
our @urls =
("http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:3${year}L$num:$lang:HTML",
"http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:3${year}D$num:$lang:HTML");
@urls = $url if $url;
do {
$url = shift @urls or die "No working url";
print STDERR "$url\n";
$_ = `curl --url $url` or die;
$md5 = md5_hex($_);
$sha1 = sha1_hex($_);
$sha256 = sha256_hex($_);
utf8::decode($_);
} until /<title>/;
print "'''$1'''\n----\n" if /^\s*<strong>\s*(\S.+\S)\s*<\/strong>\s*$/m;
/<TXT_TE>\s*(.*)\s*<\/TXT_TE>/s or die;
$_ = $1;
s/<p>/\n/g;
s/<\/p>/\n/g;
s/^\s+/\n/gm;
#s/\n\n\n+/\n\n/gs; <-- Apparently hangs on large input...
s/\b(\d+\/\d+\/$vocab{EC})\b/[[$1]]/g;
s/^\s*-\s+(.*)$/\n* $1/gm;
s/^\s*$vocab{chapter}\s+(\w+)\s*$/\n==$vocab{chapter} $1==/gmi;
s/^\s*$vocab{annex}\s*$/\n==$vocab{annex}==/gmi;
s/^\s*$vocab{annex}\s+(\w+)\s*$/\n==$vocab{annex} $1==/gmi;
our $annex;
if (/^==$vocab{annex}/mp) {
($_, $annex) = (${^PREMATCH}, ${^MATCH} . ${^POSTMATCH});
$annex =~ s/^\s*($vocab{part}\s+\w+:\s+.*)$/\n====$1====/gmi;
}
s/^\s*$vocab{article1}\s*$/\n===$vocab{article1}===/gmi if defined $vocab{article1};
s/^\s*$vocab{article}\s+(\d+)\s*$/\n===$vocab{article} $1===/gmi;
s/^\s*$vocab{article}\s+(\d+)\s+(.*?)\s*$/\n===$vocab{article} $1===\n''$2''\n/gmi;
/^===($vocab{article} 1|$vocab{article1})===$/mp or die;
($_, our $articles) = (${^PREMATCH}, ${^MATCH} . ${^POSTMATCH});
s/^\s*\((\d+)\)\s+/\n===$vocab{recital} $1===\n/gm;
s/^\s*(\d+)\.\s+/\n===$vocab{recital} $1===\n/gm;
s/^(===$vocab{recital} 1===)$/\n==$vocab{recitals}==\n$1/m;
$_ .= "==$vocab{articles}==\n" unless /^==$vocab{chapter} /m;
$_ .= $articles . $annex;
our $done;
our ($L,$R) = ("(",")");
($L,$R) = ("[","]") if /^\s*\[\d+\]\s+/m;
while (/(\n\s*\Q$L\E\d+\Q$R\E\s+[^\n]*)+/sp) {
my @refs = ();
$done .= ${^PREMATCH} . "\n";
$_ = ${^POSTMATCH};
my $body = ${^MATCH};
$refs[$1] = $2 while $body =~ /^\s*\Q$L\E(\d+)\Q$R\E\s+(.*)$/gm;
$done =~ s/\Q$L\E(\d+)\Q$R\E/$refs[$1] ? "<ref>$refs[$1]<\/ref>" : $&/ge;
}
$_ = $done . $_;
our @stack;
our $accu;
our $lvl;
our $li = qr/
[A-Z]+(?{$lvl=1})|\d+(?{$lvl=2})|[a-z]+(?{$lvl=3})
|(?(?{$lang eq "EL"})ι?(?:[α-εζ-ι]|στ)(?{$lvl=4})|(*FAIL))
/xo;
sub indent { " " x (@stack * 2 - 1 + ($_[0] || 0)) }
sub maybe_indent { $accu .= &indent(@_) if $accu =~ /\n$/s }
sub end {
die unless @stack;
&maybe_indent;
shift @stack;
$accu .= "</li>\n" . &indent(1) . "</ol>\n";
}
sub chomps {
#$accu =~ s/(<br\/>|\s|\n)*$/$_[0]/s;
my $c = substr $accu, -1;
if ($c eq "\n" or $c eq " ") { chop $accu; goto &chomps }
elsif ($c eq ">" and substr($accu, -5) eq "<br/>") {
$accu = substr $accu, 0, -5;
goto &chomps;
}
}
my $should_nl = 0;
my $extended_hack = 0;
sub accu_ol { $accu .= $_[0]; $should_nl = 1; $extended_hack = 0 }
sub accu_li { $accu .= $_[0]; $should_nl = 1; $extended_hack = 0 }
sub accu {
if (not $extended_hack and $stack[0] == 3 and scalar @stack > 1) {
&chomps;
&end;
}
$accu .= $_[0]; $should_nl = 1;
}
our @lines = split /^/;
while (@lines and ($_ = shift @lines), 1) {
if (/^\s*\(?i\)\s/ and grep /^\s*\(?ii\)\s+/, @lines[0..3]) { $extended_hack = 1; &accu(&indent(1) . $_) }
elsif (/^\s*\(?[iv]{2,}\)\s/) { &accu(&indent(1) . $_) } # assume roman
elsif (/^\s*\*\s/) { (&chomps, $accu .= "\n") if @stack; &accu($_); $should_nl = 0; }
elsif (/^\s*$/) { $accu .= @stack ? (&indent(1) . "<br/>\n") : "\n" if $should_nl }
elsif (/^\s*(?|\(?($li)\)|($li)\.)\s*(.*)$/) {
my ($a,$b) = ($1,$2);
if ($a =~ /^[A1aα]$/) {
&chomps if @stack;
$accu .= "\n" if @stack and $stack[0] < $lvl;
&end while $stack[0] >= $lvl;
&maybe_indent(1);
unshift @stack, $lvl;
if ($lvl == 1) { &accu_ol("<ol style=\"list-style-type:upper-latin\">\n" . &indent . "<li>'''$b'''\n") }
elsif ($lvl == 2 or $lvl == 4) { &accu_ol("<ol>\n" . &indent . "<li>$b\n") }
elsif ($lvl == 3) { &accu_ol("<ol style=\"list-style-type:lower-latin\">\n" . &indent . "<li>$b\n") }
else { die }
}
elsif (grep { $_ == $lvl } @stack) {
&chomps;
&end while $stack[0] != $lvl;
&maybe_indent;
if ($lvl == 1) { &accu_li("</li>\n" . &indent . "<li>'''$b'''\n") }
else { &accu_li("</li>\n" . &indent . "<li>$b\n") }
}
else { &accu(&indent(1) . $_) }
}
elsif (/^==/) { &chomps if @stack; &end while @stack; &accu($_) }
else { &accu(&indent(1) . $_) }
}
&chomps if @stack;
&end while @stack;
$_ = $accu;
print "$vocab{source}: $url\n----\n\n==$vocab{basis}==\n";
print;
print "==$vocab{references}==\n<references/>\n";
print "\n\n----\n\n";
chomp(my $date = `date -u +%F`);
my $home = "http://github.com/kattla/tratten-scripts/";
print "This page was generated on $date by $version_string, part of [$home tratten-scripts].\n\n";
print "Source URL: $url\n\n";
print "Source MD5sum: $md5\n\n";
print "Source SHA1: $sha1\n\n";
print "Source SHA256: $sha256\n\n";
Jump to Line
Something went wrong with that request. Please try again.