Skip to content
Newer
Older
100755 364 lines (305 sloc) 11.5 KB
54e85fa @jgm initial commit
authored
1 #!/usr/bin/perl -w
2 #
62616b7 @jgm proper order of sections, notes at end
authored
3 # Version history:
4 #
daabc5c @jgm use File::Spec and File::Basename for platform-independent manipulati…
authored
5 # sep-offprint 1.11 - John MacFarlane - August 16, 2007
6 # + use File::Spec and File::Basename for platform-independent
7 # manipulation of files and directories.
fee5a00 @jgm added --output option
authored
8 # sep-offprint 1.1 - John MacFarlane - August 16, 2007
9 # + added --output|o option to specify output filename
62616b7 @jgm proper order of sections, notes at end
authored
10 # sep-offprint 1.0 - John MacFarlane - July 19, 2007
11 # + include supplements in the order they are linked to
12 # + always put notes at the end
fb1d758 @jgm fixed bugs in ordering; limit 0 on lwp-rget; removed --localpath
authored
13 # + removed --localpath option; use file:/// URL instead
acddfb4 @jgm fixed regex for stripping off SEP header (thanks to George Galfalvi)
authored
14 # sep-offprint 0.9 - John MacFarlane - March 8, 2007
15 # + fixed regex for stripping off SEP header (thanks to George Galfalvi)
0dae500 @jgm strip off (S. E. P.) from HTML title
authored
16 # sep-offprint 0.8 - John MacFarlane - February 22, 2007
17 # + strip off "(Stanford Encyclopedia of Philosophy)" from
364f485 @jgm thank Uri Nodelman instead of Ed Zalta
authored
18 # HTML title (thanks to Uri Nodelman)
f0bf547 @jgm Bug fixes due to Dan Robins
authored
19 # sep-offprint 0.7 - John MacFarlane - January 23, 2007
a0cf069 @jgm small comment change
authored
20 # + include supplements, if present (thanks to Dan Robins)
afbab8a @jgm added error checking for presence of index.html
authored
21 # + removed unnecessary call to lwp-rget (Dan Robins)
6ea32ff @jgm added --linkcolor option, fixed --version and --help
authored
22 # + added --linkcolor option (JM and Dan Robins)
afbab8a @jgm added error checking for presence of index.html
authored
23 # + added error checking: error exit if index.html not found
6ea32ff @jgm added --linkcolor option, fixed --version and --help
authored
24 # + fixed '--version' and adjusted '--help' output
e916cb0 @jgm changed version to 0.6
authored
25 # sep-offprint 0.6 - John MacFarlane - August 30, 2006
fc5066a @jgm version bump
authored
26 # sep-offprint 0.5 - John MacFarlane - August 25, 2006
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
27 # sep-offprint 0.4 - John MacFarlane - August 22, 2006
54e85fa @jgm initial commit
authored
28 # sep-offprint 0.3 - John MacFarlane - May 25, 2005
29 #
62616b7 @jgm proper order of sections, notes at end
authored
30 # Synopsis:
31 #
54e85fa @jgm initial commit
authored
32 # produces a PDF or postscript "offprint" of a Stanford
33 # Encyclopedia of Philosophy (SEP) article
34 #
35 # Argument is an entry name from SEP, as it appears in the URL.
36 # For example, to get the article on classical logic, which is at
37 # http://plato.stanford.edu/entries/logic-classical/, just type
38 #
39 # perl sep-offprint logic-classical
40 #
f0bf547 @jgm Bug fixes due to Dan Robins
authored
41 # and it will create logic-classical.pdf.
54e85fa @jgm initial commit
authored
42 #
43 # There are many command-line options. For a list, type
44 #
f0bf547 @jgm Bug fixes due to Dan Robins
authored
45 # perl sep-offprint --help
54e85fa @jgm initial commit
authored
46 #
b294df2 @jgm use lwp-rget instead of wget
authored
47 # The programs html2ps and ps2pdf must be in the user's path:
54e85fa @jgm initial commit
authored
48 #
49 # html2ps can be found at http://user.it.uu.se/~jan/html2ps.html.
50 # Download the tarball or zip file and run the "install" script.
51 #
52 # ps2pdf is part of Ghostscript -- many users will have it
53 # already: http://www.cs.wisc.edu/~ghost/doc/AFPL/get851.htm
54 #
b294df2 @jgm use lwp-rget instead of wget
authored
55 # In addition, the LWP package for Perl must be installed.
56 #
54e85fa @jgm initial commit
authored
57 # For more information and updates, see
58 # http://philosophy.berkeley.edu/macfarlane/sep-offprint.html
59
62616b7 @jgm proper order of sections, notes at end
authored
# Version string reported by --version.  It must match the newest entry in
# the version history at the top of this file (it had been left at 1.0).
my $version_number = '1.11';
6ea32ff @jgm added --linkcolor option, fixed --version and --help
authored
61
54e85fa @jgm initial commit
authored
62 use Getopt::Long;
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
63 use File::Temp qw/ tempdir /;
4e7bee9 @jgm use File::Copy instead of cp; other minor improvements
authored
64 use File::Copy;
daabc5c @jgm use File::Spec and File::Basename for platform-independent manipulati…
authored
65 use File::Basename;
66 use File::Spec;
54e85fa @jgm initial commit
authored
67
9e79250 @jgm print completion messages to STDERR
authored
# printhelp - emits the usage message and terminates the script.
# Takes no arguments.  The text is passed to die, so it is written to STDERR
# and the sub never returns; because the message ends in a newline, Perl does
# not append a file/line suffix to it.

sub printhelp {
    my $usage = <<'END_USAGE';
Produces a PDF offprint from a Stanford Encyclopedia of Philosophy article.
(http://plato.stanford.edu/)

Usage: sep-offprint [options] <entry name>

Examples: sep-offprint russell
sep-offprint --1up --ps --paper a4 frege

Options (* indicates a default):

--1up print one page per sheet, portrait orientation
--2up print two pages per sheet, landscape orientation*
--ps produce postscript (PS) output
--pdf produce PDF output*
--output <filename> name of output file (defaults to <entryname>.ps|pdf)
--font <font> use <font> (Times*, Helvetica, Palatino, Courier)
--size <size> use <size> (10pt, 12pt, 14pt*, 16pt)
--align <align> use <align> (left, justified*)
--paper <papersize> specify <papersize> (letter*, legal, a4)
--linkcolor <color> specify color of hyperlinks (black*, gray, blue, ...)
--help this message
--version prints version number
END_USAGE
    die $usage;
}
95
62616b7 @jgm proper order of sections, notes at end
authored
# slurp - reads a whole file and returns its contents as a single string.
# Takes the filename as its only argument.
# Dies (with the system error text) if the file cannot be opened.

sub slurp {
    my $file = shift;

    # Three-argument open with a lexical handle: the filename is used
    # verbatim (no mode characters or surrounding whitespace are interpreted),
    # and the handle cannot clash with any other use of a FILE glob.
    open(my $in, '<', $file) or die "Couldn't open $file to read: $!";

    # An undefined input record separator makes <$in> return everything
    # in one read; 'local' restores the previous value on return.
    local $/;
    my $contents = <$in>;
    close($in);

    return $contents;
}
107
fb1d758 @jgm fixed bugs in ordering; limit 0 on lwp-rget; removed --localpath
authored
# uniq - returns the arguments with duplicates removed, keeping the first
# occurrence of each value and preserving the original order.
# In scalar context the result is the number of distinct values.

sub uniq {
    # Lexical bookkeeping hash: a fresh one per call, so the sub no longer
    # wipes or leaves data in a package-global %seen.
    my %seen;
    return grep { !$seen{$_}++ } @_;
}
115
62616b7 @jgm proper order of sections, notes at end
authored
# preprocess_html - rewrites one downloaded HTML file in place: strips the
# page header, tidies the title, publication date and copyright notice, and
# replaces numeric character references with entities, plain characters or
# images.
#
# Takes the filename as its argument.  When called without arguments it
# falls back to $_, so "preprocess_html foreach <*.html>" keeps working.
# Dies if the file cannot be read or written.

sub preprocess_html {
    my $file = @_ ? shift : $_;
    my $contents = slurp($file);

    # get rid of header stuff
    $contents =~ s/<body>.*?<!--DO NOT MODIFY THIS LINE AND ABOVE-->/<body><div id="content"><div id="aueditable">/gs;

    # get rid of "(Stanford Encyclopedia of Philosophy)" in title:
    $contents =~ s/<title>(.*)\ \(Stanford Encyclopedia of Philosophy\)/<title>$1/;

    # make publication date into regular paragraph
    $contents =~ s{<br /><span class="xsmall">(.*)</span></h1>}{</h1><p>$1</p>}g;

    # center copyright notice
    $contents =~ s{<div id="foot">(.*?)</div>}{<center>$1</center>}gs;

    # replace unicode character references
    # (lexical: this table is only needed inside this sub)
    my %replacements = (
        '&#133;'  => '&hellip;',
        '&#145;'  => '&lsquo;',
        '&#146;'  => '&rsquo;',
        '&#147;'  => '&ldquo;',
        '&#148;'  => '&rdquo;',
        '&#149;'  => '&bull;',
        '&#150;'  => '&minus;',
        '&#257;'  => 'a',
        '&#261;'  => 'a',
        '&#263;'  => 'c',
        '&#269;'  => 'c',
        '&#281;'  => 'e',
        '&#299;'  => 'i',
        '&#321;'  => 'L',
        '&#322;'  => 'l',
        '&#324;'  => 'n',
        '&#333;'  => 'o',
        '&#345;'  => 'r',
        '&#346;'  => 'S',
        '&#347;'  => 's',
        '&#351;'  => 's',
        '&#363;'  => 'u',
        '&#365;'  => 'u',
        '&#369;'  => 'u',
        '&#378;'  => 'z',
        '&#380;'  => 'z',
        '&#381;'  => 'Z',
        '&#599;'  => 'u',
        '&#768;'  => '',
        '&#769;'  => '',
        '&#770;'  => '',
        '&#771;'  => '',
        '&#772;'  => '',
        '&#773;'  => '',
        '&#775;'  => '',
        '&#803;'  => '',
        '&#8209;' => '-',
        '&#8600;' => '<img alt="southeast-arrow" src="http://plato.stanford.edu/symbols/searrow.gif">',
        '<sup>&#9484;</sup>' => '<img alt="left-corner-quote" src="http://plato.stanford.edu/symbols/l-corner-quote.gif">',
        '<sup>&#9488;</sup>' => '<img alt="right-corner-quote" src="http://plato.stanford.edu/symbols/r-corner-quote.gif">',
        '&#8463;' => '<img alt="hbar" src="http://plato.stanford.edu/symbols/hbar.gif">',
        '&#9633;' => '<img alt="Box" src="http://plato.stanford.edu/symbols/Box.gif">',
    );
    # No key is a substring of another, so the (unordered) hash traversal
    # cannot change the result.  \Q...\E makes each key match literally.
    for my $ref (keys %replacements) {
        my $rep = $replacements{$ref};
        $contents =~ s/\Q$ref\E/$rep/g;
    }

    # write back to file
    open(my $out, '>', $file) or die "Couldn't open $file to write: $!";
    print {$out} $contents;
    # buffered write errors (e.g. a full disk) only surface at close
    close($out) or die "Couldn't finish writing $file: $!";
}
190
191 #
192 # parse command-line options
193 #
194
54e85fa @jgm initial commit
authored
# Fill the option globals from the command line.  GetOptions returns false
# when it meets an unknown or malformed option (it has already printed a
# warning by then); in that case show the usage message and stop instead of
# carrying on with a partly parsed command line.
GetOptions(
    '1up|1'       => \$oneup,
    '2up|2'       => \$twoup,
    'ps'          => \$ps,
    'pdf'         => \$pdf,
    'output|o=s'  => \$outfile,
    'font=s'      => \$fontfamily,
    'size=s'      => \$fontsize,
    'align=s'     => \$textalign,
    'paper=s'     => \$papersize,
    'linkcolor=s' => \$linkcolor,
    'help|h'      => \$help,
    'version|v'   => \$version,
) or printhelp();
207
6ea32ff @jgm added --linkcolor option, fixed --version and --help
authored
# --version: report the version on STDERR and stop.
die "sep-offprint $version_number\n" if $version;

# An entry name (or URL) is required; without one, show the usage message.
printhelp() unless @ARGV;

# First positional argument, minus any trailing slash.
($sourceArg = $ARGV[0]) =~ s{/$}{};

# Derive the bare entry name from the argument.
$entryname = $sourceArg;
for ($entryname) {
    tr/A-Z/a-z/;             # lower-case
    tr/ /-/;                 # spaces become hyphens
    s{/index\.html$}{};      # drop /index.html if it was given
    s{.*/}{};                # drop the URL prefix (everything up to the last slash)
}

# A file: URL is used as given (local source); anything else is looked up
# on the SEP web site by entry name.
$source = ($sourceArg =~ /^file:/)
    ? $sourceArg
    : "http://plato.stanford.edu/entries/$entryname/";

# The page footer shows where the article came from.
$footer = $source;
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
236
daabc5c @jgm use File::Spec and File::Basename for platform-independent manipulati…
authored
# Directory the script was started from (File::Spec's name for "here").
$current = File::Spec->curdir;

# --help wins over everything else once the arguments have been read.
if ($help) { printhelp(); }

# Fill in defaults for every option the user left unset.
$pdf = 1 if not ($pdf or $ps);      # PDF unless a format was requested
$twoup = $oneup ? 0 : 1;            # --1up overrides the 2-up default
$fontsize   ||= "14pt";
$outfile    ||= $entryname;
$fontfamily ||= "Times";
$textalign  ||= "justify";
$papersize  ||= "letter";
$linkcolor  ||= "black";

# The usage message advertises "--align justified", but the value is written
# verbatim into the style sheet's text-align property, where this script's own
# default is "justify".  Accept the advertised spelling and map it to that.
$textalign = "justify" if lc($textalign) eq "justified";

# strip .pdf or .ps extension from outfile name, and add path:
# (the absolute path is computed now because the script later changes into
# a temporary directory)
my ($filename, $directories, $suffix) = fileparse($outfile, qr/\.pdf|\.ps/);
if (not $directories) { $directories = $current }
my $outpath = File::Spec->rel2abs(File::Spec->catfile($directories, $filename));
253
62616b7 @jgm proper order of sections, notes at end
authored
# create temporary directory (removed automatically when the script exits)
$temp = tempdir( CLEANUP => 1 );

# Work inside the temporary directory so that everything lwp-rget saves and
# every intermediate file lands there.  Stop if that is not possible, rather
# than scattering files over the user's current directory.
chdir $temp or die "Could not change to temporary directory $temp: $!\n";

# download all the HTML files
print STDERR "Retrieving files...\n";

# Build the URL of the entry's index page.  A trailing "/" or a trailing
# "/index.html" on $source is dropped first, so that a file: URL that already
# names index.html does not end up as ".../index.html/index.html".
(my $indexUrl = $source) =~ s{/(?:index\.html)?$}{};
$indexUrl .= "/index.html";

# lwp-rget's output is captured (stderr included) because it is parsed later
# to find out which pages were fetched and in what order.
$downloadedFiles = `lwp-rget --limit=200 $indexUrl 2>&1`;
(-e "index.html") or die "Could not retrieve files from $source\nAre you sure you have the right entry name?\n";
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
265
62616b7 @jgm proper order of sections, notes at end
authored
# create blank html file to work around html2ps bug.
# without this blank file after notes.html, html2ps will cut off
# the last page of an entry if it occurs in the left column in 2up mode.

$blank = "blankpage";

# Three-argument open with a lexical handle; the close is checked because
# buffered write errors only show up there.
open(my $blankFh, '>', $blank) or die "unable to open $blank: $!";

print {$blankFh} <<'EOF';
<html>
<head>
<title>&nbsp;</title>
</head>
<body>
<p>&nbsp;</p>
</body>
</html>
EOF

close($blankFh) or die "unable to write $blank: $!";
286
62616b7 @jgm proper order of sections, notes at end
authored
# create a configuration file for html2ps with the chosen fonts, layout,
# paper size, header and footer

$html2psrc = "html2psrc";

# Three-argument open with a lexical handle; the close is checked because
# buffered write errors only show up there.
open(my $rcFh, '>', $html2psrc) or die "unable to open $html2psrc: $!";

# Interpolating heredoc: the option variables are filled in here, while
# \@page, \@html2ps, \$T and \$N are escaped so they reach the file literally.
print {$rcFh} <<EOF;
BODY {
font-size: $fontsize;
font-family: $fontfamily;
text-align: $textalign;
}
A:link {
color: $linkcolor;
}
\@page {
margin-left: 2.5cm;
margin-right: 2.5cm;
margin-top: 2.5cm;
margin-bottom: 2.5cm;
}
\@html2ps {
option {
twoup: $twoup;
landscape: $twoup;
number: 0;
}
paper { type: $papersize }
header {
right: "STANFORD ENCYCLOPEDIA OF PHILOSOPHY";
left: \$T;
}
footer {
left: \$N;
right: $footer;
}
}
EOF

close($rcFh) or die "unable to write $html2psrc: $!";
327
# temporary file that receives the postscript output of html2ps
$pstemp = "pstemp";

# clean up every HTML file in the working (i.e., temp) directory;
# each filename is handed to preprocess_html through $_
preprocess_html() for glob("*.html");
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
333
62616b7 @jgm proper order of sections, notes at end
authored
#
# determine the order in which the HTML pages should be processed:
#

# Every line of the captured lwp-rget output that ends in ".html" is taken
# to be a saved file, in download order.  Surrounding whitespace is removed
# so the names can be compared exactly below.
@htmlFiles = $downloadedFiles =~ /^.*\.html$/gim;
s/^\s+|\s+$//g for @htmlFiles;

# set $notes to "notes.html" if there are notes
# (exact name match, so e.g. "footnotes.html" is not mistaken for it)
my $notes = (grep { $_ eq 'notes.html' } @htmlFiles) ? "notes.html" : "";

# make a space-separated list of the HTML files to process, in order,
# leaving out index.html and notes.html: the html2ps command line always
# puts index.html first and the notes last.  Filtering whole names instead
# of deleting substrings keeps names that merely contain "index.html" or
# "notes.html" intact.
my $orderedHtmlFiles = join(' ',
    grep { $_ ne 'index.html' && $_ ne 'notes.html' } @htmlFiles);
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
351
fb1d758 @jgm fixed bugs in ordering; limit 0 on lwp-rget; removed --localpath
authored
print STDERR "Creating offprint...\n";

# call html2ps to create the postscript version of the entry:
# index page first, then the other pages in download order, then the notes,
# then the blank page
system("html2ps -D -U -f $html2psrc -o $pstemp index.html " . $orderedHtmlFiles . " $notes $blank");

# Without postscript output there is nothing to convert or copy, so stop
# here with a clear message instead of failing silently further down.
(-s $pstemp) or die "html2ps did not produce any output; is html2ps installed and in your path?\n";

# set when a requested output file could not be produced
my $failed = 0;

# create pdf if requested
# (the output path is double-quoted so that directories with spaces work;
# system returns 0 when the command succeeds)
if ($pdf) {
    if (system(qq{ps2pdf -sPAPERSIZE=$papersize $pstemp "$outpath.pdf"}) == 0) {
        print "Created $outpath.pdf\n";
    }
    else {
        warn "ps2pdf failed; $outpath.pdf was not created\n";
        $failed = 1;
    }
}

# copy ps file if requested
if ($ps) {
    if (copy($pstemp, "$outpath.ps")) {
        print "Created $outpath.ps\n";
    }
    else {
        warn "Could not create $outpath.ps: $!\n";
        $failed = 1;
    }
}

# report failure through the exit status
exit 1 if $failed;
54e85fa @jgm initial commit
authored
362
b2fde1b @jgm cleanup on sep-offprint
authored
363 # note: temporary directory will be deleted automatically on exit
Something went wrong with that request. Please try again.