Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100755 354 lines (297 sloc) 11.384 kb
54e85fa @jgm initial commit
authored
1 #!/usr/bin/perl -w
2 #
62616b7 @jgm proper order of sections, notes at end
authored
3 # Version history:
4 #
fee5a00 @jgm added --output option
authored
5 # sep-offprint 1.1 - John MacFarlane - August 16, 2007
6 # + added --output|o option to specify output filename
62616b7 @jgm proper order of sections, notes at end
authored
7 # sep-offprint 1.0 - John MacFarlane - July 19, 2007
8 # + include supplements in the order they are linked to
9 # + always put notes at the end
fb1d758 @jgm fixed bugs in ordering; limit 0 on lwp-rget; removed --localpath
authored
10 # + removed --localpath option; use file:/// URL instead
acddfb4 @jgm fixed regex for stripping off SEP header (thanks to George Galfalvi)
authored
11 # sep-offprint 0.9 - John MacFarlane - March 8, 2007
12 # + fixed regex for stripping off SEP header (thanks to George Galfalvi)
0dae500 @jgm strip off (S. E. P.) from HTML title
authored
13 # sep-offprint 0.8 - John MacFarlane - February 22, 2007
14 # + strip off "(Stanford Encyclopedia of Philosophy)" from
364f485 @jgm thank Uri Nodelman instead of Ed Zalta
authored
15 # HTML title (thanks to Uri Nodelman)
f0bf547 @jgm Bug fixes due to Dan Robins
authored
16 # sep-offprint 0.7 - John MacFarlane - January 23, 2007
a0cf069 @jgm small comment change
authored
17 # + include supplements, if present (thanks to Dan Robins)
afbab8a @jgm added error checking for presence of index.html
authored
18 # + removed unnecessary call to lwp-rget (Dan Robins)
6ea32ff @jgm added --linkcolor option, fixed --version and --help
authored
19 # + added --linkcolor option (JM and Dan Robins)
afbab8a @jgm added error checking for presence of index.html
authored
20 # + added error checking: error exit if index.html not found
6ea32ff @jgm added --linkcolor option, fixed --version and --help
authored
21 # + fixed '--version' and adjusted '--help' output
e916cb0 @jgm changed version to 0.6
authored
22 # sep-offprint 0.6 - John MacFarlane - August 30, 2006
fc5066a @jgm version bump
authored
23 # sep-offprint 0.5 - John MacFarlane - August 25, 2006
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
24 # sep-offprint 0.4 - John MacFarlane - August 22, 2006
54e85fa @jgm initial commit
authored
25 # sep-offprint 0.3 - John MacFarlane - May 25, 2005
26 #
62616b7 @jgm proper order of sections, notes at end
authored
27 # Synopsis:
28 #
54e85fa @jgm initial commit
authored
29 # produces a PDF or postscript "offprint" of a Stanford
30 # Encyclopedia of Philosophy (SEP) article
31 #
32 # Argument is an entry name from SEP, as it appears in the URL.
33 # For example, to get the article on classical logic, which is at
34 # http://plato.stanford.edu/entries/logic-classical/, just type
35 #
36 # perl sep-offprint logic-classical
37 #
f0bf547 @jgm Bug fixes due to Dan Robins
authored
38 # and it will create logic-classical.pdf.
54e85fa @jgm initial commit
authored
39 #
40 # There are many command-line options. For a list, type
41 #
f0bf547 @jgm Bug fixes due to Dan Robins
authored
42 # perl sep-offprint --help
54e85fa @jgm initial commit
authored
43 #
b294df2 @jgm use lwp-rget instead of wget
authored
44 # The programs html2ps and ps2pdf must be in the user's path:
54e85fa @jgm initial commit
authored
45 #
46 # html2ps can be found at http://user.it.uu.se/~jan/html2ps.html.
47 # Download the tarball or zip file and run the "install" script.
48 #
49 # ps2pdf is part of Ghostscript -- many users will have it
50 # already: http://www.cs.wisc.edu/~ghost/doc/AFPL/get851.htm
51 #
b294df2 @jgm use lwp-rget instead of wget
authored
52 # In addition, the LWP package for Perl must be installed.
53 #
54e85fa @jgm initial commit
authored
54 # For more information and updates, see
55 # http://philosophy.berkeley.edu/macfarlane/sep-offprint.html
56
62616b7 @jgm proper order of sections, notes at end
authored
# Version string printed by --version.  It must match the newest entry in
# the version history; it had been left at 1.0 when 1.1 added --output.
my $version_number = '1.1';
6ea32ff @jgm added --linkcolor option, fixed --version and --help
authored
58
54e85fa @jgm initial commit
authored
59 use Getopt::Long;
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
60 use File::Temp qw/ tempdir /;
4e7bee9 @jgm use File::Copy instead of cp; other minor improvements
authored
61 use File::Copy;
c71db0e @jgm remove header; use local copy; change entities to pictures when needed
authored
62 use Cwd;
54e85fa @jgm initial commit
authored
63
9e79250 @jgm print completion messages to STDERR
authored
# printhelp - print the usage message and terminate the program.
#
# Takes no arguments and never returns: the message is raised with die, so
# it goes to STDERR and the process exits with a non-zero status.  The
# trailing newline keeps Perl from appending "at FILE line N" to the text.
#
# The --align values listed here are written verbatim into the CSS
# text-align property of the html2ps configuration, so the default is
# spelled "justify".

sub printhelp {
    die
"Produces a PDF offprint from a Stanford Encyclopedia of Philosophy article.
(http://plato.stanford.edu/)

Usage:    sep-offprint [options] <entry name>

Examples: sep-offprint russell
          sep-offprint --1up --ps --paper a4 frege

Options (* indicates a default):

  --1up                 print one page per sheet, portrait orientation
  --2up                 print two pages per sheet, landscape orientation*
  --ps                  produce postscript (PS) output
  --pdf                 produce PDF output*
  --output <filename>   name of output file (defaults to <entryname>.ps|pdf)
  --font <font>         use <font> (Times*, Helvetica, Palatino, Courier)
  --size <size>         use <size> (10pt, 12pt, 14pt*, 16pt)
  --align <align>       use <align> (left, justify*)
  --paper <papersize>   specify <papersize> (letter*, legal, a4)
  --linkcolor <color>   specify color of hyperlinks (black*, gray, blue, ...)
  --help                this message
  --version             prints version number\n";
}
91
62616b7 @jgm proper order of sections, notes at end
authored
# slurp - read a whole file and return its contents as one string.
#
# Argument: the name of the file to read.
# Returns:  everything in the file, as a single scalar.
# Dies:     if the file cannot be opened (the message includes the OS error).
#
# The file is opened with three-argument open so that the name is always
# treated as a literal path: characters such as '>' or '|' and leading or
# trailing whitespace in the name cannot change how the file is opened.

sub slurp {
    my $file = shift;
    open(my $fh, '<', $file) or die "Couldn't open $file to read: $!";
    # Undefine the input record separator only for this read, so that a
    # single <$fh> returns the entire file.
    my $contents = do { local $/; <$fh> };
    close($fh);
    return $contents;
}
103
fb1d758 @jgm fixed bugs in ordering; limit 0 on lwp-rget; removed --localpath
authored
# uniq - remove duplicates from a list, preserving the order of the original.
#
# Arguments: the list of values to filter.
# Returns:   the values with every repeat after the first occurrence dropped
#            (in scalar context, the number of distinct values).
#
# The bookkeeping hash is lexical, so calling uniq does not disturb any
# package variable named %seen.

sub uniq {
    my %seen;
    return grep { !$seen{$_}++ } @_;
}
111
62616b7 @jgm proper order of sections, notes at end
authored
# preprocess_html - clean up one downloaded HTML file in place: strip the
# page header, tidy the title, date and copyright notice, and replace
# numeric character references with named entities, plain letters or images.
#
# Argument: the name of the file to process.  When called with no argument
#           the name is taken from $_, so the statement-modifier form
#           "preprocess_html foreach <*.html>" keeps working.
# Returns:  nothing useful; the file is rewritten with the new contents.
# Dies:     if the file cannot be read, opened for writing, or fully written.

sub preprocess_html {
    my $file = @_ ? shift : $_;
    my $contents = slurp($file);

    # get rid of header stuff
    $contents =~ s/<body>.*?<!--DO NOT MODIFY THIS LINE AND ABOVE-->/<body><div id="content"><div id="aueditable">/gs;

    # get rid of "(Stanford Encyclopedia of Philosophy)" in title:
    $contents =~ s/<title>(.*)\ \(Stanford Encyclopedia of Philosophy\)/<title>$1/;

    # make publication date into regular paragraph
    $contents =~ s/<br \/><span class="xsmall">(.*)<\/span><\/h1>/<\/h1><p>$1<\/p>/g;

    # center copyright notice
    $contents =~ s/<div id="foot">(.*?)<\/div>/<center>$1<\/center>/gs;

    # replace unicode character references
    my %replacement_for = (
        '&#133;'  => '&hellip;',
        '&#145;'  => '&lsquo;',
        '&#146;'  => '&rsquo;',
        '&#147;'  => '&ldquo;',
        '&#148;'  => '&rdquo;',
        '&#149;'  => '&bull;',
        '&#150;'  => '&minus;',
        '&#257;'  => 'a',
        '&#261;'  => 'a',
        '&#263;'  => 'c',
        '&#269;'  => 'c',
        '&#281;'  => 'e',
        '&#299;'  => 'i',
        '&#321;'  => 'L',
        '&#322;'  => 'l',
        '&#324;'  => 'n',
        '&#333;'  => 'o',
        '&#345;'  => 'r',
        '&#346;'  => 'S',
        '&#347;'  => 's',
        '&#351;'  => 's',
        '&#363;'  => 'u',
        '&#365;'  => 'u',
        '&#369;'  => 'u',
        '&#378;'  => 'z',
        '&#380;'  => 'z',
        '&#381;'  => 'Z',
        '&#599;'  => 'u',
        '&#768;'  => '',
        '&#769;'  => '',
        '&#770;'  => '',
        '&#771;'  => '',
        '&#772;'  => '',
        '&#773;'  => '',
        '&#775;'  => '',
        '&#803;'  => '',
        '&#8209;' => '-',
        '&#8600;' => '<img alt="southeast-arrow" src="http://plato.stanford.edu/symbols/searrow.gif">',
        '<sup>&#9484;</sup>' => '<img alt="left-corner-quote" src="http://plato.stanford.edu/symbols/l-corner-quote.gif">',
        '<sup>&#9488;</sup>' => '<img alt="right-corner-quote" src="http://plato.stanford.edu/symbols/r-corner-quote.gif">',
        '&#8463;' => '<img alt="hbar" src="http://plato.stanford.edu/symbols/hbar.gif">',
        '&#9633;' => '<img alt="Box" src="http://plato.stanford.edu/symbols/Box.gif">',
    );
    # \Q...\E makes each key match as literal text rather than as a regex;
    # sorting the keys only makes the processing order repeatable.
    for my $ref (sort keys %replacement_for) {
        my $rep = $replacement_for{$ref};
        $contents =~ s/\Q$ref\E/$rep/g;
    }

    # write back to file
    open(my $out, '>', $file) or die "Couldn't open $file to write: $!";
    print {$out} $contents;
    # buffered write errors only surface when the handle is closed
    close($out) or die "Couldn't finish writing $file: $!";
    return;
}
186
#
# Main program: parse the command line, download the entry into a temporary
# directory, preprocess the HTML, run html2ps over the pages in reading
# order, and deliver the PDF and/or PostScript result to the directory the
# script was started from.
#

#
# parse command-line options
#

my ($oneup, $twoup, $ps, $pdf, $outfile, $fontfamily, $fontsize,
    $textalign, $papersize, $linkcolor, $help, $version);

# GetOptions reports unknown options itself; show the usage text as well
# instead of carrying on with a half-understood command line.
GetOptions( '1up|1'       => \$oneup,
            '2up|2'       => \$twoup,
            'ps'          => \$ps,
            'pdf'         => \$pdf,
            'output|o=s'  => \$outfile,
            'font=s'      => \$fontfamily,
            'size=s'      => \$fontsize,
            'align=s'     => \$textalign,
            'paper=s'     => \$papersize,
            'linkcolor=s' => \$linkcolor,
            'help|h'      => \$help,
            'version|v'   => \$version ) or printhelp();

if ($version) { die "sep-offprint $version_number\n" }

printhelp() if $help or @ARGV < 1;

my $sourceArg = $ARGV[0];

# remove trailing slash, if any, from sourceArg:
$sourceArg =~ s{/$}{};

# derive entry name from argument:
my $entryname = $sourceArg;

# remove uppercase and spaces
$entryname =~ tr/A-Z/a-z/;
$entryname =~ tr/ /-/;

# remove /index.html if specified
$entryname =~ s{/index\.html$}{};

# remove URL prefix (everything before slash)
$entryname =~ s{.*/}{};

my $source;
if ($sourceArg =~ /^file:/) {
    $source = $sourceArg;   # file URL was specified - use local source
    # "/index.html" is appended when fetching, so drop it if the user
    # already included it in the URL.
    $source =~ s{/index\.html$}{};
}
else {
    $source = "http://plato.stanford.edu/entries/$entryname/";
}

# $source is placed on a shell command line below.  Refuse anything the
# shell could interpret (spaces, quotes, ';', '|', '$', ...) rather than
# run it; such arguments could not have been fetched correctly anyway.
if ($source =~ m{[^\w.:/~%+,=\@-]}) {
    die "Unsupported characters in source '$source'\n";
}

my $footer = $source;

# fill in defaults for everything not given on the command line
$pdf = 1 unless $pdf or $ps;
$twoup = $oneup ? 0 : 1;            # --1up wins; otherwise two-up
$fontsize   ||= "14pt";
$fontfamily ||= "Times";
$textalign  ||= "justify";
$papersize  ||= "letter";
$linkcolor  ||= "black";

if ($outfile) {
    # Strip a file extension, but only from the last path component and
    # only when something precedes the dot, so that "my.dir/out" and
    # ".hidden" keep their names.
    $outfile =~ s{(?<=[^/\\])\.[^./\\]*$}{};
}
else {
    $outfile = $entryname;
}

# create temporary directory
my $temp    = tempdir( CLEANUP => 1 );
my $current = getcwd;   # working directory from which sep-offprint is run

# Resolve the output name against the starting directory now, because the
# script is about to chdir away from it.  An absolute --output path is
# kept as it is.
require File::Spec;
my $dest_base = File::Spec->rel2abs($outfile, $current);

# get all the source files and put them in temp directory,
# then change to temp directory

chdir $temp or die "Could not change to temporary directory $temp: $!\n";

# download all the HTML files
print STDERR "Retrieving files...\n";
# lwp-rget's output (stdout and stderr together) is captured: the lines
# ending in ".html" are used below to decide the page order.
my $downloadedFiles = `lwp-rget --limit=200 $source/index.html 2>&1`;
$downloadedFiles = '' unless defined $downloadedFiles;
(-e "index.html") or die "Could not retrieve files from $source\nAre you sure you have the right entry name?\n";

# create blank html file to work around html2ps bug.
# without this blank file after notes.html, html2ps will cut off
# the last page of an entry if it occurs in the left column in 2up mode.

my $blank = "blankpage";

open(my $blank_fh, '>', $blank) or die "unable to open $blank: $!";

print {$blank_fh} <<EOF;
<html>
<head>
<title>&nbsp;</title>
</head>
<body>
<p>&nbsp;</p>
</body>
</html>
EOF

close($blank_fh) or die "unable to write $blank: $!";

# create a configuration file with appropriate footers

my $html2psrc = "html2psrc";

open(my $rc_fh, '>', $html2psrc) or die "unable to open $html2psrc: $!";

print {$rc_fh} <<EOF;
BODY {
font-size: $fontsize;
font-family: $fontfamily;
text-align: $textalign;
}
A:link {
color: $linkcolor;
}
\@page {
margin-left: 2.5cm;
margin-right: 2.5cm;
margin-top: 2.5cm;
margin-bottom: 2.5cm;
}
\@html2ps {
option {
twoup: $twoup;
landscape: $twoup;
number: 0;
}
paper { type: $papersize }
header {
right: "STANFORD ENCYCLOPEDIA OF PHILOSOPHY";
left: \$T;
}
footer {
left: \$N;
right: $footer;
}
}
EOF

close($rc_fh) or die "unable to write $html2psrc: $!";

# name of temporary file to hold postscript output of html2ps
my $pstemp = "pstemp";

# preprocess all the html files in the working (i.e., temp) directory
# ($_ is set by the loop and also passed explicitly)
for (glob '*.html') {
    preprocess_html($_);
}

#
# determine the order in which the HTML pages should be processed:
#

# every line of the download log that ends in ".html", split into
# whitespace-separated words exactly as the shell would split them
my @htmlFiles = map { split ' ' } ($downloadedFiles =~ /^.*\.html$/gim);

# notes.html, if present, always goes last
my @notes = (grep { $_ eq 'notes.html' } @htmlFiles) ? ('notes.html') : ();

# index.html goes first and notes.html last; match them by whole name so
# that files such as "endnotes.html" are left alone
my @orderedHtmlFiles = grep { $_ ne 'index.html' and $_ ne 'notes.html' } @htmlFiles;

print STDERR "Creating offprint...\n";

# call html2ps to create the postscript version of the entry
# (list form: the file names never pass through a shell)
my $html2ps_status = system('html2ps', '-D', '-U', '-f', $html2psrc, '-o', $pstemp,
                            'index.html', @orderedHtmlFiles, @notes, $blank);
warn "html2ps exited with status $html2ps_status\n" if $html2ps_status != 0;
(-e $pstemp) or die "html2ps did not produce any output\n";

# create pdf if requested
if ($pdf) {
    if (system('ps2pdf', "-sPAPERSIZE=$papersize", $pstemp, "$dest_base.pdf") == 0) {
        print "Created $outfile.pdf\n";
    }
    else {
        warn "ps2pdf failed; $outfile.pdf was not created\n";
    }
}

# copy ps file if requested
if ($ps) {
    if (copy($pstemp, "$dest_base.ps")) {
        print "Created $outfile.ps\n";
    }
    else {
        warn "Could not create $outfile.ps: $!\n";
    }
}

# note: temporary directory will be deleted automatically on exit
Something went wrong with that request. Please try again.