internetarchive · jkafader · Jan 14, 2020 · Jun 7, 2019 · Jun 7, 2019 · Jun 11, 2019
diff --git a/contrib/src/main/java/org/archive/crawler/reporting/XmlCrawlSummaryReport.java b/contrib/src/main/java/org/archive/crawler/reporting/XmlCrawlSummaryReport.java
@@ -6,7 +6,7 @@
 import java.util.Map;
 
 import org.archive.crawler.restlet.XmlMarshaller;
-import org.archive.modules.writer.WARCWriterProcessor;
+import org.archive.modules.writer.BaseWARCWriterProcessor;
 import org.archive.util.ArchiveUtils;
 
 public class XmlCrawlSummaryReport extends Report {
@@ -28,7 +28,7 @@ public void write(PrintWriter writer, StatisticsTracker stats) {
         CrawlStatSnapshot snapshot = stats.getLastSnapshot();
 
         info.put("crawlName", 
-                ((WARCWriterProcessor) stats.appCtx.getBean("warcWriter")).getPrefix());
+                ((BaseWARCWriterProcessor) stats.appCtx.getBean("warcWriter")).getPrefix());
         info.put("crawlJobShortName", 
                 stats.getCrawlController().getMetadata().getJobName());
         info.put("scheduledDate", this.scheduledDate);

diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java
@@ -19,11 +19,14 @@
 
 package org.archive.modules.extractor;
 
+import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO;
+
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
+import java.io.StringWriter;
+import java.net.URI;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
@@ -34,28 +37,52 @@
 
 import org.apache.commons.httpclient.URIException;
 import org.archive.crawler.reporting.CrawlerLoggerModule;
+import org.archive.format.warc.WARCConstants.WARCRecordType;
+import org.archive.io.warc.WARCRecordInfo;
 import org.archive.modules.CoreAttributeConstants;
 import org.archive.modules.CrawlURI;
+import org.archive.modules.warc.BaseWARCRecordBuilder;
+import org.archive.modules.warc.WARCRecordBuilder;
 import org.archive.net.UURI;
 import org.archive.net.UURIFactory;
 import org.archive.util.ArchiveUtils;
 import org.archive.util.MimetypeUtils;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.context.Lifecycle;
 
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
 import com.google.gson.JsonObject;
 import com.google.gson.JsonParseException;
 import com.google.gson.JsonStreamParser;
+import com.google.gson.internal.Streams;
+import com.google.gson.stream.JsonWriter;
 
 /**
  * Extracts links to media by running youtube-dl in a subprocess. Runs only on
  * html.
  *
  * <p>
+ * Also implements {@link WARCRecordBuilder} to write youtube-dl json to the
+ * warc.
+ * 
+ * <p>
+ * To use <code>ExtractorYoutubeDL</code>, add this top-level bean:
+ * 
+ * <pre>
+ * &lt;bean id="extractorYoutubeDL" class="org.archive.modules.extractor.ExtractorYoutubeDL"/&gt;
+ * </pre>
+ * 
+ * Then add <code>&lt;ref bean="extractorYoutubeDL"/&gt;</code> to the end of
+ * the fetch chain, and to the end of the warc writer chain.
+ * 
+ * <p>
  * Keeps a log of containing pages and media captured as a result of youtube-dl
  * extraction. The format of the log is as follows:
  *
- * <pre>[timestamp] [media-http-status] [media-length] [media-mimetype] [media-digest] [media-timestamp] [media-url] [annotation] [containing-page-digest] [containing-page-timestamp] [containing-page-url] [seed-url]</pre>
+ * <pre>
+ * [timestamp] [media-http-status] [media-length] [media-mimetype] [media-digest] [media-timestamp] [media-url] [annotation] [containing-page-digest] [containing-page-timestamp] [containing-page-url] [seed-url]
+ * </pre>
  *
  * <p>
  * For containing pages, all of the {@code media-*} fields have the value
@@ -71,14 +98,17 @@
  *
  * @author nlevitt
  */
-public class ExtractorYoutubeDL extends Extractor implements Lifecycle {
+public class ExtractorYoutubeDL extends Extractor
+        implements Lifecycle, WARCRecordBuilder {
     private static Logger logger =
             Logger.getLogger(ExtractorYoutubeDL.class.getName());
 
     protected static final String YDL_CONTAINING_PAGE_DIGEST = "ydl-containing-page-digest";
     protected static final String YDL_CONTAINING_PAGE_TIMESTAMP = "ydl-containing-page-timestamp";
     protected static final String YDL_CONTAINING_PAGE_URI = "ydl-containing-page-uri";
 
+    protected static final int MAX_VIDEOS_PER_PAGE = 1000;
+
     protected transient Logger ydlLogger = null;
 
     protected CrawlerLoggerModule crawlerLoggerModule;
@@ -137,15 +167,21 @@ protected void extract(CrawlURI uri) {
                 logCapturedVideo(uri, ydlAnnotation);
             }
         } else {
-            List<JsonObject> ydlJsons = runYoutubeDL(uri);
-            if (ydlJsons != null && !ydlJsons.isEmpty()) {
-                for (JsonObject json: ydlJsons) {
+            JsonObject ydlJson = runYoutubeDL(uri);
+            if (ydlJson != null && ydlJson.has("entries")) {
+                JsonArray jsonEntries = ydlJson.getAsJsonArray("entries");
+                for (JsonElement e: jsonEntries) {
+                    JsonObject json = (JsonObject) e;
                     if (json.get("url") != null) {
                         String videoUrl = json.get("url").getAsString();
                         addVideoOutlink(uri, json, videoUrl);
                     }
                 }
-                String annotation = "youtube-dl:" + ydlJsons.size();
+
+                // XXX this can be large, consider using a RecordingOutputStream
+                uri.getData().put("ydlJson", ydlJson);
+
+                String annotation = "youtube-dl:" + jsonEntries.size();
                 uri.getAnnotations().add(annotation);
                 logContainingPage(uri, annotation);
             }
@@ -290,13 +326,7 @@ public String call() throws IOException {
         return output;
     }
 
-    /**
-     *
-     * @param uri
-     * @return list of json blobs returned by {@code youtube-dl --dump-json}, or
-     *         empty list if no videos found, or failure
-     */
-    protected List<JsonObject> runYoutubeDL(CrawlURI uri) {
+    protected JsonObject runYoutubeDL(CrawlURI uri) {
         /*
          * --format=best
          *
@@ -305,7 +335,8 @@ protected List<JsonObject> runYoutubeDL(CrawlURI uri) {
          * https://github.com/ytdl-org/youtube-dl/blob/master/README.md#format-selection
          */
         ProcessBuilder pb = new ProcessBuilder("youtube-dl", "--ignore-config",
-                "--simulate", "--dump-json", "--format=best", uri.toString());
+                "--simulate", "--dump-single-json", "--format=best",
+                "--playlist-end=" + MAX_VIDEOS_PER_PAGE, uri.toString());
         logger.fine("running " + pb.command());
 
         Process proc = null;
@@ -344,11 +375,11 @@ protected List<JsonObject> runYoutubeDL(CrawlURI uri) {
             proc.destroyForcibly();
         }
 
-        List<JsonObject> ydlJsons = new ArrayList<JsonObject>();
         JsonStreamParser parser = new JsonStreamParser(output.stdout);
+        JsonObject ydlJson = null;
         try {
-            while (parser.hasNext()) {
-                ydlJsons.add((JsonObject) parser.next());
+            if (parser.hasNext()) {
+                ydlJson  = (JsonObject) parser.next();
             }
         } catch (JsonParseException e) {
             // sometimes we get no output at all from youtube-dl, which
@@ -361,7 +392,7 @@ protected List<JsonObject> runYoutubeDL(CrawlURI uri) {
             return null;
         }
 
-        return ydlJsons;
+        return ydlJson;
     }
 
     @Override
@@ -401,4 +432,40 @@ protected boolean shouldExtract(CrawlURI uri) {
 
         return false;
     }
+
+    @Override // NOTE(review): presumably the WARCRecordBuilder gate for buildRecord()
+    public boolean shouldBuildRecord(CrawlURI curi) {
+        return curi.containsDataKey("ydlJson"); // key is put by extract()
+    }
+
+    /**
+     * Wraps the youtube-dl json stored on the uri (data key "ydlJson") in a
+     * WARC metadata record; a non-null concurrentTo is added as extra header.
+     */
+    @Override
+    public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
+            throws IOException {
+        // render the json up front; its byte count is set as content length
+        JsonObject storedJson = (JsonObject) curi.getData().get("ydlJson");
+        StringWriter rendered = new StringWriter();
+        JsonWriter indenting = new JsonWriter(rendered);
+        indenting.setIndent(" ");
+        Streams.write(storedJson, indenting);
+        byte[] payload = rendered.toString().getBytes("UTF-8");
+
+        WARCRecordInfo record = new WARCRecordInfo();
+        record.setType(WARCRecordType.metadata);
+        record.setRecordId(BaseWARCRecordBuilder.generateRecordID());
+        if (concurrentTo != null) {
+            String bracketed = "<" + concurrentTo + ">";
+            record.addExtraHeader(HEADER_KEY_CONCURRENT_TO, bracketed);
+        }
+        record.setUrl("youtube-dl:" + curi);
+        record.setCreate14DigitDate(ArchiveUtils.getLog14Date(curi.getFetchBeginTime()));
+        record.setMimetype("application/vnd.youtube-dl_formats+json;charset=utf-8");
+        record.setEnforceLength(true);
+        record.setContentStream(new ByteArrayInputStream(payload));
+        record.setContentLength((long) payload.length);
+        return record;
+    }
 }
diff --git a/contrib/src/main/java/org/archive/modules/postprocessor/WARCLimitEnforcer.java b/contrib/src/main/java/org/archive/modules/postprocessor/WARCLimitEnforcer.java
@@ -18,17 +18,18 @@
  */
 package org.archive.modules.postprocessor;
 
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.logging.Logger;
-import java.util.ArrayList;
-import java.util.List;
+
 import org.archive.crawler.framework.CrawlController;
 import org.archive.crawler.framework.CrawlStatus;
 import org.archive.modules.CrawlURI;
 import org.archive.modules.Processor;
-import org.archive.modules.writer.WARCWriterProcessor;
+import org.archive.modules.writer.BaseWARCWriterProcessor;
 import org.springframework.beans.factory.annotation.Autowired;
 
 public class WARCLimitEnforcer extends Processor {
@@ -38,7 +39,7 @@ public class WARCLimitEnforcer extends Processor {
 
     protected Map<String, Map<String, Long>> limits = new HashMap<String, Map<String, Long>>();
     /**
-     * Should match structure of {@link WARCWriterProcessor#getStats()}
+     * Should match structure of {@link BaseWARCWriterProcessor#getStats()}
      * @param limits
      */
     public void setLimits(Map<String, Map<String, Long>> limits) {
@@ -48,23 +49,23 @@ public Map<String, Map<String, Long>> getLimits() {
         return limits;
     }
 
-    protected WARCWriterProcessor warcWriter;
+    protected BaseWARCWriterProcessor warcWriter;
     @Autowired
-    public void setWarcWriter(WARCWriterProcessor warcWriter) {
+    public void setWarcWriter(BaseWARCWriterProcessor warcWriter) {
         this.warcWriter = warcWriter;
     }
-    public WARCWriterProcessor getWarcWriter() {
+    public BaseWARCWriterProcessor getWarcWriter() {
         return warcWriter;
     }
 
     {
-        setWarcWriters(new ArrayList<WARCWriterProcessor>());
+        setWarcWriters(new ArrayList<BaseWARCWriterProcessor>());
     }
     @SuppressWarnings("unchecked")
-    public List<WARCWriterProcessor> getWarcWriters() {
-        return (List<WARCWriterProcessor>) kp.get("warcWriters");
+    public List<BaseWARCWriterProcessor> getWarcWriters() {
+        return (List<BaseWARCWriterProcessor>) kp.get("warcWriters");
     }
-    public void setWarcWriters(List<WARCWriterProcessor> warcWriters) {
+    public void setWarcWriters(List<BaseWARCWriterProcessor> warcWriters) {
         kp.put("warcWriters", warcWriters);
     }
 
@@ -91,7 +92,7 @@ protected void innerProcess(CrawlURI uri) throws InterruptedException {
                 AtomicLong value = null;
                 if(getWarcWriters() !=null && getWarcWriters().size()>0) {
                     value = new AtomicLong(0);
-                    for (WARCWriterProcessor w: getWarcWriters()) {
+                    for (BaseWARCWriterProcessor w: getWarcWriters()) {
                         Map<String, AtomicLong> valueBucket = w.getStats().get(j);
                         if(valueBucket != null) {
                             value.set(value.addAndGet(valueBucket.get(k).get()));

diff --git a/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java b/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java
@@ -17,7 +17,7 @@
 
 import org.archive.crawler.event.CrawlStateEvent;
 import org.archive.modules.CrawlURI;
-import org.archive.modules.writer.WARCWriterProcessor;
+import org.archive.modules.writer.WARCWriterChainProcessor;
 import org.archive.spring.HasKeyedProperties;
 import org.archive.spring.KeyedProperties;
 import org.archive.trough.TroughClient;
@@ -31,7 +31,7 @@
  * <p>To use, define a {@code TroughContentDigestHistory} top-level bean in your
  * crawler-beans.cxml, then add {@link ContentDigestHistoryLoader} and
  * {@link ContentDigestHistoryStorer} to your fetch chain, sandwiching the
- * {@link WARCWriterProcessor}. In other words, follow the directions at
+ * {@link WARCWriterChainProcessor}. In other words, follow the directions at
  * <a href="https://github.com/internetarchive/heritrix3/wiki/Duplication%20Reduction%20Processors">https://github.com/internetarchive/heritrix3/wiki/Duplication%20Reduction%20Processors</a>
  * but replace the {@link BdbContentDigestHistory} bean with a
  * {@code TroughContentDigestHistory} bean.

diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
@@ -334,7 +334,7 @@ http://example.example/example
 
  <!-- DISPOSITION CHAIN -->
  <!-- first, processors are declared as top-level named beans  -->
- <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
+ <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterChainProcessor">
   <!-- <property name="compress" value="true" /> -->
   <!-- <property name="prefix" value="IAH" /> -->
   <!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
@@ -349,11 +349,21 @@ http://example.example/example
         </list>
        </property> -->
   <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
-  <!-- <property name="writeRequests" value="true" /> -->
-  <!-- <property name="writeMetadata" value="true" /> -->
-  <!-- <property name="writeRevisitForIdenticalDigests" value="true" /> -->
-  <!-- <property name="writeRevisitForNotModified" value="true" /> -->
   <!-- <property name="startNewFilesOnCheckpoint" value="true" /> -->
+  <!--
+  <property name="chain">
+   <list>
+    <bean class="org.archive.modules.warc.DnsResponseRecordBuilder"/>
+    <bean class="org.archive.modules.warc.HttpResponseRecordBuilder"/>
+    <bean class="org.archive.modules.warc.WhoisResponseRecordBuilder"/>
+    <bean class="org.archive.modules.warc.FtpControlConversationRecordBuilder"/>
+    <bean class="org.archive.modules.warc.FtpResponseRecordBuilder"/>
+    <bean class="org.archive.modules.warc.RevisitRecordBuilder"/>
+    <bean class="org.archive.modules.warc.HttpRequestRecordBuilder"/>
+    <bean class="org.archive.modules.warc.MetadataRecordBuilder"/>
+   </list>
+  </property>
+  -->
  </bean>
  <bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">
   <!-- <property name="seedsRedirectNewSeeds" value="true" /> -->

diff --git a/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java b/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java
@@ -25,7 +25,7 @@ public class StatisticsSelfTest extends SelfTestBase {
 
     @Override
     protected String changeGlobalConfig(String config) {
-        String warcWriterConfig = " <bean id='warcWriter' class='org.archive.modules.writer.WARCWriterProcessor'/>\n";
+        String warcWriterConfig = " <bean id='warcWriter' class='org.archive.modules.writer.WARCWriterChainProcessor'/>\n";
         config = config.replace("<!--@@MORE_EXTRACTORS@@-->", warcWriterConfig);
         return super.changeGlobalConfig(config);
     }