Permalink
Browse files

Start Project

Signed-off-by: John Holt <john.d.holt@lexisnexis.com>
  • Loading branch information...
1 parent 8797948 commit e796f7911eb351bc78636229d67b092e3701e5bf @johnholt johnholt committed Dec 5, 2011
Showing with 72 additions and 0 deletions.
  1. +6 −0 patentanalytics/InitialLoad/README
  2. +47 −0 patentanalytics/InitialLoad/Spray_Files.ecl
  3. +19 −0 patentanalytics/README
@@ -0,0 +1,6 @@
+InitialLoad
+
+Runnable attributes used in the initial load of the text documents
+downloaded. To run these on your cluster, you will need to change
+strings like the IP addresses and cluster names. You may want to
+change the dataset names as well.
@@ -0,0 +1,47 @@
+
+import std, std.File, std.Str;
+dirAlias := std.File.FsFilenameRecord;
+
+
+STRING landing_zone := '10.239.20.76'; // IP address of the landing zone holding the downloaded files; site-specific
+STRING directory := '/var/lib/HPCCSystems/dropzone/patent_data/'; // landing-zone directory that is listed and sprayed from
+STRING cluster := 'mythor'; // destination cluster name handed to the spray call; site-specific
+
+dirList := std.File.RemoteDirectory(landing_zone, directory, '*.xml', FALSE); // directory entries matching *.xml; NOTE(review): FALSE is presumably the recurse flag -- confirm
+// dirList := std.File.RemoteDirectory(landing_zone, directory, '*.txt', FALSE);
+// dirList := std.File.RemoteDirectory(landing_zone, directory, '*.sgm', FALSE);
+
+SprayFunction(STRING fileName, STRING internalFileName) := FUNCTION // sprays landing-zone file fileName into logical file internalFileName
+ RETURN std.File.fSprayVariable( landing_zone , directory + filename , 16000000 , // source IP, full source path, then 16000000 (presumably the max record size -- confirm)
+ , , , // three positional arguments deliberately left at their defaults
+ cluster, internalFileName , // destination cluster and destination logical file name
+ , , , // three more positional arguments left at their defaults
+ TRUE , TRUE , TRUE); // NOTE(review): presumably allowoverwrite, replicate, compress -- confirm against the fSprayVariable signature
+ // RETURN internalFileName;
+END;
+
+
+SprayResult := RECORD,MAXLENGTH(2048) // one log row per directory entry processed by doSpray
+ STRING dfuWUID; // value returned by SprayFunction for this file
+ STRING externalName; // file name as it appears in the landing-zone directory listing
+ STRING internalName; // logical dataset name the file is sprayed to
+ INTEGER size; // size taken from the directory listing entry
+END;
+
+SprayResult doSpray(dirAlias l) := TRANSFORM // builds a SprayResult row from one directory entry, calling SprayFunction for that file
+ STRING fname := l.name[1..LENGTH(l.name)-4]; // file name minus its last 4 characters (assumes a dot plus 3-letter extension such as '.xml')
+ STRING dsname := '~THOR::Patents::' + fname + '::XML'; // logical file name the file will be sprayed to
+ SELF.externalName := l.name;
+ SELF.size := l.size;
+ SELF.internalName := dsname;
+ SELF.dfuWUID := SprayFunction(l.name, dsname); // the spray is requested here; its return value is kept for the log
+END;
+
+ds1 := dirList; // alias for the directory listing that drives the spray
+
+OUTPUT(COUNT(ds1), NAMED('Files_2_Spray')); // report how many files the listing contains
+ds3 := NOTHOR(PROJECT(ds1, doSpray(LEFT))); // run doSpray over every entry; NOTE(review): NOTHOR presumably keeps the spray calls off the Thor slave nodes -- confirm
+
+// OUTPUT(ds1, NAMED('Directory_List'));
+// OUTPUT(ds3, NAMED('Spray_Log'));
+OUTPUT(ds3,,'~THOR::Patents::SPRAY_Log::' + WORKUNIT); // write the spray log to a logical file whose name ends with this workunit's id
View
@@ -0,0 +1,19 @@
+The Patent-Analytics project is a demonstration of using the HPCC Systems
+HPCC platform to build an application to provide analysis of USPTO Patent
+filings.
+
+The data was obtained by downloading the USPTO Patent Filings from the
+Google repository. See:
+ http://www.google.com/googlebooks/uspto-patents-grants-text.html
+ http://www.google.com/googlebooks/uspto-patents-grants-biblio.html
+The bibliography files are small and redundant, but they provide another
+list so that I can check for completeness.
+
+
+Optional early patents (back to 1921), estimated to be about 30 GBytes,
+data is not compressed. This is very dirty data, from an OCR of paper
+copies.
+http://www.google.com/googlebooks/uspto-patents-grants-ocr.html
+
+
+Currently using only the machine readable filings.

0 comments on commit e796f79

Please sign in to comment.