In [1]:
import de.l3s.archivespark._
import de.l3s.archivespark.implicits._
import de.l3s.archivespark.enrich._
import de.l3s.archivespark.enrich.functions._
import edu.harvard.countway.mhl.archivespark._
import edu.harvard.countway.mhl.archivespark.search._

In [2]:
// Search options for the MHL search: the query string "polio", with
// `collections` set to MhlCollections.Statemedicalsocietyjournals.
val query = MhlSearchOptions(
  query = "polio",
  collections = MhlCollections.Statemedicalsocietyjournals
)

In [3]:
// Load the dataset described by the search options through ArchiveSpark.
// `sc` is supplied by the notebook environment (presumably the SparkContext — confirm).
val rdd = {
  val searchSpec = MhlSearchSpec(query)
  ArchiveSpark.load(sc, searchSpec)
}

In [4]:
// Lower-case terms to look for in each record's text.
// Order is kept as written; the terms are distinct.
val symptomSet: Seq[String] = Seq(
  "headache",
  "neck",
  "back",
  "abdominal",
  "extremity",
  "fever",
  "vomiting",
  "lethargy",
  "irritability"
)

In [5]:
// Enrich function named "symptoms", derived from LowerCase: for the String it
// receives, keep every term of `symptomSet` that occurs in it as a substring.
// NOTE(review): this is plain substring matching, so e.g. "back" also matches
// inside longer words — confirm that is intended.
val symptoms = LowerCase.map("symptoms") { (lowered: String) =>
  symptomSet.filter(term => lowered.contains(term))
}

In [6]:
// Apply the `symptoms` enrich function to the loaded records.
val enriched = rdd.enrich(symptoms)

In [7]:
// Print the JSON returned by `peekJson` to inspect the enriched data
// (the recorded output below shows a single record).
println(enriched.peekJson)

{
  "record":{
    "id":"journalofmedical2319alab",
    "title":"Journal of the Medical Association of the State of Alabama",
    "author":[
      "Alabama State Board of Health",
      "Medical Association of the State of Alabama",
      "Medical Association of the State of Alabama. Board of Censors",
      "Alabama. State Department of Health. Annual report"
    ],
    "date":"1954",
    "subject":[
      "Medicine;Periodicals;Alabama"
    ],
    "language":[
      "eng"
    ],
    "mediatype":[
      "texts"
    ],
    "collection":[
      "francisacountwaylibrary",
      "statemedicalsocietyjournals",
      "medicalheritagelibrary",
      "americana"
    ],
    "description":[
      "Description based on: Vol. 1, no. 11 (May 1932); title from caption",
      "Includes reports of the State Dept. of Health and rosters of the association's members",
      "Cumulative index to nursing & allied health literature",
      "Energy research abstracts",
      "Excerpta medica",
      "Hospit

In [8]:
// Count how often each symptom term appears across the enriched records.
// Presumably `flatMapValues(symptoms)` flattens the per-record term lists into
// one collection of terms — confirm against the ArchiveSpark API. Because each
// term can be matched at most once per record, a count is then the number of
// records containing that term.
val symptomCounts = {
  val matchedTerms = enriched.flatMapValues(symptoms)
  matchedTerms.countByValue
}

In [9]:
// Print each (term, count) pair on its own line, in the map's iteration order.
for (termAndCount <- symptomCounts) println(termAndCount)

(headache,2042)
(abdominal,2042)
(neck,2039)
(vomiting,2037)
(extremity,1942)
(back,2050)
(lethargy,1529)
(fever,2047)
(irritability,1939)
