In [ ]:
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
//import scala.io.Source
import scala.collection.JavaConversions._
import scala.util.control.Breaks._
import java.nio.file.{Files, Path, Paths}

/** Small resource-handling helpers for this script. */
object Helpers {
  // `resource.close()` below goes through a structural type, which the compiler
  // implements as a reflective call; enabling the language feature explicitly
  // avoids the feature warning at the call site.
  import scala.language.reflectiveCalls

  /**
    * Applies `f` to `resource` and closes the resource afterwards, whether `f`
    * returns normally or throws.
    *
    * @param resource any value exposing a `close(): Unit` method
    * @param f        the computation to run against the open resource
    * @tparam A       the resource type
    * @tparam B       the result type of `f`
    * @return the value produced by `f`
    */
  def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B =
    try f(resource)
    finally resource.close()
}

import Helpers._

// Language codes handled by this script; each one is substituted into the
// vocabulary file names (`<code>-lemmatizer.txt`, `<code>-stopwords.txt`).
val en: String = "en"
val de: String = "de"
val fr: String = "fr"
val langs: List[String] = en :: de :: fr :: Nil

/** One row of a lemmatizer vocabulary file: the first three tab-separated fields of a line. */
case class Lemma(entry: String, default: String, pos: String)

object Lemma {
  /**
    * Parses a tab-separated line into a [[Lemma]].
    *
    * The first three fields become `entry`, `default` and `pos`; any further
    * fields are ignored.
    *
    * @param line a single line of a vocabulary file
    * @return `Some(lemma)` when the line splits into at least three fields, `None` otherwise
    */
  def parse(line: String): Option[Lemma] = {
    val fields = line.split("\t")
    if (fields.length >= 3) Some(Lemma(fields(0), fields(1), fields(2)))
    else None
  }
}

// Vocabulary loaded per language code: parsed lemmatizer rows and raw stopword lines.
val lemmasPerLang: scala.collection.mutable.Map[String, List[Lemma]] = scala.collection.mutable.Map.empty
val stopwordsPerLang: scala.collection.mutable.Map[String, List[String]] = scala.collection.mutable.Map.empty

for (lang <- langs) {
  // Each file is read into a list once and stored with a single map update.
  // Appending line by line with `:+` copies the whole list on every line,
  // which is quadratic in the number of lines.
  using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-lemmatizer.txt")) { source =>
    // Lines that do not parse into a Lemma are skipped.
    // The list must be materialised here, before `using` closes the source.
    val parsed = source.getLines().flatMap(line => Lemma.parse(line).toList).toList
    // An entry is only created when at least one lemma was read.
    if (parsed.nonEmpty)
      lemmasPerLang.update(lang, lemmasPerLang.getOrElse(lang, List.empty) ++ parsed)
  }
  using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-stopwords.txt")) { source =>
    val words = source.getLines().toList
    // An entry is only created when the file has at least one line.
    if (words.nonEmpty)
      stopwordsPerLang.update(lang, stopwordsPerLang.getOrElse(lang, List.empty) ++ words)
  }
}

// Print the language codes for which lemmas were loaded.
lemmasPerLang.keys.foreach(key => println(key))
//for (item <- stopwordsPerLang.keys ) println(item)

/**
  * Guesses the language of `line` by matching its words against the loaded lemmatizer entries.
  *
  * The line is split on single spaces. Every word casts one vote for each
  * language in `lemmasPerLang` that has an entry equal to the word (ignoring
  * case), and the language with the most votes is returned.
  *
  * TODO: Apache OpenNLP or Apache Tika
  *
  * @param line     the text to classify
  * @param fallback value returned when no word of `line` matches any lemma
  *                 (for example an empty line); previously this case threw
  *                 from `maxBy` on an empty collection
  * @return the key of `lemmasPerLang` with the most matching words, or `fallback`
  */
def detectLang(line: String, fallback: String = "unknown"): String = {
    val votes = line.split(" ").flatMap { word =>
        lemmasPerLang
            .filter { case (_, lemmas) => lemmas.exists(_.entry.equalsIgnoreCase(word)) }
            .map { case (lang, _) => lang }
    }
    // `maxBy` is undefined on an empty collection, so that case is handled separately.
    if (votes.isEmpty) fallback
    else
        votes.groupBy(lang => lang)
             .map { case (lang, hits) => (lang, hits.size) }
             .maxBy(_._2)
             ._1
}

// Paths of the text files, grouped by the language detected for each file.
val textfilesPerLang: scala.collection.mutable.Map[String, List[String]] = scala.collection.mutable.Map.empty

// All `.txt` files directly inside the data directory. The listing is copied
// into a List and the DirectoryStream is closed right away; an open
// DirectoryStream holds an operating-system directory handle.
val textfiles: List[Path] = {
  import scala.collection.JavaConverters._
  val stream = Files.newDirectoryStream(Paths.get("notebooks/words/text-data"))
  try stream.asScala.filter(_.getFileName.toString.endsWith(".txt")).toList
  finally stream.close()
}

for (file <- textfiles) {
  using(scala.io.Source.fromFile(file.toString)) { source =>
    val lines = source.getLines()
    // Only the first line is used for detection.
    // TODO: better to use a few random lines in the middle of the text
    // Empty files have no first line and are skipped rather than failing on `next()`.
    if (lines.hasNext) {
      val lang: String = detectLang(lines.next())
      val known = textfilesPerLang.getOrElse(lang, List.empty)
      textfilesPerLang.update(lang, known :+ file.toString)
    }
  }
}

// Result expression of the notebook cell.
textfilesPerLang


