In [ ]:
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import scala.collection.JavaConversions._
import scala.util.control.Breaks._
import java.nio.file.{Files, Path, Paths}

object Helpers {
    /**
     * Loan pattern: hands `resource` to `f` and closes it afterwards.
     *
     * `close()` is invoked whether `f` returns normally or throws; an exception
     * raised by `f` still propagates to the caller after the resource is closed.
     *
     * @param resource any value exposing a `close(): Unit` method (structural type)
     * @param f        the computation that uses the resource
     * @return whatever `f` returns
     */
    def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B = {
        try f(resource)
        finally resource.close()
    }
}

import Helpers._

// Codes of the supported languages; the same codes are interpolated into the
// vocabulary file names read further down in this file.
val de: String = "de"
val en: String = "en"
val fr: String = "fr"
// Every supported language, in the order the vocab files are loaded.
val langs: List[String] = de :: en :: fr :: Nil

/**
 * Guesses the language of `line` by counting how many of its words appear in
 * each language's word list.
 *
 * The line is tokenised on any run of whitespace (spaces, tabs, ...) and empty
 * tokens are ignored. Each token is compared case-insensitively against every
 * word list; every language whose list contains the token scores one hit.
 *
 * @param line  the text sample to classify
 * @param dicts language code -> list of known words for that language
 * @return the language with the most hits, or `None` when no token matched
 *         any list. When several languages tie, which one wins depends on
 *         map iteration order.
 */
def detectLang(line : String, dicts : scala.collection.mutable.Map[String, List[String]]) : Option[String] = {
    // Split on whitespace runs rather than a single " " so tabs and repeated
    // spaces neither hide words nor yield empty tokens.
    val tokens = line.split("\\s+").filter(_.nonEmpty)
    val hits = tokens.flatMap { token =>
        dicts.collect { case (lang, words) if words.exists(_.equalsIgnoreCase(token)) => lang }
    }
    val hitsPerLang = hits.groupBy(identity).map { case (lang, occurrences) => (lang, occurrences.size) }
    if (hitsPerLang.isEmpty) None
    else Some(hitsPerLang.maxBy(_._2)._1)
}

// Stopwords for every supported language are kept in RAM: they are needed to
// detect the language of each text, and the lists are small and quick to read.
val stopwordsPerLang: scala.collection.mutable.Map[String, List[String]] = scala.collection.mutable.Map.empty

for(lang <- langs) { // main loop. TODO: refactor into function and call it async
  using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-stopwords.txt")) { source =>
    // Read the file in a single pass instead of appending to an immutable List
    // once per line (each `:+` copies the whole list, making the load quadratic).
    val words = source.getLines.toList
    // An empty file adds no entry, exactly as the per-line version behaved.
    if (words.nonEmpty) {
      stopwordsPerLang.update(lang, stopwordsPerLang.getOrElse(lang, List.empty) ++ words)
    }
  }
}

// Language code -> paths of the text files detected as written in that language.
val textfilesPathsPerLang: scala.collection.mutable.Map[String, List[String]] = scala.collection.mutable.Map.empty

// All *.txt files of the data directory. The directory stream holds an open
// handle, so the matching paths are copied into a List and the stream is closed
// right away (plain try/finally: a statically typed close() call, no reflection).
val textfiles = {
  val dirStream = Files.newDirectoryStream(Paths.get("notebooks/words/text-data"))
  try dirStream.filter(_.getFileName.toString.endsWith(".txt")).toList
  finally dirStream.close()
}

for(filePath <- textfiles) {
  val file = filePath.toString
  using(scala.io.Source.fromFile(file)) { source =>
    // Get the single first line & detect the language with it.
    // TODO: better to use a few random lines in the middle of the text
    val lines = source.getLines
    // An empty file has no first line; treat it as an empty sample so it is
    // reported as "not detected" instead of throwing NoSuchElementException.
    val firstLine = if (lines.hasNext) lines.next() else ""
    detectLang(firstLine, stopwordsPerLang) match  {
      case Some(lang) =>
        val list = textfilesPathsPerLang.getOrElse(lang, List.empty)
        textfilesPathsPerLang.update(lang, list:+file)
      // `s` interpolator is required, otherwise the literal "$file" is printed.
      case None => println(s"Language was not detected for file: $file")
    }
  }
}

/** A lemmatizer vocabulary row, built from the first three tab-separated fields of a line. */
case class Lemma(entry:String, default:String, pos:String)
object Lemma {
  /**
   * Parses one tab-separated line into a [[Lemma]].
   *
   * @param line a raw line; fields are separated by a tab character
   * @return `Some(Lemma)` built from the first three fields (any further fields
   *         are ignored), or `None` when the line has fewer than three fields
   */
  def parse(line:String): Option[Lemma] = {
    val fields = line.split("\t")
    if (fields.length < 3) None
    else Some(Lemma(fields(0), fields(1), fields(2)))
  }
}

/**
 * Strips noise from raw text, in this order:
 *  1. removes every digit character;
 *  2. removes punctuation (except '.') and whole words of one or two letters,
 *     together with any whitespace that directly follows the removed item;
 *  3. replaces tab / newline / carriage return / form feed / vertical
 *     whitespace with a single space;
 *  4. removes the remaining characters of Unicode category C (control, format, ...).
 *
 * @param text the raw text
 * @return the cleaned text
 */
def removeTextNoise(text:String) : String = {
  val removedNumbers = text.filter(!_.isDigit)
  // https://stackoverflow.com/questions/30074109/removing-punctuation-marks-form-text-in-scala-spark
  // NOTE(review): the trailing \s* also eats the whitespace after a removed
  // punctuation mark, so "a, b"-style input loses its separator - confirm intended.
  val removedWordsOfSizeLessEqual2AndPunctuation = removedNumbers.replaceAll("""([\p{Punct}&&[^.]]|\b\p{IsLetter}{1,2}\b)\s*""", "")
  // This step must run BEFORE the \p{C} strip below: tab, newline, CR and FF are
  // themselves control characters, so stripping \p{C} first would delete them
  // (gluing the neighbouring words together) instead of turning them into spaces.
  val replacedEscapeSeqWithSpace = removedWordsOfSizeLessEqual2AndPunctuation.replaceAll("""[\t\n\r\f\v]""", " ")
  // https://stackoverflow.com/questions/6198986/how-can-i-replace-non-printable-unicode-characters-in-java
  val removedUnicodes = replacedEscapeSeqWithSpace.replaceAll("""[\p{C}]""", "")
  removedUnicodes
}

/**
 * Placeholder for stop-word removal: not implemented yet.
 *
 * @return always the empty string
 */
def removeStopWords() : String = ""

// For every detected language: load its lemmatizer vocabulary, then read each
// text file that was classified as that language.
for ((lang,textsPaths) <- textfilesPathsPerLang) { // main loop. TODO: refactor into function and call it async

  // Parse the lemmatizer vocabulary of this language. Lines that Lemma.parse
  // rejects are skipped. The iterator is fully consumed (toList) inside `using`,
  // i.e. before the source is closed.
  val lemmas : List[Lemma] =
    using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-lemmatizer.txt")) { source =>
      source.getLines.flatMap(line => Lemma.parse(line)).toList
    }

  for (path <- textsPaths) {
    using(scala.io.Source.fromFile(path)) { source =>
      // getLines strips the line terminators; join with a space so the last word
      // of one line is not fused with the first word of the next.
      // TODO: `text` and `lemmas` are not consumed yet - the processing step is still missing.
      val text = source.getLines.mkString(" ")
    }
  }
}

import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import scala.collection.JavaConversions._
import scala.util.control.Breaks._
import java.nio.file.{Files, Path, Paths}
defined object Helpers
import Helpers._
de: String = de
en: String = en
fr: String = fr
langs: List[String] = List(de, en, fr)
detectLang: (line: String, dicts: scala.collection.mutable.Map[String,List[String]])Option[String]
stopwordsPerLang: scala.collection.mutable.Map[String,List[String]] = Map(en -> List(a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can't, cannot, could, couldn't, did, didn't, do, does, doesn't, doing, don't, down, during, each, few, for, from, further, ...