In [ ]:
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._

import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

//import org.apache.spark.mllib.feature.{HashingTF, IDF}
//import org.apache.spark.mllib.feature.HashingTF._
//import org.apache.spark.mllib.feature.IDF._

//import org.apache.spark.mllib.linalg.Vectors
//import org.apache.spark.mllib.linalg.Vector
//import org.apache.spark.sql.Row

//import org.apache.spark.sql.SQLContext // test

import scala.collection.JavaConversions._
import scala.util.control.Breaks._
import java.io._
import java.nio.file.{Files, Path, Paths}

//import opennlp.tools.langdetect._ // custom implementation
import opennlp.tools.lemmatizer.DictionaryLemmatizer
import opennlp.tools.postag.{POSModel, POSTaggerME}
import opennlp.tools.tokenize.{TokenizerME, TokenizerModel}

object Helpers {
    def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B =
        try {
            f(resource)
        } finally {
            resource.close()
        }
}

import Helpers._

val de : String = "de"
val en : String = "en"
val fr : String = "fr"
val langs : List[String] = List(de, en, fr)

// since sparkContext is unavailable as var here, I can't just use it like in "Spark Dataset 101", "Spark 101"
// could be related to customDepts bug, because I added opennlp dependency:
// https://github.com/spark-notebook/spark-notebook/issues/563

val spark = SparkSession
  .builder()
  .appName("words")
  .config("spark.driver.allowMultipleContexts", "true")
  .master("local")
  .getOrCreate()
val sparkContext = spark.sparkContext

/*
Before I googled Apache OpenNLP, I implemented custom language recognizer based on -stopwords.txt.
Since some external libs are using dictionary approach anyway (https://github.com/optimaize/language-detector):
stopwords are commonly found in the speech,
stopwords dictionary is relatively small and stopwords of 3 langs provided differ a lot.
*/
def detectLang(line : String, dicts : scala.collection.mutable.Map[String, List[String]]) : Option[String] = {
    val langs = line.split(" ").flatMap(item => dicts.filter(_._2.exists(_.equalsIgnoreCase(item))).map(_._1))
                    .groupBy(f => f)
                    .map(g => (g._1, g._2.size))
    if(langs.isEmpty) None
    else Some(langs.maxBy(_._2)._1)
}

val stopwordsPerLang: scala.collection.mutable.Map[String, List[String]] = scala.collection.mutable.Map.empty 
// stopwords per each lang, are all stored in RAM, because used for all supported languages detection & they're relatively small, quick to read.  

for(lang <- langs) { // main loop. TODO: refactor into function and call it async
  using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-stopwords.txt")) { source => 
    for (line <- source.getLines) {
      val list = stopwordsPerLang.getOrElse(lang, List.empty)
      stopwordsPerLang.update(lang, list:+line)  
    }
  }
}

val textfilesPathsPerLang: scala.collection.mutable.Map[String, List[String]] = scala.collection.mutable.Map.empty
val textfiles = Files.newDirectoryStream(Paths.get("notebooks/words/text-data"))
                     .filter(_.getFileName.toString.endsWith(".txt"))

for(filePath <- textfiles) {
  val file = filePath.toString
  using(scala.io.Source.fromFile(file)) { source => 
    val firstLine = source.getLines.next() // get single first line & detect language with it, TODO: better to use a few random lines in the middle of the text
    detectLang(firstLine, stopwordsPerLang) match  {
      case Some(lang) => {
        var list = textfilesPathsPerLang.getOrElse(lang, List.empty)
        textfilesPathsPerLang.update(lang, list:+file)        
      }
      case None => println("Language was not detected for file: $file")
    }                                              
  }
}

def removeTextNoise(text:String) : String = {
  val removedNumbers = text.filter(!_.isDigit)
  // https://stackoverflow.com/questions/30074109/removing-punctuation-marks-form-text-in-scala-spark
  val removedWordsOfSizeLessEqual2AndPunctuation = removedNumbers.replaceAll("""([\p{Punct}]|\b\p{IsLetter}{1,2}\b)\s*""", " ")
  // https://stackoverflow.com/questions/6198986/how-can-i-replace-non-printable-unicode-characters-in-java
  val removedUnicodes = removedWordsOfSizeLessEqual2AndPunctuation.replaceAll("""[\p{C}]""", " ")
  val replacedEscapeSeqWithSpace =  removedUnicodes.replaceAll("""[\t\n\r\f\v]""", " ")
  replacedEscapeSeqWithSpace
}

class OpenNLP(val tokenizerModel: TokenizerModel, val posModel : POSModel, val lemmatizer : DictionaryLemmatizer) {
  def this(lang:String) = this(OpenNLP.loadTokenizerModel(lang), OpenNLP.loadPOSModel(lang), OpenNLP.loadLemmatizer(lang))

  val tokenizer = new TokenizerME(tokenizerModel)
  val posTagger = new POSTaggerME(posModel)

  def tokenize(text: String): Seq[String] = {
    val positions = tokenizer.tokenizePos(text)
    val strings = positions.map {
      pos => text.substring(pos.getStart, pos.getEnd)
    }
    strings.filter(_.length > 1).map(s => s.toLowerCase) // additional cleanup after regexps & to lower case
  }
  
  def lemmatize(tokens:Seq[String]): Seq[String] = {
    val tags = posTagger.tag(tokens.toArray)
    val lemmas = lemmatizer.lemmatize(tokens.toArray, tags)
    lemmas.toSeq
  }
}

object OpenNLP {
  def loadTokenizerModel(lang:String): TokenizerModel = {
    using(new FileInputStream(s"notebooks/words/vocabs/$lang-token.bin")) { stream =>
      new TokenizerModel(stream)
    }
  }
  
  def loadPOSModel(lang:String): POSModel = {
    using(new FileInputStream(s"notebooks/words/vocabs/$lang-pos-maxent.bin")) { stream =>
      new POSModel(stream)
    }
  }
  
  def loadLemmatizer(lang:String): DictionaryLemmatizer = {
    using(new FileInputStream(s"notebooks/words/vocabs/$lang-lemmatizer.txt")) { stream =>
      new DictionaryLemmatizer(stream)
    }
  }
}

def removeStopWords(lang: String, tokens:Seq[String]) : Seq[String] = {
   tokens.filter(!stopwordsPerLang(lang).contains(_))
}

for ((lang,textsPaths) <- textfilesPathsPerLang) { // main loop. TODO: refactor into function and call it async
   //val lemmas = readLemmas(lang)
  val nlp = new OpenNLP(lang)
  
  for (path <- textsPaths) {
    using(scala.io.Source.fromFile(path)) { source => 
      val text = source.getLines.mkString
      val unnoisedText = removeTextNoise(text)                                      
      val tokens = nlp.tokenize(unnoisedText)
      val tokensExcludeStopWords = removeStopWords(lang, tokens)
      val df = spark.createDataFrame(Seq((0, tokensExcludeStopWords.toArray))).toDF("id", "words")
                         
      val model: CountVectorizerModel = new CountVectorizer()
          .setInputCol("words")
          .setOutputCol("features")
          .fit(df)
                                           
      for(item <- model.vocabulary.take(30)){
        print(item) 
        print(" ")
      }
      
    }
    break; // test single file
  }  
}

leverage money investment property times old can investments made leveraged financial owe like one way make land risky loaned lots find say market put might meltdown worth fashioned good close scala.util.control.BreakControl
