In [ ]:
import scala.collection.JavaConversions._
import scala.util.control.Breaks._
import scala.concurrent._
import ExecutionContext.Implicits.global

import java.io._
import java.nio.file.{Files, Path, Paths}

import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}

//import opennlp.tools.langdetect._ // custom implementation
import opennlp.tools.lemmatizer.DictionaryLemmatizer
import opennlp.tools.postag.{POSModel, POSTaggerME}
import opennlp.tools.tokenize.{TokenizerME, TokenizerModel}

object Helpers {
    import scala.language.reflectiveCalls
    import scala.util.control.NonFatal

    /**
     * Loan pattern: runs `f` on `resource` and always closes the resource afterwards.
     *
     * Any non-fatal failure raised by `f` is rethrown as `Exception("file exception")`
     * with the original failure attached as the cause, so the root problem stays visible
     * in the stack trace. Fatal errors and control-flow throwables (for example the one
     * used by `scala.util.control.Breaks`) are NOT wrapped and propagate unchanged.
     *
     * @tparam A any type exposing `close(): Unit` (structural type, invoked reflectively)
     * @tparam B result type of `f`
     * @param resource the resource to lend to `f`; closed in all cases
     * @param f the computation using the resource
     * @return whatever `f` returns
     * @throws Exception with message "file exception" when `f` fails non-fatally
     */
    def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B =
        try {
            f(resource)
        } catch {
            // NonFatal lets OutOfMemoryError, InterruptedException, ControlThrowable etc. through
            case NonFatal(cause) => throw new Exception("file exception", cause)
        } finally {
            resource.close()
        }
}

import Helpers._
// Mutable map from a language code to a list of strings (stopwords or file paths).
type MapLangToStrings = scala.collection.mutable.Map[String, List[String]]

// A ready-made `sparkContext` value is not available in this notebook (unlike in the
// "Spark Dataset 101" / "Spark 101" examples), so a session is built explicitly here.
// This may be related to the customDeps bug triggered by adding the opennlp dependency:
// https://github.com/spark-notebook/spark-notebook/issues/563
val spark = {
  val sessionBuilder = SparkSession.builder()
  sessionBuilder
    .master("local")
    .appName("words")
    .config("spark.driver.allowMultipleContexts", "true")
    .getOrCreate()
}
val sparkContext = spark.sparkContext

/** Factory helpers that load the default inputs of the [[NLP]] pipeline from the notebook folders. */
object NLP {
  /** Language codes supported by the pipeline. */
  def getLangs : Seq[String] = Seq("de", "en", "fr")

  /**
   * Reads `notebooks/words/vocabs/<lang>-stopwords.txt` for every requested language.
   *
   * All stopword lists are kept in memory: they are used for language detection across
   * every supported language and are relatively small and quick to read.
   *
   * @param langs language codes to load
   * @return map from language code to the lines of its stopword file, in file order;
   *         a language whose file is empty maps to an empty list
   * @throws Exception when a stopword file cannot be opened or read (raised by `using`)
   */
  def getStopwordsPerLang(langs : Seq[String]) : MapLangToStrings = {
    val stopwordsPerLang: MapLangToStrings = scala.collection.mutable.Map.empty

    for (lang <- langs) {
      // Materialise the lines while the source is still open; getLines is lazy.
      val stopwords = using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-stopwords.txt")) { source =>
        source.getLines.toList
      }
      // One concatenation per language instead of one O(n) append per line.
      stopwordsPerLang.update(lang, stopwordsPerLang.getOrElse(lang, List.empty) ++ stopwords)
    }
    stopwordsPerLang
  }

  /**
   * Lists the `.txt` files directly inside `notebooks/words/text-data`.
   *
   * @return path strings of the matching directory entries
   */
  def getFilesPaths : Seq[String] = {
    import scala.collection.JavaConverters._

    val stream = Files.newDirectoryStream(Paths.get("notebooks/words/text-data"))
    try {
      stream.iterator().asScala
        .filter(_.getFileName.toString.endsWith(".txt"))
        .map(_.toString)
        .toList // force the traversal before the directory stream is closed
    } finally {
      stream.close() // a DirectoryStream holds an open directory handle until closed
    }
  }
}

/**
 * Text pipeline: groups text files by detected language, cleans / tokenizes / lemmatizes
 * their content with OpenNLP and prints TF-IDF features computed with Spark ML.
 *
 * @param stopwordsPerLang stopwords per language code; used both for language detection
 *                         and for stopword removal
 * @param textfilesPaths   paths of the text files to process
 */
class NLP(val stopwordsPerLang: MapLangToStrings, val textfilesPaths: Seq[String]) {
  /** Builds the pipeline from the default stopword lists and text files of the notebook. */
  def this() = this(NLP.getStopwordsPerLang(NLP.getLangs), NLP.getFilesPaths)

  /**
   * Runs the pipeline. Currently stops after the first processed file ("test single file").
   *
   * The early exit is wrapped in `breakable`, so it ends the run normally instead of
   * escaping as an uncaught control-flow throwable.
   */
  def process: Unit = breakable {
    getFilePathsPerLang(textfilesPaths) foreach { case (lang, textPaths) =>
      val onlp = new OpenNLP(lang)

      for (path <- textPaths) {
        printTfIdf(lang, path, onlp)
        break() // test single file
      }
    }
  }

  /** Reads one file, reduces it to lemmas and prints the resulting TF-IDF rows. */
  private def printTfIdf(lang: String, path: String, onlp: OpenNLP): Unit = {
    // Lines are joined with a space: getLines strips the line terminators, so joining
    // without a separator would glue the last word of a line to the first of the next.
    val text = using(scala.io.Source.fromFile(path)) { source =>
      source.getLines.mkString(" ")
    }
    val unnoisedText = removeTextNoise(text)

    val tokens = onlp.tokenize(unnoisedText)
    val tokensExcludeStopWords = removeStopWords(lang, tokens, stopwordsPerLang)

    val lemmas = onlp.lemmatize(tokensExcludeStopWords)
    // if no lemma ("O") => keep the original token
    val lemmd = tokensExcludeStopWords.zip(lemmas).map { case (token, lemma) =>
      if (lemma != "O") lemma else token
    }

    // TODO: the whole file is currently a single document (id 0)
    val df = spark.createDataFrame(Seq((0, lemmd.toArray))).toDF("id", "words")

    // Term frequencies via CountVectorizer (HashingTF would be the alternative).
    val tf = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .fit(df)
      .transform(df)

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features").fit(tf)

    val tfidf = idf.transform(tf)
    for (item <- tfidf) println(item)
  }

  /**
   * Strips noise from raw text.
   *
   * Steps, in order: drop digits; replace punctuation and standalone 1-2 letter words
   * (plus trailing whitespace) with a space; replace characters of Unicode category C
   * with a space; replace tab / newline / carriage return / form feed / vertical
   * whitespace with a space.
   *
   * @param text raw text
   * @return cleaned text
   */
  def removeTextNoise(text:String) : String = {
    val withoutDigits = text.filter(!_.isDigit)
    // https://stackoverflow.com/questions/30074109/removing-punctuation-marks-form-text-in-scala-spark
    val withoutShortWordsAndPunctuation = withoutDigits.replaceAll("""([\p{Punct}]|\b\p{IsLetter}{1,2}\b)\s*""", " ")
    // https://stackoverflow.com/questions/6198986/how-can-i-replace-non-printable-unicode-characters-in-java
    val withoutControlChars = withoutShortWordsAndPunctuation.replaceAll("""[\p{C}]""", " ")
    withoutControlChars.replaceAll("""[\t\n\r\f\v]""", " ")
  }

  /**
   * Drops the tokens that are stopwords of the given language.
   *
   * @param lang             language code used to pick the stopword list
   * @param tokens           tokens to filter
   * @param stopwordsPerLang stopwords per language code
   * @return tokens that are not in the stopword list, in their original order; when the
   *         language has no stopword list, the tokens are returned unfiltered
   */
  def removeStopWords(lang: String, tokens:Seq[String], stopwordsPerLang : MapLangToStrings) : Seq[String] = {
    // Built once per call so each token costs a hash lookup instead of a list scan.
    val stopwords = stopwordsPerLang.getOrElse(lang, List.empty).toSet
    tokens.filterNot(stopwords.contains)
  }

  /**
   * Groups the given files by the language detected from their first line.
   *
   * Files that are empty or whose language cannot be detected are reported on stdout
   * and left out of the result.
   *
   * @param textfilePaths paths of the files to classify
   * @return map from language code to the paths detected as that language
   * @throws Exception when a file cannot be opened or read (raised by `using`)
   */
  def getFilePathsPerLang(textfilePaths : Seq[String]) : MapLangToStrings = {
    val textfilesPathsPerLang: MapLangToStrings = scala.collection.mutable.Map.empty

    for (file <- textfilePaths) {
      // detect language with first line, TODO: use a few random lines in the middle of the text
      val firstLine: Option[String] = using(scala.io.Source.fromFile(file)) { source =>
        val lines = source.getLines
        if (lines.hasNext) Some(lines.next()) else None // empty file => nothing to detect
      }

      firstLine.flatMap(detectLang(_, stopwordsPerLang)) match {
        case Some(lang) =>
          val paths = textfilesPathsPerLang.getOrElse(lang, List.empty)
          textfilesPathsPerLang.update(lang, paths :+ file)
        case None => println(s"Language was not detected for file: $file")
      }
    }
    textfilesPathsPerLang
  }

  /*
    Before I googled Apache OpenNLP, I implemented custom language recognizer based on -stopwords.txt.
    Since some external libs are using dictionary approach anyway (https://github.com/optimaize/language-detector):
    stopwords are commonly found in the speech,
    stopwords dictionary is relatively small and stopwords of 3 langs provided differ a lot.
  */
  /**
   * Guesses the language of a line by counting, per language, how many of its
   * space-separated words are stopwords of that language (case-insensitive).
   *
   * @param line             text to classify
   * @param stopwordsPerLang stopwords per language code
   * @return the language with the most hits, or `None` when no word is a known stopword
   */
  def detectLang(line : String, stopwordsPerLang : MapLangToStrings) : Option[String] = {
    val hitsPerLang = line.split(" ")
      .flatMap(word => stopwordsPerLang.filter(_._2.exists(_.equalsIgnoreCase(word))).map(_._1))
      .groupBy(lang => lang)
      .map { case (lang, hits) => (lang, hits.size) }

    if (hitsPerLang.isEmpty) None
    else Some(hitsPerLang.maxBy(_._2)._1)
  }
}

/**
 * Thin wrapper around the OpenNLP tokenizer, POS tagger and dictionary lemmatizer.
 *
 * @param tokenizerModel model backing the tokenizer
 * @param posModel       model backing the POS tagger
 * @param lemmatizer     dictionary lemmatizer applied to tagged tokens
 */
class OpenNLP(val tokenizerModel: TokenizerModel, val posModel : POSModel, val lemmatizer : DictionaryLemmatizer) {
  /** Loads all three resources for the given language code through the companion object. */
  def this(lang:String) = this(OpenNLP.loadTokenizerModel(lang), OpenNLP.loadPOSModel(lang), OpenNLP.loadLemmatizer(lang))

  val tokenizer = new TokenizerME(tokenizerModel)
  val posTagger = new POSTaggerME(posModel)

  /**
   * Splits the text into tokens, keeps only those longer than one character
   * (additional cleanup after the regexps) and lower-cases them.
   */
  def tokenize(text: String): Seq[String] =
    tokenizer.tokenizePos(text)
      .map(span => text.substring(span.getStart, span.getEnd))
      .collect { case piece if piece.length > 1 => piece.toLowerCase }

  /** Tags the tokens with the POS tagger and returns the lemmatizer output for each token. */
  def lemmatize(tokens:Seq[String]): Seq[String] = {
    val tokenArray = tokens.toArray
    val posTags = posTagger.tag(tokenArray)
    lemmatizer.lemmatize(tokenArray, posTags)
  }
}

/** Loaders for the per-language OpenNLP resources stored under `notebooks/words/vocabs`. */
object OpenNLP {
  /** Opens `path`, hands the stream to `build` and closes the stream afterwards. */
  private def fromFile[T](path: String)(build: FileInputStream => T): T =
    using(new FileInputStream(path))(build)

  /** Loads the tokenizer model `<lang>-token.bin`. */
  def loadTokenizerModel(lang:String): TokenizerModel =
    fromFile[TokenizerModel](s"notebooks/words/vocabs/$lang-token.bin") { in =>
      new TokenizerModel(in)
    }

  /** Loads the POS model `<lang>-pos-maxent.bin`. */
  def loadPOSModel(lang:String): POSModel =
    fromFile[POSModel](s"notebooks/words/vocabs/$lang-pos-maxent.bin") { in =>
      new POSModel(in)
    }

  /** Loads the lemmatizer dictionary `<lang>-lemmatizer-columns-reordered.txt`. */
  def loadLemmatizer(lang:String): DictionaryLemmatizer =
    fromFile[DictionaryLemmatizer](s"notebooks/words/vocabs/$lang-lemmatizer-columns-reordered.txt") { in =>
      new DictionaryLemmatizer(in)
    }
}

// Build the pipeline via the no-arg constructor (stopwords for NLP.getLangs, files from
// NLP.getFilesPaths) and run it; the results are printed to the cell output.
val nlp = new NLP()
nlp.process

[0,WrappedArray(world, financial, system, come, close, complete, meltdown, one, want, repeat, terrible, crash, can, repeat, avoid, first, instance, need, understand, bring, brink, wrong, good, time, unwittingly, lead, cliff, edge, just, lead, meltdown, certainly, wasn, single, factor, blame, mid, financial, institution, putt, money, new, kind, risky, investment, investment, like, credit, default, swap, difficult, understand, new, york, time, call, arcane, one, article, particular, investment, little, important, factor, technique, call, leverage, use, make, investment, arguably, understand, leverage, key, understanding, meltdown, leverage, essence, just, refer, practice, borrow, money, make, investment, see, work, see, attractive, can, seem, extremely, risky, let, compare, leverage, old, fashioned, investment, say, invest, property, buy, land, worth, period, time, market, value, increase, make, bad, old, day, might, happy, nice, good, wouldn, leverage, economy, tick, quite, nicely, able