In [ ]:
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._

import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import scala.collection.JavaConversions._
import scala.util.control.Breaks._
import java.io._
import java.nio.file.{Files, Path, Paths}

//import opennlp.tools.langdetect._ // custom implementation
import opennlp.tools.lemmatizer.DictionaryLemmatizer
import opennlp.tools.postag.{POSModel, POSTaggerME}
import opennlp.tools.tokenize.{TokenizerME, TokenizerModel}

object Helpers {
    // Local imports keep this block self-contained.
    import scala.language.reflectiveCalls // `using` calls close() through a structural type
    import scala.util.control.NonFatal

    /**
      * Loan pattern: runs `f` on `resource` and always closes the resource afterwards.
      *
      * @param resource any object exposing a `close(): Unit` method
      * @param f        the computation that uses the resource
      * @return the value produced by `f`
      * @throws Exception with message "file exception" when `f` fails with a non-fatal error;
      *                   the original error is attached as the cause so it is not lost
      */
    def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B =
        try {
            f(resource)
        } catch {
            // Only non-fatal errors are wrapped: fatal JVM errors and control-flow throwables
            // (e.g. scala.util.control.Breaks) must propagate untouched.
            case NonFatal(e) => throw new Exception("file exception", e)
        } finally {
            resource.close()
        }
}

import Helpers._
// Mutable map from a language code to the list of strings collected for that language
// (used below both for stopwords per language and for text file paths per language).
type MapLangToStrings = scala.collection.mutable.Map[String, List[String]]

// The notebook does not expose a ready-made sparkContext variable here (unlike the
// "Spark Dataset 101" / "Spark 101" examples), so a local session is created explicitly.
// This could be related to the customDeps bug, since the opennlp dependency was added:
// https://github.com/spark-notebook/spark-notebook/issues/563
val spark = {
  val sessionBuilder = SparkSession.builder()
    .master("local")
    .appName("words")
    .config("spark.driver.allowMultipleContexts", "true")
  sessionBuilder.getOrCreate()
}
val sparkContext = spark.sparkContext

/** Factory helpers that gather the default inputs (languages, stopwords, text files) for [[NLP]]. */
object NLP {
  /** Language codes supported by the pipeline. */
  def getLangs : Seq[String] = Seq("de", "en", "fr")

  /**
    * Reads the stopword list of every given language into memory.
    *
    * The lists are kept in RAM because they are needed for language detection of every
    * input file and are relatively small and quick to read.
    *
    * @param langs language codes; each one is resolved to
    *              `notebooks/words/vocabs/<lang>-stopwords.txt` (one stopword per line)
    * @return mutable map from language code to its stopwords, in file order
    */
  def getStopwordsPerLang(langs : Seq[String]) : MapLangToStrings = {
    val stopwordsPerLang: MapLangToStrings = scala.collection.mutable.Map.empty

    for (lang <- langs) {
      // Read the whole file in one pass instead of re-appending to the list per line,
      // which copied the list on every line (quadratic in the file length).
      val words = using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-stopwords.txt")) { source =>
        source.getLines().toList
      }
      // Appending keeps the result well-defined even if a language code is listed twice.
      stopwordsPerLang.update(lang, stopwordsPerLang.getOrElse(lang, List.empty) ++ words)
    }
    stopwordsPerLang
  }

  /**
    * Lists the `.txt` files found directly inside `notebooks/words/text-data`.
    *
    * @return the paths of the matching files as strings
    */
  def getFilesPaths : Seq[String] = {
    val stream = Files.newDirectoryStream(Paths.get("notebooks/words/text-data"))
    // A DirectoryStream holds an open directory handle and must be closed explicitly.
    try {
      val paths = List.newBuilder[String]
      val entries = stream.iterator()
      while (entries.hasNext) {
        val entry = entries.next()
        if (entry.getFileName.toString.endsWith(".txt")) paths += entry.toString
      }
      paths.result()
    } finally {
      stream.close()
    }
  }
}

/**
  * Text-processing pipeline: detects the language of each text file, cleans and tokenizes
  * the text, removes stopwords, lemmatizes the tokens and prints the resulting vocabulary.
  *
  * @param stopwordsPerLang stopwords for every supported language, keyed by language code
  * @param textfilesPaths   paths of the text files to process
  */
class NLP(val stopwordsPerLang: MapLangToStrings, val textfilesPaths: Seq[String]) {
  /** Builds the pipeline from the default stopword lists and text files (see object NLP). */
  def this() = this(NLP.getStopwordsPerLang(NLP.getLangs), NLP.getFilesPaths)

  /**
    * Main loop. TODO: refactor into function and call it async.
    *
    * Currently stops after the first processed file (test mode).
    */
  def process: Unit = {
    // `break` only works inside `breakable`; without it the control exception escaped
    // from `process` instead of just ending the loop.
    breakable {
      for ((lang, textsPaths) <- getFilePathsPerLang(textfilesPaths)) {
        //val lemmas = readLemmas(lang)
        val onlp = new OpenNLP(lang)

        for (path <- textsPaths) {
          printVocabulary(lang, onlp, path)
          break() // test single file
        }
      }
    }
  }

  /** Cleans, tokenizes and lemmatizes one file, then prints its CountVectorizer vocabulary. */
  private def printVocabulary(lang: String, onlp: OpenNLP, path: String): Unit = {
    using(scala.io.Source.fromFile(path)) { source =>
      // Join lines with a space: joining without a separator fused the last word of a
      // line with the first word of the next one.
      val text = source.getLines().mkString(" ")
      val unnoisedText = removeTextNoise(text)

      val tokens = onlp.tokenize(unnoisedText)
      val tokensExcludeStopWords = removeStopWords(lang, tokens, stopwordsPerLang)

      val lemmas = onlp.lemmatize(tokensExcludeStopWords)
      // "O" marks a token without a known lemma => keep the original token
      val lemmd = (tokensExcludeStopWords zip lemmas).map {
        case (token, lemma) => if (lemma != "O") lemma else token
      }

      val df = spark.createDataFrame(Seq((0, lemmd.toArray))).toDF("id", "words")
      val model: CountVectorizerModel = new CountVectorizer()
        .setInputCol("words")
        .setOutputCol("features")
        .fit(df)

      for (item <- model.vocabulary) println(item)
    }
  }

  /**
    * Strips noise from raw text: digits, punctuation, words of one or two letters,
    * non-printable characters and whitespace escape sequences.
    *
    * @param text raw text
    * @return the text with the removed parts replaced by spaces (digits are simply dropped)
    */
  def removeTextNoise(text:String) : String = {
    val removedNumbers = text.filter(!_.isDigit)
    // https://stackoverflow.com/questions/30074109/removing-punctuation-marks-form-text-in-scala-spark
    val removedWordsOfSizeLessEqual2AndPunctuation = removedNumbers.replaceAll("""([\p{Punct}]|\b\p{IsLetter}{1,2}\b)\s*""", " ")
    // https://stackoverflow.com/questions/6198986/how-can-i-replace-non-printable-unicode-characters-in-java
    val removedUnicodes = removedWordsOfSizeLessEqual2AndPunctuation.replaceAll("""[\p{C}]""", " ")
    removedUnicodes.replaceAll("""[\t\n\r\f\v]""", " ")
  }

  /**
    * Drops every token that is a stopword of the given language.
    *
    * @param lang             language code of the tokens
    * @param tokens           tokens to filter
    * @param stopwordsPerLang stopwords keyed by language code
    * @return the tokens that are not stopwords; all tokens when the language has no stopword list
    */
  def removeStopWords(lang: String, tokens:Seq[String], stopwordsPerLang : MapLangToStrings) : Seq[String] = {
    // Resolve the list once and use a Set so each token costs one hash lookup
    // instead of a linear scan of the stopword list.
    val stopwords = stopwordsPerLang.getOrElse(lang, List.empty).toSet
    tokens.filterNot(stopwords.contains)
  }

  /**
    * Groups the given files by detected language.
    *
    * The language is detected from the first line of each file.
    * TODO: better to use a few random lines in the middle of the text.
    *
    * @param textfilePaths paths of the files to classify
    * @return mutable map from language code to the files detected as that language;
    *         files whose language cannot be detected are reported on stdout and skipped
    */
  def getFilePathsPerLang(textfilePaths : Seq[String]) : MapLangToStrings = {
    val textfilesPathsPerLang: MapLangToStrings = scala.collection.mutable.Map.empty

    for (file <- textfilePaths) {
      using(scala.io.Source.fromFile(file)) { source =>
        val lines = source.getLines()
        // An empty file has no first line; treat it as "language not detected".
        val detected = if (lines.hasNext) detectLang(lines.next(), stopwordsPerLang) else None
        detected match {
          case Some(lang) =>
            val paths = textfilesPathsPerLang.getOrElse(lang, List.empty)
            textfilesPathsPerLang.update(lang, paths :+ file)
          case None => println(s"Language was not detected for file: $file")
        }
      }
    }
    textfilesPathsPerLang
  }

  /*
    Before I googled Apache OpenNLP, I implemented a custom language recognizer based on -stopwords.txt.
    Some external libs use a dictionary approach anyway (https://github.com/optimaize/language-detector):
    stopwords are commonly found in speech,
    the stopwords dictionary is relatively small and the stopwords of the 3 provided langs differ a lot.
  */
  /**
    * Detects the language of a line by counting, per language, how many of its
    * space-separated words are stopwords of that language (case-insensitive).
    *
    * @param line             text to classify
    * @param stopwordsPerLang stopwords keyed by language code
    * @return the language with the most stopword hits, or None when no word matched
    */
  def detectLang(line : String, stopwordsPerLang : MapLangToStrings) : Option[String] = {
    val hitsPerLang = line.split(" ")
      .flatMap(word => stopwordsPerLang.filter(_._2.exists(_.equalsIgnoreCase(word))).map(_._1))
      .groupBy(lang => lang)
      .map { case (lang, hits) => (lang, hits.size) }
    if (hitsPerLang.isEmpty) None
    else Some(hitsPerLang.maxBy(_._2)._1)
  }
}

/**
  * Bundles the OpenNLP tools used for one language: a tokenizer, a POS tagger and a
  * dictionary lemmatizer.
  *
  * @param tokenizerModel model backing the tokenizer
  * @param posModel       model backing the POS tagger
  * @param lemmatizer     dictionary lemmatizer fed with tokens and their POS tags
  */
class OpenNLP(val tokenizerModel: TokenizerModel, val posModel : POSModel, val lemmatizer : DictionaryLemmatizer) {
  /** Loads all three resources for the given language code via the OpenNLP companion object. */
  def this(lang:String) = this(OpenNLP.loadTokenizerModel(lang), OpenNLP.loadPOSModel(lang), OpenNLP.loadLemmatizer(lang))

  val tokenizer = new TokenizerME(tokenizerModel)
  val posTagger = new POSTaggerME(posModel)

  /**
    * Splits the text into tokens, keeping only tokens longer than one character
    * (additional cleanup after the regexps) and lower-casing them.
    */
  def tokenize(text: String): Seq[String] = {
    val spans = tokenizer.tokenizePos(text)
    spans
      .map(span => text.substring(span.getStart, span.getEnd))
      .collect { case token if token.length > 1 => token.toLowerCase }
  }

  /** POS-tags the tokens and returns the lemmatizer's output for each token/tag pair. */
  def lemmatize(tokens:Seq[String]): Seq[String] = {
    val tokenArray = tokens.toArray
    val posTags = posTagger.tag(tokenArray)
    lemmatizer.lemmatize(tokenArray, posTags)
  }
}

/** Loaders for the per-language OpenNLP resources stored under notebooks/words/vocabs. */
object OpenNLP {
  /** Opens the given file, hands the stream to `build` and closes the stream afterwards. */
  private def fromFile[T](path: String)(build: InputStream => T): T =
    using(new FileInputStream(path)) { stream => build(stream) }

  /** Loads the tokenizer model of the given language. */
  def loadTokenizerModel(lang:String): TokenizerModel =
    fromFile(s"notebooks/words/vocabs/$lang-token.bin")(in => new TokenizerModel(in))

  /** Loads the maxent POS model of the given language. */
  def loadPOSModel(lang:String): POSModel =
    fromFile(s"notebooks/words/vocabs/$lang-pos-maxent.bin")(in => new POSModel(in))

  /** Loads the dictionary lemmatizer of the given language from the column-reordered dictionary. */
  def loadLemmatizer(lang:String): DictionaryLemmatizer =
    fromFile(s"notebooks/words/vocabs/$lang-lemmatizer-columns-reordered.txt")(in => new DictionaryLemmatizer(in))
}

// Entry point of the cell: build the pipeline from the default stopword lists and text
// files, then run it. `process` prints the vocabulary of the processed text to stdout.
val nlp = new NLP()
nlp.process
// Disabled one-off preprocessing script, kept for reference. For every supported language
// it reads the tab-separated `<lang>-lemmatizer.txt` and writes
// `<lang>-lemmatizer-columns-reordered.txt` with the second and third columns swapped;
// the reordered file is the one loaded by OpenNLP.loadLemmatizer.
/*
import scala.io.Source
import java.io._
for (lang <- NLP.getLangs) {
      val writer = new PrintWriter(new File(s"notebooks/words/vocabs/$lang-lemmatizer-columns-reordered.txt"))
      using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-lemmatizer.txt")) { source => 
        for (line <- source.getLines) {
           val array = line.split("\t")
           writer.write(s"${array(0)}\t${array(2)}\t${array(1)}\n")
        }
      }
      writer.close()
}
*/

investment
make
leverage
money
time
property
leveraged
loan
can
good
old
owe
borrow
value
one
lot
financial
way
like
lose
profit
say
market
put
worth
understand
buy
might
tear
meltdown
risky
land
institution
ratio
find
downturn
invest
interest
get
call
hair
also
just
fashion
seem
want
start
increase
close
crisis
another
burst
repeat
lead
normal
economy
amount
little
bubble
bad
let
person
even
day
come
factor
need
new
now
see
drop
technique
system
back
still
sensible
shit
instead
leave
period
kind
certainly
selling
policy
supposedly
look
relatively
attractive
hilt
wave
link
lesson
surf
doubting
loss
mortgage
however
edge
article
nasty
internet
guy
credit
closely
wrong
run
unreasonable
okay
senseless
world
strict
stop
bankrupt
particular
stuff
happen
tick
mac
sum
onto
john
company
nicely
able
refer
practice
many
putt
voice
lend
insist
become
wouldn
pay
long
small
level
tidal
crash
total
use
government
powerful
point
unwittingly
reach
single
original
fashioned
instance
haven
ill
standard
