In [ ]:
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._

//import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

// import org.apache.spark.ml.feature.StopWordsRemover // TODO:

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

//import org.apache.spark.mllib.feature.HashingTF._
//import org.apache.spark.mllib.feature.IDF._

//import org.apache.spark.mllib.linalg.Vectors
//import org.apache.spark.mllib.linalg.Vector
//import org.apache.spark.sql.Row

//import org.apache.spark.sql.SQLContext // test

import scala.collection.JavaConversions._
import scala.util.control.Breaks._
import java.io._
import java.nio.file.{Files, Path, Paths}

import opennlp.tools.langdetect._
import opennlp.tools.lemmatizer.DictionaryLemmatizer
import opennlp.tools.postag.{POSModel, POSTaggerME}
import opennlp.tools.tokenize.{TokenizerME, TokenizerModel}
//import opennlp.tools.util.Span

object Helpers {
    def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B =
        try {
            f(resource)
        } finally {
            resource.close()
        }
}

import Helpers._

val de : String = "de"
val en : String = "en"
val fr : String = "fr"
val langs : List[String] = List(de, en, fr)


val conf = new SparkConf().setAppName("words")
conf.set("spark.driver.allowMultipleContexts", "true");
conf.setMaster("local");
val sparkContext = new SparkContext(conf)
// since sparkContext is unavailable as var here, I can't just use it like in "Spark Dataset 101", "Spark 101"
// could be related to customDepts bug, because I added opennlp dependency:
// https://github.com/spark-notebook/spark-notebook/issues/563

val spark = SparkSession.builder().appName("words").master("local").getOrCreate()

def detectLang(line : String, dicts : scala.collection.mutable.Map[String, List[String]]) : Option[String] = {
    val langs = line.split(" ").flatMap(item => dicts.filter(_._2.exists(_.equalsIgnoreCase(item))).map(_._1))
                    .groupBy(f => f)
                    .map(g => (g._1, g._2.size))
    if(langs.isEmpty) None
    else Some(langs.maxBy(_._2)._1)
}

val stopwordsPerLang: scala.collection.mutable.Map[String, List[String]] = scala.collection.mutable.Map.empty 
// Stopwords per each lang, are all stored in RAM, because used for all supported languages detection & they're relatively small, quick to read.  

for(lang <- langs) { // main loop. TODO: refactor into function and call it async
  using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-stopwords.txt")) { source => 
    for (line <- source.getLines) {
      val list = stopwordsPerLang.getOrElse(lang, List.empty)
      stopwordsPerLang.update(lang, list:+line)  
    }
  }
}

val textfilesPathsPerLang: scala.collection.mutable.Map[String, List[String]] = scala.collection.mutable.Map.empty
val textfiles = Files.newDirectoryStream(Paths.get("notebooks/words/text-data"))
                     .filter(_.getFileName.toString.endsWith(".txt"))

for(filePath <- textfiles) {
  val file = filePath.toString
  using(scala.io.Source.fromFile(file)) { source => 
    val firstLine = source.getLines.next() // get single first line & detect language with it, TODO: better to use a few random lines in the middle of the text
    detectLang(firstLine, stopwordsPerLang) match  {
      case Some(lang) => {
        var list = textfilesPathsPerLang.getOrElse(lang, List.empty)
        textfilesPathsPerLang.update(lang, list:+file)        
      }
      case None => println("Language was not detected for file: $file")
    }                                              
  }
}
/*
case class Lemma(entry:String, default:String, pos:String)
object Lemma {
  def parse(line:String): Option[Lemma] = {
    line.split("\t") match { 
      case Array(f,s,t,_*) => Some(Lemma(f, s, t))
      case _ => None
    }
  }  
}
*/
// ?
def removeTextNoise(text:String) : String = {
  val removedNumbers = text.filter(!_.isDigit)
  // https://stackoverflow.com/questions/30074109/removing-punctuation-marks-form-text-in-scala-spark
  val removedWordsOfSizeLessEqual2AndPunctuation = removedNumbers.replaceAll("""([\p{Punct}&&[^.]]|\b\p{IsLetter}{1,2}\b)\s*""", "")
  // https://stackoverflow.com/questions/6198986/how-can-i-replace-non-printable-unicode-characters-in-java
  val removedUnicodes = removedWordsOfSizeLessEqual2AndPunctuation.replaceAll("""[\p{C}]""", "")
  val replacedEscapeSeqWithSpace =  removedUnicodes.replaceAll("""[\t\n\r\f\v]""", " ")
  replacedEscapeSeqWithSpace
}


/*
def readLemmas(lang:String) : scala.collection.mutable.ListBuffer[Lemma] = {
  var lemmas : scala.collection.mutable.ListBuffer[Lemma] = scala.collection.mutable.ListBuffer.empty  
  using(scala.io.Source.fromFile(s"notebooks/words/vocabs/$lang-lemmatizer.txt")) { source => 
    for (line <- source.getLines) {
      Lemma.parse(line) match { 
        case Some(lemma) => lemmas+=lemma
        case _ =>;
      }      
    }
  }
  lemmas
}
*/
/*
  def initTokenizer(lang:String) = {
    using(new FileInputStream(s"notebooks/words/vocabs/$lang-token.bin")) { stream =>
      val model : TokenizerModel = new TokenizerModel(stream)
      new TokenizerME(model);
    }
*/
class OpenNlpTokenizer(val model: TokenizerModel) {//extends Tokenizer {
  def this(lang:String) = this(OpenNlpTokenizer.loadDefaultModel(lang))

  val tokenizer = new TokenizerME(model)

  def tokenize(text: String): Seq[String] = {
    val positions = tokenizer.tokenizePos(text)
    val strings = positions.map {
      pos => text.substring(pos.getStart, pos.getEnd)
    }
    //assume(positions.length == strings.length)
    //for ((pos, string) <- (positions.iterator zip strings.iterator).toList)
    //yield new Token(string, pos.getStart)
    strings
  }  
}

object OpenNlpTokenizer {
  def loadDefaultModel(lang:String): TokenizerModel = {
    using(new FileInputStream(s"notebooks/words/vocabs/$lang-token.bin")) { stream =>
      new TokenizerModel(stream)
    }
  }
}
/*
object OpenNlpTokenizerMain extends TokenizerMain {
  val tokenizer = new OpenNlpTokenizer()
}
*/

def removeStopWords(lang: String, tokens:Seq[String]) : Seq[String] = {
   tokens.filter(!stopwordsPerLang(lang).contains(_))
}


for ((lang,textsPaths) <- textfilesPathsPerLang) { // main loop. TODO: refactor into function and call it async
   //val lemmas = readLemmas(lang)
  val tokenizer = new OpenNlpTokenizer(lang)
  
  for (paths <- textsPaths) {
    using(scala.io.Source.fromFile(paths)) { source => 
      val text = source.getLines.mkString // getLines.toList for RDD...
      val unnoisedText = removeTextNoise(text)                                      
      val tokens = tokenizer.tokenize(unnoisedText)
      val tokensWithoutStopWords = removeStopWords(lang, tokens)
      //val sqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
      //import sqlContext.implicits._
         // : RDD[Seq[String]]                                    
      //val rdd = sparkContext.parallelize(tokens)
                                            //.map{ x:Row => x.getAs[String](0)}
                                            //.map(x => Tuple1(x.split(","))) // wrapping
      //val tf: RDD[Vector] = new HashingTF().transform(rdd)
      //tf.cache()                                                                   
      //val idf = new IDF().fit(tf)
      //val tfidf: RDD[Vector] = idf.transform(tf)
      //tfidf.foreach(x => println(x))
      
      //val tf = new HashingTF().transform(rdd)
      //val idf = new IDF().fit(tf)
      //val tfidf = idf.transform(tf)
      /*for(item <- tf){
        print(item)
      }*/
      //rescaledData.select("features").show()
      
      //val spark: SparkSession
      // Seq(1, tokens)
      val df = spark.createDataFrame(Array((1, tokensWithoutStopWords))).toDF("id", "tokens") 

      val model: CountVectorizerModel = new CountVectorizer()
          .setInputCol("tokens")
          .setOutputCol("features")
          .fit(df)
      model.vocabulary
                                                      
    }
  }  
}

org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 1 times, most recent failure: Lost task 0.0 in stage 4.0 (TID 2, localhost, executor driver): java.util.NoSuchElementException: None.get
	at scala.None$.get(Option.scala:347)
	at scala.None$.get(Option.scala:345)
	at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
	at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:670)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:289)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage