## Import modules

In [1]:
classpath.addPath("tinyir-1.1.jar")



In [2]:
import scala.xml.XML
import ch.ethz.dal.tinyir._
import com.github.aztek.porterstemmer.PorterStemmer

[32mimport [36mscala.xml.XML[0m
[32mimport [36mch.ethz.dal.tinyir._[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m

In [3]:
// import scala.io.Source  // for importing txt files
import java.io._  // for saving txt files
// import scala.collection.mutable.HashMap  //HashMap used for counting elements in linear time

[32mimport [36mjava.io._[0m

In [4]:
// import scala.util.Random
import scala.collection.mutable.{Map => MutMap}
// enables "mutable lists"
import scala.collection.mutable.ListBuffer  

[32mimport [36mscala.collection.mutable.{Map => MutMap}[0m
[32mimport [36mscala.collection.mutable.ListBuffer[0m

## Define classes and functions

In [None]:
/** Accessor for a single XML document stored at `file_path`.
  *
  * The file is parsed at most once, on first use; every accessor below
  * reads from that cached tree instead of re-loading the file.
  */
class xml_doc (file_path: String) {
    // Parsed lazily so constructing an xml_doc stays cheap and never touches the disk.
    private lazy val parsed: xml.Elem = XML.loadFile(file_path)

    /** Returns the parsed XML tree of the document. */
    def get_doc(): xml.Elem = parsed

    /** Concatenated text of every TEXT element found under DOC. */
    def text() = {
        (get_doc() \\ "DOC" \\ "TEXT").text
    }

    /** Concatenated text of every HEAD element found under DOC. */
    def head() = {
        (get_doc() \\ "DOC" \\ "HEAD").text
    }

    /** Trimmed text of the DOCNO element(s) found under DOC. */
    def id() = {
        (get_doc() \\ "DOC" \\ "DOCNO").text.trim
    }

    // tokenizes the text, remove stop words and stems
    // based on ch.ethz.dal.tinyir.processing
    // and on com.github.aztek.porterstemmer
    def token_stem() = {
        // create tokens from headline and body
        val rawTokens = processing.Tokenizer.tokenize(head() ++ text())
        // filter out any token which is a StopWord
        val contentTokens = processing.StopWords.filterOutSW(rawTokens)
        // stem, strip every non-letter character, then drop tokens left empty
        val stemmed = contentTokens.map(x => PorterStemmer.stem(x).replaceAll("\\P{L}+", ""))
        stemmed.filter(_.trim.nonEmpty)
    }
}

In [None]:
/** Lists the entries of directory `path` as path strings.
  *
  * `File.listFiles` returns null when `path` is not a directory or cannot be
  * read; that case yields an empty array instead of a NullPointerException.
  */
def list_docs (path: String): Array[String] = {  // : Array[java.io.File]
    val entries = Option(new java.io.File(path).listFiles)
    entries.map(files => files.map(x => x.toString())).getOrElse(Array.empty[String])
}
// Matches the first run of decimal digits in a string.
val numPattern = "[0-9]+".r

In [None]:
/** Writes one line per map entry to `filename`: the key followed by the three
  * tuple values, all separated by single spaces.
  *
  * The writer is closed even when a write fails.
  *
  * @param data     key -> three scores
  * @param filename path of the file to (over)write
  */
def write(data: MutMap[Double, (Double, Double, Double)],filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    try {
        for ((key, scores) <- data) {
            val rendered = scores.productIterator.mkString(" ")
            bw.write(s"$key $rendered")
            bw.newLine()
        }
    } finally {
        bw.close()
    }
}

/** Writes one line per document to `filename`: the document id followed by
  * its predicted labels, separated by single spaces.
  *
  * Documents with an empty label set are skipped. The writer is closed even
  * when a write fails.
  *
  * @param data     document id -> predicted labels
  * @param filename path of the file to (over)write
  */
def write_prediction(data: MutMap[Int, Set[String]],filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    try {
        for ((docId, labels) <- data if labels.nonEmpty) {
            val rendered = labels.mkString(" ")
            bw.write(s"$docId $rendered")
            bw.newLine()
        }
    } finally {
        bw.close()
    }
}

# Importing data files

In [None]:
val train_list = list_docs("documents")

In [None]:
// Tokenise the first three listed documents and wrap each one in a
// StringDocument whose id is its 1-based position in the listing.
val doc_list = new ListBuffer[processing.StringDocument]()
var counter = 0
train_list.take(3).foreach { path =>
    counter += 1
    val stemmedTokens = new xml_doc(path).token_stem
    doc_list += (new processing.StringDocument(counter, stemmedTokens.mkString(" ")))
}

In [None]:
val doc_stream = doc_list.toStream

In [None]:
val test_pos_index = new indexing.PosIndex(doc_stream)

In [None]:
test_pos_index.index
// test_pos_index.postings(doc_stream)

In [22]:
doc_stream(2).body

: 

In [None]:

// Scratch notes: we want a DocStream.

In [None]:
var t = io.DocStream.getStream("documents/AP880212-0006")

In [None]:
var t = new java.io.FileInputStream("documents")

In [None]:
public FileInputStream(FileDescriptor "documents")

In [None]:
var t = new java.io.File("documents").listFiles

In [None]:
for (doc in t){
    doc_list +=
}

In [5]:
val t = new io.DirStream("documents")

[36mt[0m: [32mio[0m.[32mDirStream[0m = ch.ethz.dal.tinyir.io.DirStream@37de0d29

In [6]:
t.stream

[36mres5[0m: [32mStream[0m[[32mInputStream[0m] = [33mStream[0m(
  java.io.BufferedInputStream@5657b756,
  java.io.BufferedInputStream@5334f8f6,
  java.io.BufferedInputStream@2e99bdf0,
  java.io.BufferedInputStream@4127f1ed,
  java.io.BufferedInputStream@5dd8c1e0,
  java.io.BufferedInputStream@295f71e5,
  java.io.BufferedInputStream@4a4a3f03,
  java.io.BufferedInputStream@341c073e,
  java.io.BufferedInputStream@2f50079c,
  java.io.BufferedInputStream@7d528bca,
  java.io.BufferedInputStream@66fb0144,
  java.io.BufferedInputStream@328acf52,
  java.io.BufferedInputStream@1d495278,
  java.io.BufferedInputStream@742c86c0,
  java.io.BufferedInputStream@4ae35820,
  java.io.BufferedInputStream@2ef7fe71,
  java.io.BufferedInputStream@797aa345,
  java.io.BufferedInputStream@6970202c,
  java.io.BufferedInputStream@3902477a,
[33m...[0m

In [84]:
collection(1).tokens

[36mres83[0m: [32mList[0m[[32mString[0m] = [33mList[0m(
  [32m"strategist"[0m,
  [32m"jack"[0m,
  [32m"kemp"[0m,
  [32m"presidentialcampaign"[0m,
  [32m"georg"[0m,
  [32m"bush"[0m,
  [32m"poor"[0m,
  [32m"showe"[0m,
  [32m"iowa"[0m,
  [32m"coupl"[0m,
  [32m"withkemp"[0m,
  [32m"tough"[0m,
  [32m"talk"[0m,
  [32m"ad"[0m,
  [32m"bob"[0m,
  [32m"dole"[0m,
  [32m"put"[0m,
  [32m"kemp"[0m,
  [32m"therun"[0m,
[33m...[0m

In [105]:
val collection = new io.TipsterStream("resources").stream

[36mcollection[0m: [32mStream[0m[[32mXMLDocument[0m] = [33mStream[0m(
  ch.ethz.dal.tinyir.processing.TipsterParse@6c351c10,
  ch.ethz.dal.tinyir.processing.TipsterParse@362cc1a,
  ch.ethz.dal.tinyir.processing.TipsterParse@594290eb,
  ch.ethz.dal.tinyir.processing.TipsterParse@776b5610,
  ch.ethz.dal.tinyir.processing.TipsterParse@5d02e740,
  ch.ethz.dal.tinyir.processing.TipsterParse@4ed8966e,
  ch.ethz.dal.tinyir.processing.TipsterParse@4bbb5a4c,
  ch.ethz.dal.tinyir.processing.TipsterParse@4311b1a2,
  ch.ethz.dal.tinyir.processing.TipsterParse@67b73982,
  ch.ethz.dal.tinyir.processing.TipsterParse@7d940d48,
  ch.ethz.dal.tinyir.processing.TipsterParse@311c9cd6,
  ch.ethz.dal.tinyir.processing.TipsterParse@55b48478,
  ch.ethz.dal.tinyir.processing.TipsterParse@7ecf6600,
  ch.ethz.dal.tinyir.processing.TipsterParse@34bd1776,
  ch.ethz.dal.tinyir.processing.TipsterParse@6a05e212,
  ch.ethz.dal.tinyir.processing.TipsterParse@1314337f,
  ch.ethz.dal.tinyir.processing.TipsterPars

In [106]:
val collection2 = new YMStream("resources").stream

[36mcollection2[0m: [32mStream[0m[[32mXMLDocument[0m] = [33mStream[0m(
  cmd90$$user$YMParse@422aec92,
  cmd90$$user$YMParse@7ae637a2,
  cmd90$$user$YMParse@7d5aa0f9,
  cmd90$$user$YMParse@262b106,
  cmd90$$user$YMParse@7aa701f7,
  cmd90$$user$YMParse@2cf85e54,
  cmd90$$user$YMParse@479debe0,
  cmd90$$user$YMParse@7ebe3bae,
  cmd90$$user$YMParse@7f49446f,
  cmd90$$user$YMParse@3dc03467,
  cmd90$$user$YMParse@338aa4ff,
  cmd90$$user$YMParse@7ee11a8a,
  cmd90$$user$YMParse@6aa3341a,
  cmd90$$user$YMParse@6760c017,
  cmd90$$user$YMParse@4d9a0103,
  cmd90$$user$YMParse@619df9fe,
  cmd90$$user$YMParse@344af29,
  cmd90$$user$YMParse@369f0720,
  cmd90$$user$YMParse@22207762,
[33m...[0m

In [117]:
collection(1).tokens.size

[36mres116[0m: [32mInt[0m = [32m473[0m

In [116]:
collection2(1).tokens.size

[36mres115[0m: [32mInt[0m = [32m291[0m

In [23]:
Helper.stemTokens(collection(1).tokens)

[36mres22[0m: [32mList[0m[[32mString[0m] = [33mList[0m(
  [32m"strategist"[0m,
  [32m"for"[0m,
  [32m"jack"[0m,
  [32m"kemp"[0m,
  [32m"presidentialcampaign"[0m,
  [32m"sai"[0m,
  [32m"georg"[0m,
  [32m"bush"[0m,
  [32m"poor"[0m,
  [32m"showe"[0m,
  [32m"iowa"[0m,
  [32m"coupl"[0m,
  [32m"withkemp"[0m,
  [32m"tough-talk"[0m,
  [32m"ad"[0m,
  [32m"against"[0m,
  [32m"bob"[0m,
  [32m"dole"[0m,
  [32m"could"[0m,
[33m...[0m

In [99]:
/** Turns raw text into a list of stemmed content tokens.
  *
  * Every run of non-letter characters is replaced by a single space before
  * tokenising; stop words are removed, the remaining tokens are stemmed, and
  * tokens that end up blank are dropped.
  */
def token_filter(text_body: String) = {
    val lettersOnly = text_body.replaceAll("\\P{L}+", " ")
    val rawTokens = processing.Tokenizer.tokenize(lettersOnly)
    val contentTokens = processing.StopWords.filterOutSW(rawTokens)
    val stemmed = contentTokens.map(token => PorterStemmer.stem(token))
    stemmed.filter(_.trim.nonEmpty).toList
}

defined [32mfunction [36mtoken_filter[0m

In [76]:
collection(1).body

[36mres75[0m: [32mString[0m = [32m"   Strategists for Jack Kemp's presidentialcampaign say George Bush's poor showing in Iowa, coupled withKemp's tough-talking ads against Bob Dole, could put Kemp in therunning for the Republican nomination.   Before last Monday's Iowa caucuses, Kemp had been on a roll inNew Hampshire, using an effective advertising campaign and theendorsement of the influential Concord Monitor to help broadensupport.   But even as his bid to become the conservative alternative toDole and Bush took shape, the New York congressman faced thepossibility of being swept out of the race by former televisionevangelist Pat Robertson.   Using his second-place finish in Iowa _ Kemp came in fourth _Robertson was trying to move Kemp aside and take the mantle of trueconservative. A poll of New Hampshire voters by the Boston GlobeThursday night indicated Robertson had drawn even with Kemp amongpotential Republican voters in next Tuesday's primary.   Charles Douglas, Kemp's New 

In [101]:
token_filter(collection(1).body).size

[36mres100[0m: [32mInt[0m = [32m286[0m

In [74]:
collection(1).content

[36mres73[0m: [32mString[0m = [32m"   Strategists for Jack Kemp's presidentialcampaign say George Bush's poor showing in Iowa, coupled withKemp's tough-talking ads against Bob Dole, could put Kemp in therunning for the Republican nomination.   Before last Monday's Iowa caucuses, Kemp had been on a roll inNew Hampshire, using an effective advertising campaign and theendorsement of the influential Concord Monitor to help broadensupport.   But even as his bid to become the conservative alternative toDole and Bush took shape, the New York congressman faced thepossibility of being swept out of the race by former televisionevangelist Pat Robertson.   Using his second-place finish in Iowa _ Kemp came in fourth _Robertson was trying to move Kemp aside and take the mantle of trueconservative. A poll of New Hampshire voters by the Boston GlobeThursday night indicated Robertson had drawn even with Kemp amongpotential Republican voters in next Tuesday's primary.   Charles Douglas, Kemp's New 

In [64]:
var t = "abcdefghijklmmnopq"

[36mt[0m: [32mString[0m = [32m"abcdefghijklmmnopq"[0m

In [30]:
val test_pos_index = new indexing.PosIndex(collection)

: 

In [38]:
// Report JVM heap usage in mebibytes (integer division, so values are rounded down).
val mb = 1024*1024
val runtime = Runtime.getRuntime
// Each line queries the runtime again, so the figures are sampled at slightly different moments.
println(s"Used Memory:  " + (runtime.totalMemory - runtime.freeMemory) / mb)
println(s"Free Memory:  " + runtime.freeMemory / mb)
println(s"Total Memory: " + runtime.totalMemory / mb)
println(s"Max Memory:   " + runtime.maxMemory / mb)

Used Memory:  1545
Free Memory:  1509
Total Memory: 3055
Max Memory:   3641


[36mmb[0m: [32mInt[0m = [32m1048576[0m
[36mruntime[0m: [32mRuntime[0m = java.lang.Runtime@45e471e3

In [53]:
collection(1).title

[36mres52[0m: [32mString[0m = [32m""[0m

In [104]:
/** Tipster document whose token list is produced by `token_filter` applied to
  * the document content, replacing the tokenisation of the parent class.
  */
class YMParse(is: InputStream) extends processing.TipsterParse(is) {
  override def tokens: List[String] = {
    val rawContent = content
    token_filter(rawContent)
  }
}

defined [32mclass [36mYMParse[0m

In [92]:
import ch.ethz.dal.tinyir.processing.TipsterParse
import ch.ethz.dal.tinyir.processing.Tokenizer
import ch.ethz.dal.tinyir.processing.XMLDocument

/** Stream of [[YMParse]] documents read from the zip files under `path`.
  *
  * NOTE(review): `ext` is accepted but never forwarded; the underlying
  * ZipDirStream is always built with "" — confirm whether that is intended.
  */
class YMStream (path: String, ext: String = "") extends io.ParsedXMLStream(new io.ZipDirStream(path, "")){
  /** Wraps every raw input stream in a YMParse. */
  def stream : Stream[XMLDocument] = unparsed.stream.map { input => new YMParse(input) }

  /** Length reported by the underlying unparsed stream. */
  def length = unparsed.length
}



[32mimport [36mch.ethz.dal.tinyir.processing.TipsterParse[0m
[32mimport [36mch.ethz.dal.tinyir.processing.Tokenizer[0m
[32mimport [36mch.ethz.dal.tinyir.processing.XMLDocument[0m
defined [32mclass [36mYMStream[0m

In [104]:

/** XML document that exposes the TEXT element as body/content and the
  * DOCNO element (letters and digits only) as name; title and date are empty.
  */
class TipsterParse(is: InputStream) extends XMLDocument(is) {
  override def title: String = ""
  override def date: String = ""

  override def body: String = {
    val textNodes = doc.getElementsByTagName("TEXT")
    read(textNodes)
  }

  override def name: String = {
    val rawName = read(doc.getElementsByTagName("DOCNO"))
    // keep only letters and digits of the raw DOCNO text
    rawName.filter(ch => ch.isLetterOrDigit)
  }

  override def content: String = body
}

/** Manual smoke test: parses one hard-coded sample file and prints its name
  * and the first/last 20 characters of its content.
  */
object TipsterParse {
  // Explicit `: Unit =` replaces the deprecated procedure syntax.
  def main(args: Array[String]): Unit = {
    val dirname = "/Users/thofmann/Data/Tipster/sample"
    val fname = dirname + "/DOE2-84-0001"
    val parse = new TipsterParse(DocStream.getStream(fname))
    val name = parse.name
    println(name)
    val content = parse.content
    println(content.take(20) + "..." + content.takeRight(20))
  }
}
    

: 

In [None]:
package ch.ethz.dal.tinyir.io

import ch.ethz.dal.tinyir.processing.TipsterParse
import ch.ethz.dal.tinyir.processing.Tokenizer
import ch.ethz.dal.tinyir.processing.XMLDocument

/** Stream of [[TipsterParse]] documents read from the zip files under `path`.
  *
  * NOTE(review): `ext` is accepted but never forwarded; the underlying
  * ZipDirStream is always built with "" — confirm whether that is intended.
  */
class TipsterStream (path: String, ext: String = "")
extends ParsedXMLStream(new ZipDirStream(path, "")){
  /** Wraps every raw input stream in a TipsterParse. */
  def stream : Stream[XMLDocument] = unparsed.stream.map { input => new TipsterParse(input) }

  /** Length reported by the underlying unparsed stream. */
  def length = unparsed.length
}

/** Manual smoke test: counts characters and tokens over the first 10000
  * documents of a hard-coded zip directory.
  */
object TipsterStream  {

  // Explicit `: Unit =` replaces the deprecated procedure syntax.
  def main(args: Array[String]): Unit = {
    val tipster = new TipsterStream ("/Users/thofmann/Data/Tipster/zips")
    println("Number of files in zips = " + tipster.length)

    // Accumulate both totals as Long in a single pass over the stream.
    val (length, tokens) = tipster.stream.take(10000).foldLeft((0L, 0L)) {
      case ((chars, toks), doc) => (chars + doc.content.length, toks + doc.tokens.length)
    }
    println("Final number of characters = " + length)
    println("Final number of tokens     = " + tokens)
  }
}

In [15]:
import scala.collection.mutable.{Map => MutMap}
import com.github.aztek.porterstemmer.PorterStemmer
import scala.collection.mutable.Iterable
import java.io.File
import java.io.FileWriter

class Helper {
}

/** Shared constants and small utilities: path handling, debug printing,
  * cached stemming, timing, and writing ranking results to disk.
  */
object Helper {
  val ZIP_PATH = "/zips"
  val QRELS_PATH = "/qrels"
  val TOPIC_PATH = "/topics"

  // The single 'M' is replaced by 'l' or 't' in printToFile.
  val OUTPUT_FILE = "/output/ranking-M-tony-beltramelli.run"

  val RESULT_NUMBER = 100
  // Upper bound on the stem cache size before it is flushed.
  var TOKEN_MAX_SIZE = 100000

  val IS_DEBUG_MODE: Boolean = false

  private var _rootPath = ""

  private var _i = 0
  private var _time : Long = System.nanoTime()

  /** Prefixes `r` with the configured root path. */
  def getPath(r: String ) : String = {
    _rootPath + r
  }

  /** Sets the root path used by getPath and printToFile. */
  def setRootPath(r: String): Unit = {
    _rootPath = r
  }

  /** Prints `s` only when IS_DEBUG_MODE is enabled. */
  def debug(s: Any): Unit = {
    if (IS_DEBUG_MODE) println(s)
  }

  // lower-cased token -> stem, so each distinct token is stemmed once
  private val _stemStore : MutMap[String, String] = MutMap()

  /** Lower-cases and stems every token, memoising stems in a cache that is
    * cleared once it holds more than TOKEN_MAX_SIZE entries.
    */
  def stemTokens(list: List[String]) : List[String] =
  {
    if(_stemStore.size > TOKEN_MAX_SIZE) _stemStore.clear()

    list.map(t => t.toLowerCase()).map(v => _stemStore.getOrElseUpdate(v, PorterStemmer.stem(v)))
  }

  /** Prints the seconds elapsed since this object was initialised, tagged
    * with a running call counter.
    */
  def time: Unit = {
    println("time "+_i+" : " + (System.nanoTime() - _time) / 1000000000.0 + " seconds")
    _i += 1
  }

  /** Base-2 logarithm. */
  def log2(x: Double) = Math.log10(x) / Math.log10(2.0)

  /** Regroups (key -> list of (index, value)) as (index -> (key, value) pairs). */
  def flipDimensions(original : Iterable[(String, List[(Int, Double)])]) : Map[Int, Iterable[(String, Double)]] =
  {
    val flatten = for {
      (s, v) <- original
      (i, d) <- v
    } yield (i, s, d)

    flatten.groupBy(_._1).mapValues(_.map { case (_, s, d) => (s, d) })
  }

  /** Writes one "<topic number> <rank> <document name>" line per result.
    *
    * The parent directory is created if needed and the writer is closed even
    * when a write fails.
    *
    * @param results          topic index -> ranked (document name, score) list
    * @param topics           list indexed by topic index; the second tuple element is written as the topic number
    * @param useLanguageModel selects 'l' (true) or 't' (false) in the output file name
    */
  def printToFile(results : Map[Int, List[(String, Double)]], topics : List[(String, Int)], useLanguageModel : Boolean): Unit =
  {
    val file = new File(_rootPath + OUTPUT_FILE.replace('M', if(useLanguageModel) 'l' else 't'))
    file.getParentFile.mkdirs

    val fw = new FileWriter(file)
    try {
      for {
        (topicIndex, ranking) <- results
        ((docName, _), rank) <- ranking.zipWithIndex
      } fw.write(topics(topicIndex)._2.toString + " " + (rank + 1) + " " + docName + "\n")
    } finally {
      fw.close()
    }
  }
}

[32mimport [36mscala.collection.mutable.{Map => MutMap}[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m
[32mimport [36mscala.collection.mutable.Iterable[0m
[32mimport [36mjava.io.File[0m
[32mimport [36mjava.io.FileWriter[0m
defined [32mclass [36mHelper[0m
defined [32mobject [36mHelper[0m

In [None]:
/** Counts how often each string occurs in `w`.
  *
  * @return token -> number of occurrences; looking up a token that never
  *         occurred yields 0 rather than throwing
  */
def HashMap_get_count(w: Seq[String]): scala.collection.immutable.Map[String, Int] = {
    val noCounts = scala.collection.immutable.HashMap[String, Int]().withDefaultValue(0)
    w.foldLeft(noCounts) { (counts, token) =>
        counts.updated(token, counts(token) + 1)
    }
}

In [None]:
/** Returns a copy of `total_vocab` that keeps only entries whose count is
  * strictly greater than `min_count`; the input map is left untouched.
  */
def get_pruned_vocab(total_vocab: scala.collection.mutable.Map[String, Int], min_count: Int)={
    val copy = total_vocab.clone
    copy.retain((_, count) => count > min_count)
}

In [None]:
/** Builds the corpus vocabulary: stemmed token -> total number of
  * occurrences across every file in `file_list`.
  */
def total_vocab(file_list: Array[File])={
    val docwords = MutMap[String, Int]()
    file_list.foreach { file =>
        val tokens = new xml_doc(file.toString).token_stem
        tokens.foreach { t =>
            docwords.update(t, docwords.getOrElse(t, 0) + 1)
        }
    }
    docwords
}

In [None]:
/** Scores a predicted label set against the expected one.
  *
  * Any ratio with a zero denominator (no labels retrieved, no labels
  * expected, or precision + recall == 0) is reported as 0.0, so the result
  * never contains NaN and can be averaged safely.
  *
  * @return (f1Score, precision, recall)
  */
def assess(retrievedTopics: Set[String], expecedTopics: Set[String]) = {
    val truePositive = (retrievedTopics & expecedTopics).size
    val falsePositive = (retrievedTopics -- expecedTopics).size
    val falseNegative = (expecedTopics -- retrievedTopics).size

    // 0/0 would be NaN for doubles; define it as 0.0 instead
    def ratio(numerator: Double, denominator: Double): Double =
        if (denominator == 0.0) 0.0 else numerator / denominator

    val precision = ratio(truePositive, truePositive + falsePositive)
    val recall = ratio(truePositive, truePositive + falseNegative)
    val f1Score = ratio(2 * (precision * recall), precision + recall)

    (f1Score, precision, recall)
}

# Importing data files

In [None]:
// File listings for the three data splits.
// NOTE(review): list_xml_files is not defined in the visible cells (only list_docs is) — confirm where it comes from.
val train_list = list_xml_files("data/train")
val val_list = list_xml_files("data/validation")
val test_list = list_xml_files("data/test")

# Document maps creation

In [None]:
/** Builds the two lookup maps used for training from
  * `file_list(iter_start)` up to, but excluding, `file_list(iter_end)`.
  *
  * Tokens not contained in `pruned_vocab_set` are discarded. Progress is
  * printed after every 200 files.
  *
  * @return (document index -> (term -> tf, number of kept tokens, labels),
  *          label -> indexes of the documents carrying it)
  */
def get_doc_maps(file_list: Array[File],
               iter_start: Int = 0,
               iter_end: Int, pruned_vocab_set: collection.Set[String]) = {

    //document index -> ((term -> tfs), size, topics)
    val _documents = MutMap[Int,
                            (scala.collection.Map[String, Int],
                             Int, Set[String])]()

    //class name -> document indexes
    val _classesToDoc = MutMap[String, Set[Int]]()

    (iter_start until iter_end).foreach { position =>

        // get tokens, keeping only those in the pruned vocabulary
        val parsed = new xml_doc(file_list(position).toString)
        val kept = parsed.token_stem.filter(token => pruned_vocab_set(token))

        // get labels
        val labels = parsed.labels.toSet

        val ID = parsed.id

        _documents += ID -> ((HashMap_get_count(kept), kept.length, labels))

        // register the document under each of its labels
        labels.foreach { c =>
            val members = _classesToDoc.getOrElseUpdate(c, Set[Int]())
            _classesToDoc.update(c, members + ID)
        }

        val processed = position + 1
        if (processed % 200 == 0) {
            println(s"Current iteration: ${processed}")
        }
    }
    (_documents, _classesToDoc)
}

In [None]:
// Build the vocabulary, prune it, then derive the document/label maps for the training files.
// NOTE(review): `file_list` is not defined in the visible cells — presumably train_list was meant; confirm.
val vocab = total_vocab(file_list)
val pruned_vocab = get_pruned_vocab(vocab,1)//prune vocabulary to remove tokens with frequency 1
val documents = get_doc_maps(train_list, iter_end = train_list.length, pruned_vocab_set=pruned_vocab.keys.toSet)

In [None]:
// Unpack the pair returned by get_doc_maps:
//   documents_tf_map: document index -> ((term -> tfs), size, topics)
//   class_to_doc:     class name -> document indexes
val (documents_tf_map, class_to_doc) = documents

In [None]:
//term -> tf-idf
/** Computes, for every term, log(number of documents) minus log(summed term
  * count over all documents). Progress is printed after every 50 documents.
  *
  * @param documents_tf_map document index -> (term -> tf, size, topics)
  * @return term -> weight
  */
def get_inverseFreq(
    documents_tf_map: MutMap[Int,
                            (Map[String, Int],
                             Int, Set[String])]) = {

    val totals = MutMap[String, Double]()
    var seen = 0

    documents_tf_map.foreach { case (_, (termCounts, _, _)) =>
        termCounts.foreach { case (term, tf) =>
            totals.get(term) match {
                // accumulate onto an existing non-negative total
                case Some(sum) if sum >= 0 => totals.update(term, tf + sum)
                // first sighting (or a negative total): start from this count
                case _ => totals += term -> tf.toDouble
            }
        }
        seen += 1
        if (seen % 50 == 0) {
            println(s"Current iteration: ${seen}")
        }
    }

    totals.map { case (term, total) => term -> (Math.log(seen) - Math.log(total)) }
}

In [None]:
val inverseFreq = get_inverseFreq(documents_tf_map)

# Logistic regression

In [None]:
// helper functions

/** Logistic function applied to the dot product of features and weights. */
def sigmoid(theta: MutMap[String, Double], documentFeatures: Map[String, Double]) = {
    val activation = dot_product(documentFeatures, theta)
    1.0 / (1.0 + Math.exp(-activation))
}

/** Sparse dot product: sums vector1(k) * vector2(k) over the keys of
  * `vector1`, treating keys absent from `vector2` as 0.0.
  */
def dot_product(vector1: Map[String, Double], vector2: MutMap[String, Double]) = {
    vector1.foldLeft(0.0) { case (acc, (key, value)) =>
        acc + value * vector2.getOrElse(key, 0.0)
    }
}

/** Multiplies every value of `vector` by `scalar` (keys unchanged). */
def scalar_product(vector: Map[String, Double], scalar: Double) = {
    vector.mapValues(_ * scalar)
}

/** Multiplies every value of the mutable `vector` by `scalar`; the input map
  * itself is not modified.
  */
def scalar_product_mut(vector: MutMap[String, Double], scalar: Double) = {
    vector.mapValues(_ * scalar)
}

/** Adds `vector2` onto `vector1` key by key. The result has exactly the keys
  * of `vector1`; keys missing from `vector2` contribute 0.0.
  */
def add_map(vector1: Map[String, Double], vector2: MutMap[String, Double]) = {
    vector1.map { case (key, value) =>
        key -> (vector2.getOrElse(key, 0.0) + value)
    }
}

/** Adds `vector2` onto `vector1` key by key. The result has exactly the keys
  * of `vector1`; keys missing from `vector2` contribute 0.0.
  */
def add_map_immut(vector1: Map[String, Double], vector2: scala.collection.Map[String,Double]) = {
    vector1.map { case (key, value) =>
        key -> (vector2.getOrElse(key, 0.0) + value)
    }
}

/** Draws up to `n` distinct elements of `org_list` uniformly at random.
  *
  * The set is converted to a sequence before shuffling: shuffling a Set
  * rebuilds a Set, whose iteration order does not depend on the shuffle, so
  * `take(n)` on it would return the same elements on every call.
  */
def takeRandomN[A](n: Int, org_list: Set[A]) ={
  scala.util.Random.shuffle(org_list.toSeq).take(n).toSet
}

/** Sum of the squared values of `some_map`. */
def sum_square (some_map: MutMap[String, Double]) = {
    var total = 0.0
    for (value <- some_map.values) {
        total = total + Math.pow(value, 2)
    }
    total
}

In [None]:
/** One-vs-rest logistic regression over sparse term features.
  *
  * State is injected through the setters; one weight map is trained per label
  * with stochastic gradient steps, and `predict` returns the labels whose
  * probability exceeds a threshold.
  */
object Logistic_reg {

    // init placeholder vars
    var _classesToDoc = MutMap[String, Set[Int]]() //class name -> document indexes

    var _documents = MutMap[Int, (Map[String, Int], Int, Set[String])]() //document index -> ((term -> tfs), size, topics)

    var _inverseFreq = MutMap[String, Double]()  //term -> tf-idf

    var corp_size = 0

    var label_to_weight = MutMap[String, MutMap[String, Double]]()  // label -> (token -> weight)

    var lambda = 0.0  // regularization constant

    var step_size = 1.0  // for weights update

    // set placeholder variables
    def set_class_to_doc(value: MutMap[String, Set[Int]]) = {
        _classesToDoc = value
    }

    // also records the corpus size used to cap the sample in RandomDocuments
    def set_documents_tf_map(value: MutMap[Int, (Map[String, Int], Int, Set[String])]) = {
        _documents = value
        corp_size = value.size
    }

    def set_inverseFreq(value: MutMap[String, Double]) = {
        _inverseFreq = value
    }

    def set_lambda(value: Double) = {
        lambda = value
    }

    def set_step_size(value: Double) = {
        step_size = value
    }

    // generate prediction
    def getProb(documentFeatures: Map[String, Double],
               theta: MutMap[String, Double]) = {
        sigmoid(theta, documentFeatures)
    }

    // gradient calculates for a SINGLE LABEL
    // Returns the updated weights for the keys of documentFeatures only:
    //   theta(k) + step_size * (loss_contrib * x(k) - lambda * theta(k))
    def gradient(theta: MutMap[String, Double],
                  documentFeatures: Map[String, Double],
                  y: Boolean) : Map[String, Double] = {
        val probability = sigmoid(theta, documentFeatures)
        val loss_contrib = if(y) 1 - probability else -probability
        val grad = scalar_product(documentFeatures, loss_contrib * step_size)
        val reg_shrink = scalar_product_mut(theta, -lambda * step_size)

        add_map(
            add_map_immut(grad, reg_shrink)
            ,theta)
    }

    // train on a single training example (doc)
    def train_on_label(label: String,
                       documentFeatures: Map[String, Double],
                       y: Boolean) =  {

        // create the weight map lazily the first time a label is trained
        if (!(label_to_weight contains label)){
            label_to_weight.update(label, MutMap[String, Double]().withDefaultValue(0.5))
        }
        label_to_weight(label) ++= gradient(label_to_weight(label),
                                             documentFeatures, y)
    }

    // train on a single "topic" with a set of docs
    def _train(label: String) = {
        // Visit at most 800 candidate documents in random order. The candidates
        // are shuffled as a Seq: shuffling a Set leaves its iteration order
        // unchanged, which would make both the sample and the visiting order fixed.
        val doc_collection = scala.util.Random.shuffle(RandomDocuments(label).toSeq).take(800)

        for(docIndex <- doc_collection) {

            val doc = _documents(docIndex)
            val y = doc._3.contains(label)
            train_on_label(label, doc._1.map(f => f._1 -> _inverseFreq(f._1)), y)
        }
    }

    // trains every known label once, reporting progress every 25 labels
    def trainAll = {
        var iter = 1
        for(classToDoc <- _classesToDoc) {
            _train(classToDoc._1)
            if(iter % 25 == 0){
                println(s"Current iteration: label #${iter}")
            }
            iter += 1
        }
    }

    // Candidate training documents for a label: every document carrying the
    // label plus a random sample of the corpus three times that size (capped
    // at the corpus size).
    def RandomDocuments(trueTopic: String): Set[Int] = {
        val positives = _classesToDoc(trueTopic)
        val sample_size = math.min(positives.size * 3, corp_size)

        // sample from a Seq so the draw is actually random
        val sampled = scala.util.Random.shuffle(_documents.keys.toSeq).take(sample_size)

        positives ++ sampled
    }

    // predicting a set of classes, given a list of tokens
    def predict(tokens: Set[String], threshold: Double, cut_num: Int = 7) : Set[String]= {

        var label_to_prob = MutMap[String, Double]()

        // tokens without a strictly positive weight are dropped from the feature vector
        val documentFeatures = tokens.map(f => f -> _inverseFreq.getOrElse(f, 0.0)).filter(_._2 > 0.0).toMap

        for(classToDoc <- _classesToDoc) {
            val label = classToDoc._1
            val weights = label_to_weight(label)
            // insert a (label -> probability) entry
            label_to_prob(label) = getProb(documentFeatures,
                                           weights)
        }

        // keep labels with probability higher than threshold in a set
        label_to_prob.filter({case (k,v) => v > threshold}).  // remove labels below threshold
            toSeq.sortWith(_._2 > _._2).take(cut_num).map(x => x._1).toSet  // keep cut_num labels with max probability

    }
}

In [None]:
/** Averages F1, precision and recall of the logistic-regression predictions
  * over every file in `file_list`.
  *
  * Per-file scores are kept in a sequence so that equal scores from different
  * files all count towards the mean. An empty `file_list` yields
  * (0.0, 0.0, 0.0). Progress is printed every 25 files.
  *
  * @param threshold probability threshold passed to Logistic_reg.predict
  * @return (mean f1, mean precision, mean recall)
  */
def assess_files_logistic(file_list: Array[File],
                         threshold: Double = 0.95) = {
    val scores = file_list.toSeq.zipWithIndex.map { case (path, index) =>
        val current_doc = new xml_doc(path.toString)

        val cur_scores = assess(
            Logistic_reg.predict(current_doc.token_stem.toSet, threshold),
            current_doc.labels.toSet)

        val iter = index + 1
        if( iter % 25 == 0){
            println(s"Current iteration: #${iter}, working on file ${path.toString}")
        }
        cur_scores
    }

    def mean(values: Seq[Double]): Double =
        if (values.isEmpty) 0.0 else values.sum / values.size

    (mean(scores.map(_._1)),
     mean(scores.map(_._2)),
     mean(scores.map(_._3)))
}

In [None]:
// init map values
// Hand the label index, the per-document term counts and the term weights to the model.
Logistic_reg.set_class_to_doc(class_to_doc)
Logistic_reg.set_documents_tf_map(documents_tf_map)
Logistic_reg.set_inverseFreq(inverseFreq)

In [None]:
// lambda -> three validation scores for that lambda
var hyper_param_res = MutMap[Double, (Double, Double, Double)]()
// Candidate lambdas 10^-2 .. 10^-6: tabulate yields 10^-1 .. 10^-6 and .tail drops the first.
val hyper_param_list = List.tabulate(6)(x => Math.pow(10, -(x+1) )).tail
// Decaying step sizes 1/sqrt(k) for k = 1..5.
val step_size_list = List.range(1,6).map(x => 1 / Math.sqrt(x))

In [None]:
// Grid search over lambda: retrain from scratch for each value, running one
// pass over all labels per step size, then record the validation scores.
hyper_param_list.foreach { hyp_param =>
    // set lambda value
    Logistic_reg.set_lambda(hyp_param)
    println(s"Lambda set to $hyp_param")
    println("New training cycle starting")

    // reset weights before training
    Logistic_reg.label_to_weight =
        MutMap[String, collection.mutable.Map[String, Double]]()

    // train epochs
    step_size_list.foreach { step =>
        Logistic_reg.set_step_size(step)
        println(s"Step size set to $step")
        println()
        Logistic_reg.trainAll
    }

    hyper_param_res(hyp_param) = assess_files_logistic(val_list)
}

In [None]:
// Persist the lambda -> scores table collected by the logistic-regression grid search.
write(hyper_param_res, "logistic_reg_hyperparam.txt")

In [None]:
// train on expanded set and produce final prediction
val expanded_train_list = train_list ++ val_list
val log_reg_lambda = 0.001

Logistic_reg.set_lambda(log_reg_lambda)
// reset weights before training
Logistic_reg.label_to_weight = 
    MutMap[String, collection.mutable.Map[String, Double]]()

// train epochs
for(step <- step_size_list) {
    Logistic_reg.set_step_size(step)
    println(s"Step size set to $step")
    println()
    Logistic_reg.trainAll
}

In [None]:
// predict on test set
var log_reg_test_predict = MutMap[Int, Set[String]]()
val numPattern = "[0-9]+".r

var iter = 1

for (path <- test_list) {
    var ID = numPattern.findFirstIn(path.toString).get.toInt
    var current_doc = new xml_doc(path.toString).token_stem.toSet
    
    log_reg_test_predict(ID) = Logistic_reg.predict(current_doc, 0.95)

    if( iter % 100 == 0){
        println(s"Current iteration: #${iter}, predicting for file ${path.toString}")
    }
    iter += 1
}

In [None]:
// write predictions to file
// One line per test document: its ID followed by the predicted labels.
write_prediction(log_reg_test_predict, "ir-2016-1-project-24-lr.txt")

# SVM

In [None]:
/** One-vs-rest linear SVM over sparse term features, trained with
  * stochastic sub-gradient steps on the hinge loss.
  *
  * State is injected through the setters; `predict` returns the labels whose
  * score is strictly positive.
  */
object Svm {
    // init placeholder vars
    var _classesToDoc = MutMap[String, Set[Int]]() //class name -> document indexes

    var _documents = MutMap[Int, (Map[String, Int], Int, Set[String])]() //document index -> ((term -> tfs), size, topics)

    var _inverseFreq = MutMap[String, Double]()  //term -> tf-idf

    var corp_size = 0

    var label_to_weight = MutMap[String, MutMap[String, Double]]()  // label -> (token -> weight)

    var lambda = 0.1  // needed for projection, |theta|_2 <= 1/sqrt(lambda)

    var step_size = 1.0  // for weights update

    // set placeholder variables
    def set_class_to_doc(value: MutMap[String, Set[Int]]) = {
        _classesToDoc = value
    }

    // also records the corpus size used to cap the sample in RandomDocuments
    def set_documents_tf_map(value: MutMap[Int, (Map[String, Int], Int, Set[String])]) = {
        _documents = value
        corp_size = value.size
    }

    def set_inverseFreq(value: MutMap[String, Double]) = {
        _inverseFreq = value
    }

    def set_lambda(value: Double) = {
        lambda = value
    }

    def set_step_size(value: Double) = {
        step_size = value
    }

    // generate prediction score (dot product)
    def getScore(documentFeatures: Map[String, Double],
               theta: MutMap[String, Double]) = {
        dot_product(documentFeatures, theta)
    }

    // gradient calculates for a SINGLE LABEL
    // Returns the new weights: the shrunken theta when the example is outside
    // the margin, otherwise the shrunken theta plus the scaled example
    // (restricted to the example's keys), multiplied by the projection factor.
    def gradient(theta: MutMap[String, Double],
                  documentFeatures: Map[String, Double],
                  y: Boolean) = {

        val y_label = if(y) 1 else -1

        val theta_regular = scalar_product_mut(theta, 1 - step_size * lambda)

        val hinge = 1.0 - y_label * dot_product(documentFeatures, theta)

        if(hinge <= 0) { theta_regular }
        else {
            // NOTE(review): this factor is 1/sqrt(lambda + |theta|^2), which differs
            // from the 1/sqrt(lambda) bound mentioned next to `lambda` — confirm.
            val projection = Math.min(
                1 , (1.0 / Math.sqrt(lambda + sum_square(theta)) )
            )
            scalar_product(add_map_immut(
                scalar_product(documentFeatures, step_size * y_label),
                theta_regular),
                                  projection)
        }
    }

    // train on a single training example (doc)
    def train_on_label(label: String,
                       documentFeatures: Map[String, Double],
                       y: Boolean) =  {

        // create the weight map lazily the first time a label is trained
        if (!(label_to_weight contains label)){
            label_to_weight.update(label, MutMap[String, Double]().withDefaultValue(0.5))
        }

        // gradient may return a lazy view over the very map updated here, so
        // materialise it before writing the new weights back
        val updated = gradient(label_to_weight(label), documentFeatures, y).toMap
        label_to_weight(label) ++= updated
    }

    // train on a single "topic" with a set of docs
    def _train(label: String) = {
        // Visit at most 800 candidate documents in random order. The candidates
        // are shuffled as a Seq: shuffling a Set leaves its iteration order
        // unchanged, which would make both the sample and the visiting order fixed.
        val doc_collection = scala.util.Random.shuffle(RandomDocuments(label).toSeq).take(800)

        for(docIndex <- doc_collection) {

            val doc = _documents(docIndex)
            val y = doc._3.contains(label)

            train_on_label(label, doc._1.map(f => f._1 -> _inverseFreq(f._1)), y)

        }
    }

    // Candidate training documents for a label: every document carrying the
    // label plus a random sample of the corpus three times that size (capped
    // at the corpus size).
    def RandomDocuments(trueTopic: String): Set[Int] = {
        val positives = _classesToDoc(trueTopic)
        val sample_size = math.min(positives.size * 3, corp_size)

        // sample from a Seq so the draw is actually random
        val sampled = scala.util.Random.shuffle(_documents.keys.toSeq).take(sample_size)

        positives ++ sampled
    }

    // trains every known label once, reporting progress every 25 labels
    def trainAll = {
        var iter = 1

        for(classToDoc <- _classesToDoc)
        {
            _train(classToDoc._1)
            if( iter % 25 == 0){
                println(s"Current iteration: label #${iter}")
            }
            iter += 1
        }
    }

    // predicting a set of classes, given a set of tokens
    def predict(tokens: Set[String], cut_num: Int = 7) : Set[String]= {

        var label_to_prob = MutMap[String, Double]()

        // tokens without a strictly positive weight are dropped from the feature vector
        val documentFeatures = tokens.map(f => f -> _inverseFreq.getOrElse(f, 0.0)).filter(_._2 > 0.0).toMap

        for(classToDoc <- _classesToDoc) {
            val label = classToDoc._1
            val weights = label_to_weight(label)
            // insert a (label -> score) entry
            label_to_prob(label) = getScore(documentFeatures,
                                           weights)
        }

        // keep labels with a strictly positive score in a set
        label_to_prob.filter({case (k,v) => v > 0}).  // remove labels with non-positive score
            toSeq.sortWith(_._2 > _._2).take(cut_num).map(x => x._1).toSet  // keep cut_num labels with max score
    }
}

In [None]:
/** Evaluates the trained SVM on a list of labelled documents.
  *
  * Every file is parsed with xml_doc, its tokens are classified with Svm.predict and
  * the prediction is compared against the document's own labels with assess.
  *
  * @param file_list files to evaluate
  * @param threshold not used by this function; kept so existing calls keep compiling
  * @return the arithmetic means of the first, second and third component returned by
  *         assess (named f1, precision and recall here) over all files. For an empty
  *         file_list every component is NaN (0.0 / 0).
  */
def assess_files_svm(file_list: Array[File],
                         threshold: Double = 0.95) = {
    // The per-document scores are kept in a List. Collecting them in a Set would
    // merge documents that happen to obtain the same score and distort the averages.
    val scores = file_list.toList.zipWithIndex.map { case (path, idx) =>
        val current_doc = new xml_doc(path.toString)

        val cur_scores = assess(
            Svm.predict(current_doc.token_stem.toSet),
            current_doc.labels.toSet)

        // 1-based position of the current file, used for progress reporting only
        val iter = idx + 1
        if (iter % 25 == 0) {
            println(s"Current iteration: #${iter}, working on file ${path.toString}")
        }

        (cur_scores._1, cur_scores._2, cur_scores._3)
    }

    def mean(values: List[Double]) = values.sum / values.size

    (mean(scores.map(_._1)),
     mean(scores.map(_._2)),
     mean(scores.map(_._3)))
}

In [None]:
// init map values
// Hand the previously built lookup structures to the Svm object:
// class_to_doc, documents_tf_map and inverseFreq are defined outside this cell.
Svm.set_class_to_doc(class_to_doc)
Svm.set_documents_tf_map(documents_tf_map)
Svm.set_inverseFreq(inverseFreq)

In [None]:
// Grid search over the SVM regularisation parameter lambda.
// hyper_param_res maps each lambda to the score triple returned by assess_files_svm on val_list.
var hyper_param_res = MutMap[Double, (Double, Double, Double)]()
// candidate lambdas: 10^-2, 10^-3, ..., 10^-6
val hyper_param_list = (1 to 6).toList.map(k => Math.pow(10, -k)).tail
// one step size per training epoch: 1/sqrt(1), ..., 1/sqrt(5)
val step_size_list = (1 to 5).toList.map(k => 1 / Math.sqrt(k))

hyper_param_list.foreach { lambda =>
    Svm.set_lambda(lambda)
    println(s"Lambda set to $lambda")
    println("New training cycle starting")

    // every lambda starts from a fresh, empty weight table
    Svm.label_to_weight =
        MutMap[String, collection.mutable.Map[String, Double]]()

    // one full pass over all labels per step size
    step_size_list.foreach { stepSize =>
        Svm.set_step_size(stepSize)
        println(s"Step size set to $stepSize")
        println()
        Svm.trainAll
    }

    // evaluate the freshly trained model on the validation files
    hyper_param_res(lambda) = assess_files_svm(val_list)
}

In [None]:
// persist the lambda -> score triple map collected in hyper_param_res
write(hyper_param_res, "svm_hyperparam.txt")

In [None]:
// Final SVM fit that precedes the prediction on the test set.
// NOTE(review): expanded_train_list is only defined in this cell, never passed to Svm;
// training reads whatever maps were installed through the Svm setters - confirm that
// those maps were rebuilt from the expanded list before running this cell.
val expanded_train_list = train_list ++ val_list
// presumably the lambda selected from the hyperparameter search - TODO confirm
val svm_lambda = 0.01

Svm.set_lambda(svm_lambda)
// start from an empty weight table
Svm.label_to_weight =
    MutMap[String, collection.mutable.Map[String, Double]]()

// one full pass over all labels per step size
step_size_list.foreach { stepSize =>
    Svm.set_step_size(stepSize)
    println(s"Step size set to $stepSize")
    println()
    Svm.trainAll
}

In [None]:
// predict on test set
// svm_test_predict maps the numeric document ID to the label set predicted by the SVM
var svm_test_predict = MutMap[Int, Set[String]]()
val numPattern = "[0-9]+".r

var iter = 1

for (path <- test_list) {
    // Look for the ID in the file name only: searching the whole path would return
    // digits of a parent directory first, giving every document the same ID.
    val fileName = new File(path.toString).getName
    val ID = numPattern.findFirstIn(fileName).getOrElse(
        sys.error(s"No numeric document ID in file name: ${path.toString}")).toInt
    val current_doc = new xml_doc(path.toString).token_stem.toSet

    svm_test_predict(ID) = Svm.predict(current_doc)

    if( iter % 100 == 0){
        println(s"Current iteration: #${iter}, predicting for file ${path.toString}")
    }
    iter += 1
}

In [None]:
// write predictions to file
// svm_test_predict holds document ID -> predicted label set for the test files
write_prediction(svm_test_predict, "ir-2016-1-project-24-svm.txt")

# Naive Bayes

In [None]:
/** Naive Bayes text classifier that scores every class against its complement.
  *
  * For each class, `train` collects term counts from the documents carrying the label
  * and from documents that do not carry it. `predict` then ranks the classes by the
  * difference of the two resulting log-scores.
  *
  * @param alpha       Laplace smoothing parameter
  * @param threshold   minimum log-ratio a class must reach to be predicted
  * @param filter_size a term is kept in the statistics of a class only if its summed
  *                    count within that class is strictly greater than this value
  * @param cut         maximum number of classes returned per document
  */
class NBayes(alpha: Double, threshold: Double, filter_size: Int, cut: Int){

    // Hyperparameters and intermediate representations (created and used)
    val _alpha = alpha
    val _threshold = threshold
    val _cut = cut
    val _filter_size = filter_size

    // class -> (term -> summed term count, sum of all kept term counts of the class)
    val supermap = MutMap[String, (Map[String, Double], Double)]()
    // class -> log of the fraction of documents carrying the class
    val prior = MutMap[String,Double]()
    // Never written by this class; kept so that code reading the member still compiles.
    val c_inv = MutMap[String,Set[Int]]()
    // class -> log of the fraction of documents NOT carrying the class
    val prior_inv = MutMap[String,Double]()
    // same layout as supermap, built from the documents NOT carrying the class
    val supermap_inv = MutMap[String, (Map[String, Double], Double)]()
    var vocabulary_size = 0
    var vocabulary = Set[String]()

    /** Sums the term counts of the given documents and drops the terms whose total is
      * not strictly greater than `_filter_size`.
      *
      * @return (term -> total count, sum of all kept totals)
      */
    private def termTotals(docIds: Iterable[Int],
                           d: scala.collection.Map[Int,(scala.collection.Map[String,Int], Int, Set[String])]): (Map[String, Double], Double) = {
        val totals: Map[String, Double] = docIds.toList
            .flatMap(di => d(di)._1)
            .groupBy(_._1)
            .map { case (term, hits) => term -> hits.map(_._2).sum.toDouble }
            .filter(_._2 > _filter_size)
        (totals, totals.values.sum)
    }

    /** Trains the classifier.
      *
      * Fills `prior`/`supermap` from the documents of each class and
      * `prior_inv`/`supermap_inv` from the complement of each class, and updates
      * `vocabulary` and `vocabulary_size`.
      *
      * @param c map of class label to the set of document IDs carrying that label
      * @param d map of document ID to (term -> count, document size, set of class labels)
      */
    def train(c: scala.collection.Map[String,scala.collection.immutable.Set[Int]], d: scala.collection.Map[Int,(scala.collection.Map[String,Int], Int, Set[String])]): Unit =
    {
        for ((label, docIds) <- c)
        {
            prior += label -> Math.log(docIds.size.toDouble / d.size)
            val stats = termTotals(docIds, d)
            supermap += label -> stats
            vocabulary = vocabulary ++ stats._1.keySet
        }

        // INVERSE TRAINING
        // The complement of a class is "all documents minus the documents of the class".
        // The set of all IDs is built once, and each complement is computed once per
        // class instead of being re-derived from a lazy view on every access.
        val allDocIds = d.keys.toSet
        println("created c_inv")
        for ((label, docIds) <- c)
        {
            val otherIds = allDocIds -- docIds
            // the prior uses the full complement ...
            prior_inv += label -> Math.log(otherIds.size.toDouble / d.size)
            // ... while the term statistics use at most 10000 complement documents
            val stats = termTotals(otherIds.take(10000), d)
            supermap_inv += label -> stats
            vocabulary = vocabulary ++ stats._1.keySet
        }
        vocabulary_size = vocabulary.size
    }

    /** Laplace-smoothed log-probability of a token given the class.
      *
      * @param term the token
      * @param cl   the class label (must have been seen by `train`)
      */
    def getCondi(term: String, cl: String): Double =
    {
        val (termCounts, classMass) = supermap(cl)
        Math.log((termCounts.getOrElse(term, 0.0) + alpha) / (classMass + (alpha * vocabulary_size)))
    }

    /** Same as `getCondi`, computed on the statistics of the complement of the class.
      *
      * @param term the token
      * @param cl   the class label (must have been seen by `train`)
      */
    def getCondi_inv(term: String, cl: String): Double =
    {
        val (termCounts, classMass) = supermap_inv(cl)
        Math.log((termCounts.getOrElse(term, 0.0) + alpha) / (classMass + (alpha * vocabulary_size)))
    }

    // Two helpers used by predict.

    /** IN: bag of words, OUT: term -> number of occurrences in the bag. */
    def _getFreq(doc: List[String]) = doc.groupBy(identity).map { case (term, occurrences) => term -> occurrences.length }

    /** IN: label -> score, OUT: the (at most) `cut` highest scoring labels whose score
      * is at least `threshold`.
      */
    def _getLabels(res: MutMap[String, Double], threshold: Double, cut: Int) =
    {
       res.toSeq.filter(_._2 >= threshold).sortBy(-_._2).take(cut).map(_._1).toSet
    }

    /** Predicts the labels of a document.
      *
      * For every trained class the log-score of the document under the class and under
      * the complement of the class is computed (log-prior plus term count times the
      * conditional log-probability of each distinct term). The difference of the two is
      * passed to `_getLabels` together with the `threshold` and `cut` of this estimator.
      *
      * @param tokens the document as a bag of words
      * @return the set of predicted labels
      */
    def predict(tokens: List[String]): Set[String] =
    {
        val doc = _getFreq(tokens)

        val logRatio = MutMap[String, Double]()
        for ((label, logPrior) <- prior)
        {
            var classScore = logPrior             // numerator: document given the class
            var complementScore = prior_inv(label) // denominator: document given the complement

            for ((term, tf) <- doc)
            {
                classScore += tf * getCondi(term, label)
                complementScore += tf * getCondi_inv(term, label)
            }

            logRatio += label -> (classScore - complementScore)
        }

        _getLabels(logRatio, threshold, cut)
    }
}

In [None]:
// initiate a new estimator, and train the new estimator.
// constructor arguments in order: alpha = 0.5, threshold = 5, filter_size = 1, cut = 5
val estimator = new NBayes(0.5,5,1,5)
// train the estimator
// class_to_doc and documents_tf_map are defined outside this cell
estimator.train(class_to_doc,documents_tf_map)

In [None]:
// build the bag of words (document ID -> stemmed token list) of every test document
val bag = MutMap[Int,List[String]]()
for (path <- test_list)
{
    // Look for the ID in the file name only: searching the whole path would return
    // digits of a parent directory first, giving every document the same ID.
    val fileName = new File(path.toString).getName
    val ID = numPattern.findFirstIn(fileName).getOrElse(
        sys.error(s"No numeric document ID in file name: ${path.toString}")).toInt
    val currentdoc = new xml_doc(path.toString)
    bag(ID) = currentdoc.token_stem.toList
}

In [None]:
// compute predictions and write them to file
// A strict map is built on purpose: mapValues only wraps the source map, so every
// traversal of its result would run estimator.predict again for each document.
// NOTE(review): the SVM output file is named "...project-24..." while this one says
// "...project-26..." - confirm which project number is correct.
val pred = bag.map { case (id, tokens) => id -> estimator.predict(tokens) }
write_prediction(pred,"ir-2016-1-project-26-nb.txt")

In [None]:
// Use this cell to compute the average F1 score on the validation set (or any other labelled set)
// bag:   document ID -> stemmed token list
// truth: document ID -> labels stored in the document
val bag = MutMap[Int,List[String]]()
val truth = MutMap[Int,Set[String]]()

for (path <- val_list)
{
    // Look for the ID in the file name only: searching the whole path would return
    // digits of a parent directory first, so documents would overwrite each other.
    val fileName = new File(path.toString).getName
    val ID = numPattern.findFirstIn(fileName).getOrElse(
        sys.error(s"No numeric document ID in file name: ${path.toString}")).toInt
    val currentdoc = new xml_doc(path.toString)
    bag(ID) = currentdoc.token_stem.toList
    truth(ID) = currentdoc.labels.toSet
}

// one assess result per document; its first component is used as the F1 score
val scores = bag.keySet.toList.map(x=>assess(estimator.predict(bag(x)),truth(x)))
val f1_scores = scores.map(x=>x._1)
val avg_f1_score = f1_scores.sum/f1_scores.size