## Import modules

Set your user name as a case, pointing to your path to documents and tinyir.jar

In [1]:
// set your case once
val (doc_dir: String, files_path: String) = System.getProperties().get("user.name").toString match {
    case "Yarden-"  => ("../documents", "../")
    case "Max"  => ("/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents", "/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/")
}

[36mdoc_dir[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents"[0m
[36mfiles_path[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/"[0m

In [2]:
classpath.addPath(files_path + "tinyir-1.1.jar")



In [3]:
import scala.xml.XML
import ch.ethz.dal.tinyir._
import com.github.aztek.porterstemmer.PorterStemmer

[32mimport [36mscala.xml.XML[0m
[32mimport [36mch.ethz.dal.tinyir._[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m

In [4]:
import scala.io.Source  // for importing txt files
import java.io._  // for saving txt files
// import scala.collection.mutable.HashMap  //HashMap used for counting elements in linear time

[32mimport [36mscala.io.Source[0m
[32mimport [36mjava.io._[0m

In [5]:
// import scala.util.Random
import scala.collection.mutable.{Map => MutMap, HashMap => MutHashMap}
// enables "mutable lists"
// import scala.collection.mutable.ListBuffer  
import scala.collection.mutable.{Set => MutSet}

[32mimport [36mscala.collection.mutable.{Map => MutMap, HashMap => MutHashMap}[0m
[32mimport [36mscala.collection.mutable.{Set => MutSet}[0m

In [6]:
val timeit = new util.StopWatch

[36mtimeit[0m: [32mutil[0m.[32mStopWatch[0m = ch.ethz.dal.tinyir.util.StopWatch@4b520195

## Define classes and functions

In [7]:
def token_filter(text_body: String) = {
    processing.StopWords.filterOutSW(
        processing.Tokenizer.tokenize(text_body.
                                      replaceAll("\\P{L}+", " "))
    ).
    map(x => PorterStemmer.stem(x)).filter(_.trim.nonEmpty).toList
}

defined [32mfunction [36mtoken_filter[0m

In [8]:
class xml_doc (file_path: String) {
    def get_doc(): xml.Elem = {
        XML.loadFile(file_path: String)
    }    
    
    def text() = {
        (get_doc() \\ "DOC" \\ "TEXT").text
    }
    
    def head() = {
        (get_doc() \\ "DOC" \\ "HEAD").text
    }

    def id() = {
        (get_doc() \\ "DOC" \\ "DOCNO").text.trim
    }
    
    def tokens() = {
        token_filter(head() ++ text())
    }
    
    def hash_tokens() = {
        tokens().map(x => x.hashCode())
    }
}

defined [32mclass [36mxml_doc[0m

In [9]:
def list_docs (path: String) = {  // : Array[java.io.File]
        new java.io.File(path).listFiles.map(x => x.toString())
    }
val numPattern = "[0-9]+".r

defined [32mfunction [36mlist_docs[0m
[36mnumPattern[0m: [32mscala[0m.[32mutil[0m.[32mmatching[0m.[32mRegex[0m = [0-9]+

In [10]:
val token_hash = MutHashMap[String, Int]() // token -> hash

def create_hash_doc_subset(star_count: Int, end_count: Int,
                           file_list: Array[String],
                           token_hash_map: MutHashMap[String, Int] = token_hash) = {
    val id_htoken = MutHashMap[Int, List[Int]]() // forward index, docID to tokens
    val htoken_id = MutHashMap[Int, List[Int]]()  // inverse index, tokens to docID
    val id_name = MutHashMap[Int, String]()  // inverse index, tokens to docID
    val name_id = MutHashMap[String, Int]()  // inverse index, tokens to docID
    var counter = star_count
    while (counter < end_count){
        var cur_doc = new xml_doc(file_list(counter))
        // get token from XML, then hash, or create hashes "on the fly"
        var cur_htoken = cur_doc.tokens.map(x => token_hash_map.getOrElseUpdate(x, token_hash_map.size))
        id_htoken += counter -> cur_htoken
        
        // update the inverse mapping, from (hashed) tokens to docID
        cur_htoken.distinct.foreach(
            (token: Int) => htoken_id(token) = htoken_id.getOrElseUpdate(token, List[Int]()) ++ List(counter)
        )
        
        id_name(counter) = cur_doc.id
        name_id(cur_doc.id) = counter
        
        counter += 1
        if (counter % 100 == 0) println(s"iteration $counter")
    }
    (id_htoken, htoken_id, token_hash_map, id_name, name_id)
}

[36mtoken_hash[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()
defined [32mfunction [36mcreate_hash_doc_subset[0m

In [11]:
def write_int_to_intList(data: MutHashMap[Int, List[Int]], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        var values = data(elem).toList
//         if(values.length>0){
            bw.write(elem+" "+values.mkString(" "))
            bw.newLine
//         }    
    }   
    bw.close()
}

def write_int_string(data: MutHashMap[Int, String], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        var values = data(elem).toList
        if(values.length>0){
            bw.write(elem+" "+values.mkString(""))
            bw.newLine
        }    
    }   
    bw.close()
}

def write_string_int(data: MutHashMap[String, Int], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        bw.write(elem+" "+data(elem).toString)
        bw.newLine
    }   
    bw.close()
}

def write_int_to_int(data: MutHashMap[Int, Int], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        var value = data(elem)
            bw.write(elem+" "+value)
            bw.newLine
    }   
    bw.close()
}

defined [32mfunction [36mwrite_int_to_intList[0m
defined [32mfunction [36mwrite_int_string[0m
defined [32mfunction [36mwrite_string_int[0m
defined [32mfunction [36mwrite_int_to_int[0m

In [12]:
def load_mutmap_int_intList(path: String, mutmap: MutHashMap[Int, List[Int]]) = {
    val lines = Source.fromFile(path).getLines.toList
    for (line <- lines){
        val line_split = line.split(" ", -1).filter(_.trim.length > 0)
        mutmap(line_split.head.toInt) = 
            line_split.tail.map(x => x.toInt).toList
    }
}

def load_mutmap_int_string(path: String, mutmap: MutHashMap[Int, String]) = {
    val lines = Source.fromFile(path).getLines.toList
    for (line <- lines){
        val line_split = line.split(" ") // .filter(_.trim.length > 0)
        mutmap(line_split.head.toInt) = 
            line_split.last
    }
}

def load_mutmap_string_int(path: String, mutmap: MutHashMap[String, Int]) = {
    val lines = Source.fromFile(path).getLines.toList
    for (line <- lines){
//         val line_split = line.split(" ", -1)
        val line_split = line.split(" ") // .filter(_.trim.length > 0)
        mutmap(line_split.head) = 
            line_split.last.toInt
    }
}

def load_mutmap_int_int(path: String, mutmap: MutHashMap[Int, Int]) = {
    val lines = Source.fromFile(path).getLines.toList
    for (line <- lines){
        val line_split = line.split(" ", -1).filter(_.trim.length > 0)
        mutmap(line_split.head.toInt) = 
            line_split.last.toInt
    }
}

defined [32mfunction [36mload_mutmap_int_intList[0m
defined [32mfunction [36mload_mutmap_int_string[0m
defined [32mfunction [36mload_mutmap_string_int[0m
defined [32mfunction [36mload_mutmap_int_int[0m

In [13]:
val mb = 1024*1024
val runtime = Runtime.getRuntime
def print_memory() = {
    println(s"Used Memory:  " + (runtime.totalMemory - runtime.freeMemory) / mb)
    println(s"Free Memory:  " + runtime.freeMemory / mb)
    println(s"Total Memory: " + runtime.totalMemory / mb)
    println(s"Max Memory:   " + runtime.maxMemory / mb)
}

[36mmb[0m: [32mInt[0m = [32m1048576[0m
[36mruntime[0m: [32mRuntime[0m = java.lang.Runtime@31294559
defined [32mfunction [36mprint_memory[0m

In [14]:
val train_list = list_docs(doc_dir)

[36mtrain_list[0m: [32mArray[0m[[32mString[0m] = [33mArray[0m(
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0006"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0007"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0009"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0017"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0018"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0022"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0031"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0039"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0042"[0m,
  [32m"/Us

In [15]:
val PATH_id_htoken = files_path + "id_htoken.txt"
val PATH_htoken_id = files_path + "htoken_id.txt"
val PATH_id_name = files_path + "id_name.txt"
val PATH_name_id = files_path + "name_id.txt"
val PATH_token_hash = files_path + "token_hash.txt"

val PATH_prun_htoken_collectfreq = files_path + "prun_htoken_collectfreq.txt"
val PATH_prun_htoken_id = files_path + "prun_htoken_id.txt"
val PATH_prun_id_htoken = files_path + "prun_id_htoken.txt"

[36mPATH_id_htoken[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/id_htoken.txt"[0m
[36mPATH_htoken_id[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/htoken_id.txt"[0m
[36mPATH_id_name[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/id_name.txt"[0m
[36mPATH_name_id[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/name_id.txt"[0m
[36mPATH_token_hash[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/token_hash.txt"[0m
[36mPATH_prun_htoken_collectfreq[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/prun_htoken_collectfreq.txt"[0m
[36mPATH_prun_htoken_id[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/prun_htoken_id.txt"[0m
[

## Write Results to File

In [16]:
// use model="l" to save predictions for langauge model, i.e. t and l are required for the final submission
// but in theory can use anything for this parameter in case you want to save mulitple predictions in the same folder
// while trying out some different scoring and ranking approaches. 
def write_res(res: Map[String, List[String]],model: String="t") = {
    val file = new BufferedWriter(new FileWriter(new File("ranking-"+model+"-24.txt")))
    res.foreach{case (qId,doclist) => doclist.zipWithIndex // takes each qID, doclist pair to zip the list with an index
                .foreach{case(name,rank) => file.write(qId+" "+(rank+1)+" "+name+"\n")}} // self-explanatory
    file.close()
}

defined [32mfunction [36mwrite_res[0m

In [16]:
// Example Use (obviously need to have computed answers first)
write_res(answers) // saves answers term model predictions
write_res(answers,"l") // saves answers as language model predictions
write_res(answers,"abc") // saves answers with arbitrary name

: 

# Importing data files and creating maps
# # not run

In [None]:
// time it
timeit.start

val (id_htoken, htoken_id, token_hash, 
     id_name, name_id) = create_hash_doc_subset(0, 100000, train_list)

In [None]:
// time it
timeit.uptonow / 60.0
// 87.56585954758334 , in minutes with 6GB
// 67.88821773650001 , in minutes with 7GB

In [None]:
print_memory()

// Used Memory:  3981
// Free Memory:  1814
// Total Memory: 5796
// Max Memory:   5796

## Save to file

In [None]:
write_int_to_intList(id_htoken, PATH_id_htoken)

In [None]:
write_int_to_intList(htoken_id, PATH_htoken_id)

In [None]:
write_int_string(id_name, PATH_id_name)

In [None]:
write_string_int(name_id, PATH_name_id)

In [None]:
write_string_int(token_hash, PATH_token_hash)

## Load from file

In [None]:
// time it
timeit.start

val id_htoken: MutHashMap[Int, List[Int]] = MutHashMap[Int, List[Int]]()
val htoken_id: MutHashMap[Int, List[Int]] = MutHashMap[Int, List[Int]]()
val id_name: MutHashMap[Int, String] = MutHashMap[Int, String]()
val token_hash: MutHashMap[String, Int] = MutHashMap[String, Int]()
val name_id: MutHashMap[String, Int] = MutHashMap[String, Int]()

In [None]:
load_mutmap_int_intList(PATH_id_htoken, id_htoken)
load_mutmap_int_intList(PATH_htoken_id, htoken_id)
load_mutmap_int_string(PATH_id_name, id_name)
load_mutmap_string_int(PATH_token_hash, token_hash)
load_mutmap_string_int(PATH_name_id, name_id)

// confirm load successful
// test_load_mutmap_id_htoken == id_htoken
// test_load_mutmap_htoken_id == htoken_id
// test_load_mutmap_id_name == id_name
// test_load_mutmap_token_hash == token_hash
// test_load_mutmap_name_id == name_id

In [None]:
// time it
timeit.uptonow / 60.0
// 1.4485827969833334 , in minutes
// 1.3195113773833334 , in minutes

In [None]:
print_memory()

// Used Memory:  3468
// Free Memory:  649
// Total Memory: 4117
// Max Memory:   5461

## Prune vocabulary, collection and document frequencies

In [None]:
// htoken_id.mapValues(v => v.length).size
// 1356183
// htoken_id.mapValues(v => v.length).filter(_._2 > 5 - 1).size
// 176866
// reduction factor of ~7.67

val prun_threshold = 5
val pruned_token_set = htoken_id.mapValues(v => v.length).
    filter(_._2 > prun_threshold - 1).keys.toSet

In [None]:
// time it
timeit.start

val prun_htoken_collectfreq: MutHashMap[Int, Int] = 
    MutHashMap(
        id_htoken.flatMap{ case (k,v) => v.filter(pruned_token_set.contains(_)) }.
        groupBy(identity).mapValues(_.size)
        .toSeq:_*)

prun_htoken_collectfreq.size

timeit.uptonow / 60.0
// 7.0919255263  , in minutes
// 0.39257662956666667 , in minutes

In [None]:
// time it
timeit.start

val prun_htoken_id: MutHashMap[Int, List[Int]] = 
    MutHashMap(
        htoken_id.filterKeys(
            pruned_token_set.contains(_)
        ).toSeq:_*)

prun_htoken_id.size

timeit.uptonow / 60.0
// 0.012546176999999999 , in minutes

In [None]:
// time it
timeit.start

val prun_id_htoken: MutHashMap[Int, List[Int]] = 
    MutHashMap(
//         id_htoken.flatMap{ case (k,v) => (k, v.filter(pruned_token_set.contains(_))) }.
        id_htoken.mapValues{ v => v.filter(pruned_token_set.contains(_)) }.
        toSeq:_*)

prun_id_htoken.size

timeit.uptonow / 60.0
// 0.20076514550000002 , in minutes

## Save pruned results to file

In [None]:
write_int_to_int(prun_htoken_collectfreq, PATH_prun_htoken_collectfreq)

In [None]:
write_int_to_intList(prun_htoken_id, PATH_prun_htoken_id)

In [None]:
write_int_to_intList(prun_id_htoken, PATH_prun_id_htoken)

## Load maps (pruned)
## # start from here

In [17]:
// time it
timeit.start

val prun_htoken_collectfreq: MutHashMap[Int, Int] = MutHashMap[Int, Int]()
val prun_id_htoken: MutHashMap[Int, List[Int]] = MutHashMap[Int, List[Int]]()
val prun_htoken_id: MutHashMap[Int, List[Int]] = MutHashMap[Int, List[Int]]()
val id_name: MutHashMap[Int, String] = MutHashMap[Int, String]()
val token_hash: MutHashMap[String, Int] = MutHashMap[String, Int]()
val name_id: MutHashMap[String, Int] = MutHashMap[String, Int]()

[36mprun_htoken_collectfreq[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mInt[0m] = [33mMap[0m()
[36mprun_id_htoken[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m()
[36mprun_htoken_id[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m()
[36mid_name[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mString[0m] = [33mMap[0m()
[36mtoken_hash[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()
[36mname_id[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()

In [18]:
load_mutmap_int_int(PATH_prun_htoken_collectfreq, prun_htoken_collectfreq)
load_mutmap_int_intList(PATH_prun_id_htoken, prun_id_htoken)
load_mutmap_int_intList(PATH_prun_htoken_id, prun_htoken_id)
load_mutmap_int_string(PATH_id_name, id_name)
load_mutmap_string_int(PATH_token_hash, token_hash)
load_mutmap_string_int(PATH_name_id, name_id)



In [19]:
// time it
timeit.uptonow / 60.0
// 2.1530588158666664 , in minutes

[36mres18[0m: [32mDouble[0m = [32m0.9153833900833334[0m

In [20]:
print_memory()

// Used Memory:  2569
// Free Memory:  1782
// Total Memory: 4352
// Max Memory:   5461

Used Memory:  2549
Free Memory:  1090
Total Memory: 3641
Max Memory:   3641




# Queries & Truth

In [21]:
// requires: having added tinyir to classpath, having added the qrels, i.e. "relevance-judgements.csv" in root 
// builds truth, an object, whose only method .judgements("query-ID") returns the set of all document-IDs deemed 
// relevant to that query, note that these document-IDs are provided as List[String]
// observe that query-ID is a string of an integer between 51 and 90 -> 40 queries in total
import ch.ethz.dal.tinyir.lectures._
val truth = new TipsterGroundTruth(files_path + "/relevance-judgements.csv")

// how to use it, example:
truth.judgements("51")
// observe that the size of relevant documents varies between queries, with the minimum being 52 and the maximum 894
truth.judgements.values.map(x => x.size).min
truth.judgements.values.map(x => x.size).max

[32mimport [36mch.ethz.dal.tinyir.lectures._[0m
[36mtruth[0m: [32mlectures[0m.[32mTipsterGroundTruth[0m = ch.ethz.dal.tinyir.lectures.TipsterGroundTruth@221d8d6f
[36mres20_2[0m: [32mArray[0m[[32mString[0m] = [33mArray[0m(
  [32m"AP8803010271"[0m,
  [32m"AP8803020275"[0m,
  [32m"AP8803110301"[0m,
  [32m"AP8803160292"[0m,
  [32m"AP8803180287"[0m,
  [32m"AP8803250293"[0m,
  [32m"AP8804060267"[0m,
  [32m"AP8804070258"[0m,
  [32m"AP8804120268"[0m,
  [32m"AP8804280301"[0m,
  [32m"AP8806270045"[0m,
  [32m"AP8806270093"[0m,
  [32m"AP8806280097"[0m,
  [32m"AP8806280170"[0m,
  [32m"AP8806280310"[0m,
  [32m"AP8807060311"[0m,
  [32m"AP8807310085"[0m,
  [32m"AP8809220206"[0m,
  [32m"AP8809260235"[0m,
[33m...[0m
[36mres20_3[0m: [32mInt[0m = [32m52[0m
[36mres20_4[0m: [32mInt[0m = [32m894[0m

In [22]:
// requires: having added the file "questions-descriptions.txt" to source
// This cell will build a list (can be Stream if required) of query tokens. 
// Note that the 16 is hard-coded to ignore the first 15 characters of these <title> line, which all read 
// "<title> Topic: "
import scala.io.Source
val numPattern = "[0-9]+".r

val title = Source.fromFile(files_path +"questions-descriptions.txt").getLines().filter(_.startsWith("<title>"))
                .map(_.substring(16).trim).map(x => token_filter(x)).toList

val num = Source.fromFile(files_path +"questions-descriptions.txt").getLines().filter(_.startsWith("<num>"))   
                .map(x => numPattern.findFirstIn(x.toString).get.substring(1)).toList

val query = num zip title
query.sortBy(_._1) // the sorted order remains inherent to the object query (nice!!)

[32mimport [36mscala.io.Source[0m
[36mnumPattern[0m: [32mscala[0m.[32mutil[0m.[32mmatching[0m.[32mRegex[0m = [0-9]+
[36mtitle[0m: [32mList[0m[[32mList[0m[[32mString[0m]] = [33mList[0m(
  [33mList[0m([32m"airbu"[0m, [32m"subsidi"[0m),
  [33mList[0m([32m"south"[0m, [32m"african"[0m, [32m"sanction"[0m),
  [33mList[0m([32m"leverag"[0m, [32m"buyout"[0m),
  [33mList[0m([32m"satellit"[0m, [32m"launch"[0m, [32m"contract"[0m),
  [33mList[0m([32m"insid"[0m, [32m"trade"[0m),
  [33mList[0m([32m"prime"[0m, [32m"lend"[0m, [32m"rate"[0m, [32m"move"[0m, [32m"predict"[0m),
  [33mList[0m([32m"mci"[0m),
  [33mList[0m([32m"rail"[0m, [32m"strike"[0m),
  [33mList[0m([32m"weather"[0m, [32m"relat"[0m, [32m"fatal"[0m),
  [33mList[0m([32m"merit"[0m, [32m"pai"[0m, [32m"senior"[0m),
  [33mList[0m([32m"isra"[0m, [32m"role"[0m, [32m"iran"[0m, [32m"contra"[0m, [32m"affair"[0m),
  [33mList[0m([32m"militari"[

## Term-Frequency Model

In [119]:
// DEFINE AUXILLARY FUNCTIONS

// get inverse-document frequency (idf)
// is defined as the logarithmically scaled inverse fraction of the documents that contain the word, 
// obtained by dividing the total number of documents by the number of documents containing the term, and then 
// taking the logarithm of that quotient.

def hash_query(query: (String, List[String])) = {
    (query._1, query._2.map(x => token_hash.getOrElse(x,-1)).filter(prun_htoken_id.keys.toSet.contains(_)).toSet)
}

val corpus_size = prun_id_htoken.size
def get_idf(query: Set[Int]) = {
    query.map(x => x -> Math.log(corpus_size / prun_htoken_id(x).size)).toMap
}

// get term frequency in a specific document (doc)
def get_tf(query: Set[Int],doc: Int) = {
    prun_id_htoken(doc).filter(query.contains(_)).groupBy(identity).mapValues(_.size)
}

// get tf-idf is defined as tf-idf = tf * idf
def get_tf_idf(query: Set[Int],doc:Int) = {
    get_tf(query,doc).map(x => x._1 -> x._2 * get_idf(query).getOrElse(x._1,0.toDouble))
}

defined [32mfunction [36mhash_query[0m
[36mcorpus_size[0m: [32mInt[0m = [32m100000[0m
defined [32mfunction [36mget_idf[0m
defined [32mfunction [36mget_tf[0m
defined [32mfunction [36mget_tf_idf[0m

In [120]:
// Handle a Query --> take in a query, produce a ranking
def handle(query: (String, List[String])) = {
    val hashed_query = hash_query(query)
    val doc_set = hashed_query._2.flatMap(x => prun_htoken_id(x)).toSet
    val ranking = doc_set.map(x => x -> get_tf_idf(hashed_query._2,x).values.sum).toSeq.sortBy(-_._2)
                    .take(100).map(x => x._1).toList    
    (query._1,ranking)
}

defined [32mfunction [36mhandle[0m

In [125]:
// test it out.
hash_query(query(1))._2.flatMap(x => prun_htoken_id(x)).toSet.size
//handle(query(35))
timeit.start

timeit

[36mres124[0m: [32mInt[0m = [32m11210[0m

In [26]:
// Does it work on mass-answering queries?
// It takes nearly 8 minutes though, so about 12 seconds per query on average. 
// Potential speed improvements: Write a function that reduces document collection in the first place.
timeit.start
val answers = query.map(x => handle(x)).toMap.mapValues(_.map(x => id_name(x)))
timeit.uptonow / 60.0

[36manswers[0m: [32mMap[0m[[32mString[0m, [32mList[0m[[32mString[0m]] = [33mMap[0m(
  [32m"67"[0m -> [33mList[0m(
    [32m"FR88907-0036"[0m,
    [32m"FR88208-0014"[0m,
    [32m"FR89123-0023"[0m,
    [32m"FR89818-0017"[0m,
    [32m"FR881107-0026"[0m,
    [32m"FR88514-0001"[0m,
    [32m"FR88602-0011"[0m,
    [32m"FR88831-0149"[0m,
    [32m"FR88516-0128"[0m,
    [32m"FR88728-0019"[0m,
    [32m"FR88511-0168"[0m,
    [32m"FR891013-0117"[0m,
    [32m"FR88830-0023"[0m,
    [32m"FR88829-0023"[0m,
    [32m"FR88211-0266"[0m,
    [32m"FR881206-0016"[0m,
    [32m"FR891025-0107"[0m,
    [32m"FR881006-0001"[0m,
[33m...[0m
[36mres25_2[0m: [32mDouble[0m = [32m7.059047356016666[0m

## Evaluation

In [94]:
// The object Inspector contains all functions required to calculate the evaluation metrics (Precision, Recall, 
// F1-Score and MAP (mean average precision))

object Inspector
{
// calculates average precision for a given answer (returned result of query)
def badass1(retriev2: List[String], relev: Array[String], bounded: Boolean=false): Double ={
    val retriev = retriev2.map(_.replace("-", ""))
    // remember to remove the "-" hyphens from the prediction for comparison purposes
    (retriev.map(relev.contains(_)) // produces a boolean list with true where element belongs to relevant
        .scanLeft(0){case (sum, next) => if(next) sum + 1 else sum}.tail // creates cumulative count of the booleans
        .zipWithIndex.map(x => x._1.toDouble / (x._2 + 1)) // calculates average precision for each element
        .zip(retriev.map(relev.contains(_))) // combines average precision with the boolean list from the start
        .filter(_._2) // to filter out the ones that are not relevant 
        .map(_._1).sum // calculates the numerator (sums up the precision for all elements that are relevant)
        )/ (if (bounded) retriev.size else relev.size) // divides by numerator (depending on bounded or not)
    }

// calculates mean average precision over a set of queries. 
def badass2(retriev_all: Map[String, List[String]], relev_all: Map[String, Array[String]], 
            bounded: Boolean=false): Double = {
    (retriev_all.map(x => Inspector.badass1(x._2,relev_all(x._1),bounded)) // calculate average precision for each query
    .sum)/(retriev_all.size) // calculates mean average precision (average precision over all queries)    
}
// Classic Precision and Recall for a given query, not striclty necessary. 
def evaluate(retriev: List[String], relev: Array[String])={
    val TP = retriev.filter(relev.contains(_)).size.toDouble
    val precision = TP / retriev.size
    val recall = TP / relev.size
    (precision,recall)
}
def recall1(retriev2: List[String],relev: Array[String]): Double = {
        val retriev = retriev2.map(_.replace("-", ""))
        val TP = relev.filter(retriev.contains(_)).size.toDouble
        TP / relev.size
}

def recall2(retriev_all: Map[String, List[String]],relev_all: Map[String, Array[String]]) = {
        retriev_all.map(x => x._1 -> Inspector.recall1(x._2,relev_all(x._1)))
}

def tps1(retriev2: List[String],relev: Array[String]): Double = {
        val retriev = retriev2.map(_.replace("-", ""))
        val TP = relev.filter(retriev.contains(_)).size.toDouble
        TP 
}


def tps2(retriev_all: Map[String, List[String]],relev_all: Map[String, Array[String]]) = {
        retriev_all.map(x => x._1 -> Inspector.tps1(x._2,relev_all(x._1)))
}
}


defined [32mobject [36mInspector[0m

In [28]:
// Example Usage
val query_ID = "51"
Inspector.badass1(answers(query_ID),truth.judgements(query_ID),bounded=true)
Inspector.badass2(answers,truth.judgements,bounded=true) // answers are my predictions, truth.judgements from tinyIR


[36mquery_ID[0m: [32mString[0m = [32m"51"[0m
[36mres27_1[0m: [32mDouble[0m = [32m0.11750883431017148[0m
[36mres27_2[0m: [32mDouble[0m = [32m0.07435778472378372[0m

In [29]:
// investigate which queries cause problems.
val query_ID = "67"
truth.judgements(query_ID).size // How many relevant documents exist for that query?

// Give a list of (AP, query_ID) for some easy investigation
answers.map(x => Inspector.badass1(x._2,truth.judgements(x._1),bounded=true)).zip(answers.keys)//.filter(_._1<0.1)//.size

// Look at a particular original query:
query.toMap.get(query_ID)

[36mquery_ID[0m: [32mString[0m = [32m"67"[0m
[36mres28_1[0m: [32mInt[0m = [32m534[0m
[36mres28_2[0m: [32mcollection[0m.[32mimmutable[0m.[32mIterable[0m[([32mDouble[0m, [32mString[0m)] = [33mList[0m(
  [33m[0m([32m0.0[0m, [32m"67"[0m),
  [33m[0m([32m0.004212364150565186[0m, [32m"66"[0m),
  [33m[0m([32m0.0[0m, [32m"89"[0m),
  [33m[0m([32m0.11750883431017148[0m, [32m"51"[0m),
  [33m[0m([32m0.0[0m, [32m"84"[0m),
  [33m[0m([32m0.0[0m, [32m"73"[0m),
  [33m[0m([32m0.6886297042598777[0m, [32m"78"[0m),
  [33m[0m([32m0.03887095023193616[0m, [32m"62"[0m),
  [33m[0m([32m1.4925373134328358E-4[0m, [32m"88"[0m),
  [33m[0m([32m0.2937763328953522[0m, [32m"77"[0m),
  [33m[0m([32m0.0[0m, [32m"90"[0m),
  [33m[0m([32m1.6666666666666666E-4[0m, [32m"56"[0m),
  [33m[0m([32m0.06267791926937441[0m, [32m"55"[0m),
  [33m[0m([32m5.88235294117647E-4[0m, [32m"68"[0m),
  [33m[0m([32m0.168030891265378[0m, 

## Making Improvements

In [70]:
def investigate1(query: (String, List[String])) = {
    val noranking = hash_query(query)._2.map(x => prun_htoken_id(x)).flatten.toSet.toList
    (query._1,noranking)
}
def investigate2(query: (String, List[String])) = {
    val noranking = hash_query(query)._2.map(x => prun_htoken_id(x)).toList.flatten.groupBy(identity).filter(_._2.size  >= query._2.size)
    .keys.toList
    (query._1,noranking)
}
def investigate3(query: (String, List[String])) = {
    val noranking = hash_query(query)._2.map(x => prun_htoken_id(x)).toList.flatten.groupBy(identity).filter(_._2.size  >= (query._2.size - 1))
    .keys.toList
    (query._1,noranking)
}

defined [32mfunction [36minvestigate1[0m
defined [32mfunction [36minvestigate2[0m
defined [32mfunction [36minvestigate3[0m

In [75]:
//val answers1 = query.map(x => investigate1(x)).toMap.mapValues(_.map(x => id_name(x)))
// val answers2 = query.map(x => investigate2(x)).toMap.mapValues(_.map(x => id_name(x)))
// val answers3 = query.map(x => investigate3(x)).toMap.mapValues(_.map(x => id_name(x)))

[36manswers1[0m: [32mMap[0m[[32mString[0m, [32mList[0m[[32mString[0m]] = [33mMap[0m(
  [32m"67"[0m -> [33mList[0m(
    [32m"FR88907-0036"[0m,
    [32m"FR89809-0102"[0m,
    [32m"ZF109-507-231"[0m,
    [32m"WSJ880201-0065"[0m,
    [32m"AP880301-0135"[0m,
    [32m"AP890327-0005"[0m,
    [32m"AP890917-0007"[0m,
    [32m"AP891109-0137"[0m,
    [32m"WSJ891026-0121"[0m,
    [32m"AP890529-0063"[0m,
    [32m"DOE2-67-1249"[0m,
    [32m"WSJ870114-0056"[0m,
    [32m"WSJ881006-0139"[0m,
    [32m"AP890112-0155"[0m,
    [32m"AP891120-0004"[0m,
    [32m"WSJ920213-0022"[0m,
    [32m"AP880726-0043"[0m,
    [32m"WSJ910724-0099"[0m,
[33m...[0m

In [97]:
val rec1 = Inspector.recall2(answers1,truth.judgements) // this is taking the union
val rec2 = Inspector.recall2(answers2,truth.judgements) // this is taking the intersection
val rec3 = Inspector.recall2(answers3,truth.judgements) // this allows to miss out on at most one token from the query

[36mrec1[0m: [32mMap[0m[[32mString[0m, [32mDouble[0m] = [33mMap[0m(
  [32m"67"[0m -> [32m0.5430711610486891[0m,
  [32m"66"[0m -> [32m0.9847715736040609[0m,
  [32m"89"[0m -> [32m0.8850574712643678[0m,
  [32m"51"[0m -> [32m0.9710144927536232[0m,
  [32m"84"[0m -> [32m0.9367088607594937[0m,
  [32m"73"[0m -> [32m0.6065573770491803[0m,
  [32m"78"[0m -> [32m0.9074074074074074[0m,
  [32m"62"[0m -> [32m0.9966329966329966[0m,
  [32m"88"[0m -> [32m1.0[0m,
  [32m"77"[0m -> [32m0.6014492753623188[0m,
  [32m"90"[0m -> [32m0.9398496240601504[0m,
  [32m"56"[0m -> [32m0.9954441913439636[0m,
  [32m"55"[0m -> [32m0.971604938271605[0m,
  [32m"68"[0m -> [32m0.764102564102564[0m,
  [32m"61"[0m -> [32m1.0[0m,
  [32m"83"[0m -> [32m0.7674050632911392[0m,
  [32m"79"[0m -> [32m0.9612068965517241[0m,
  [32m"72"[0m -> [32m0.42016806722689076[0m,
  [32m"59"[0m -> [32m0.6234887737478411[0m,
[33m...[0m
[36mrec2[0m: [32mMap[0m[

In [98]:
val tp1 = Inspector.tps2(answers1,truth.judgements) // this is taking the union
val tp2 = Inspector.tps2(answers2,truth.judgements) // this is taking the intersection
val tp3 = Inspector.tps2(answers3,truth.judgements) // this allows to miss out on at most one token from the query

[36mtp1[0m: [32mMap[0m[[32mString[0m, [32mDouble[0m] = [33mMap[0m(
  [32m"67"[0m -> [32m290.0[0m,
  [32m"66"[0m -> [32m194.0[0m,
  [32m"89"[0m -> [32m154.0[0m,
  [32m"51"[0m -> [32m134.0[0m,
  [32m"84"[0m -> [32m370.0[0m,
  [32m"73"[0m -> [32m111.0[0m,
  [32m"78"[0m -> [32m147.0[0m,
  [32m"62"[0m -> [32m296.0[0m,
  [32m"88"[0m -> [32m165.0[0m,
  [32m"77"[0m -> [32m83.0[0m,
  [32m"90"[0m -> [32m250.0[0m,
  [32m"56"[0m -> [32m874.0[0m,
  [32m"55"[0m -> [32m787.0[0m,
  [32m"68"[0m -> [32m149.0[0m,
  [32m"61"[0m -> [32m206.0[0m,
  [32m"83"[0m -> [32m485.0[0m,
  [32m"79"[0m -> [32m223.0[0m,
  [32m"72"[0m -> [32m50.0[0m,
  [32m"59"[0m -> [32m361.0[0m,
[33m...[0m
[36mtp2[0m: [32mMap[0m[[32mString[0m, [32mDouble[0m] = [33mMap[0m(
  [32m"67"[0m -> [32m1.0[0m,
  [32m"66"[0m -> [32m75.0[0m,
  [32m"89"[0m -> [32m1.0[0m,
  [32m"51"[0m -> [32m82.0[0m,
  [32m"84"[0m -> [32m0.0[0m,
  

In [99]:
for (k <- rec1.keys){
    println(k,truth.judgements(k).size,rec1(k),rec3(k),rec2(k))
}

(67,534,0.5430711610486891,0.00749063670411985,0.0018726591760299626)
(66,197,0.9847715736040609,0.7461928934010152,0.38071065989847713)
(89,174,0.8850574712643678,0.08045977011494253,0.005747126436781609)
(51,138,0.9710144927536232,0.9710144927536232,0.5942028985507246)
(84,395,0.9367088607594937,0.002531645569620253,0.0)
(73,183,0.6065573770491803,0.01092896174863388,0.0)
(78,162,0.9074074074074074,0.9074074074074074,0.9074074074074074)
(62,297,0.9966329966329966,0.8552188552188552,0.020202020202020204)
(88,165,1.0,0.05454545454545454,0.0)
(77,138,0.6014492753623188,0.6014492753623188,0.6014492753623188)
(90,266,0.9398496240601504,0.15789473684210525,0.0)
(56,878,0.9954441913439636,0.38610478359908884,0.12870159453302962)
(55,810,0.971604938271605,0.971604938271605,0.9012345679012346)
(68,195,0.764102564102564,0.015384615384615385,0.005128205128205128)
(61,206,1.0,0.7330097087378641,0.34951456310679613)
(83,632,0.7674050632911392,0.24367088607594936,0.0)
(79,232,0.9612068965517241,0.



In [100]:
val th = 0.5
rec1.filter(_._2 > th).size
rec3.filter(_._2 > th).size
rec2.filter(_._2 > th).size

[36mth[0m: [32mDouble[0m = [32m0.5[0m
[36mres99_1[0m: [32mInt[0m = [32m39[0m
[36mres99_2[0m: [32mInt[0m = [32m12[0m
[36mres99_3[0m: [32mInt[0m = [32m24[0m

In [102]:
for (k <- tp1.keys){
    println(k,truth.judgements(k).size,tp1(k).toInt,tp3(k).toInt,tp2(k).toInt)
}

(67,534,290,4,1)
(66,197,194,147,75)
(89,174,154,14,1)
(51,138,134,134,82)
(84,395,370,1,0)
(73,183,111,2,0)
(78,162,147,147,147)
(62,297,296,254,6)
(88,165,165,9,0)
(77,138,83,83,83)
(90,266,250,42,0)
(56,878,874,339,113)
(55,810,787,787,730)
(68,195,149,3,1)
(61,206,206,151,72)
(83,632,485,154,0)
(79,232,223,41,0)
(72,119,50,50,9)
(59,579,361,91,5)
(87,188,142,11,0)
(76,294,279,133,56)
(54,171,168,157,51)
(65,386,372,318,170)
(71,380,331,331,116)
(57,461,429,429,429)
(80,374,363,286,66)
(82,599,521,521,367)
(60,60,57,38,8)
(69,52,51,11,1)
(58,159,157,157,137)
(64,375,361,361,72)
(53,571,539,539,394)
(75,365,304,304,304)
(70,55,54,54,11)
(86,213,212,212,130)
(81,62,47,0,0)
(63,208,155,155,65)
(74,499,277,277,60)
(52,535,531,496,374)
(85,894,815,815,480)




In [111]:
val th = 100
tp1.filter(_._2 > th).size
tp3.filter(_._2 > th).size
tp2.filter(_._2 > th).size

[36mth[0m: [32mInt[0m = [32m100[0m
[36mres110_1[0m: [32mInt[0m = [32m34[0m
[36mres110_2[0m: [32mInt[0m = [32m24[0m
[36mres110_3[0m: [32mInt[0m = [32m13[0m

In [126]:
query.map(_._2.size).foreach{
    println(_)
}

2
3
2
3
2
5
1
2
3
3
5
3
2
2
3
3
4
5
4
2
2
2
4
2
1
3
1
1
4
3
5
2
3
6
2
2
4
4
5
6




## Language model score

In [None]:
// // collection tf
val collection_size = prun_htoken_collectfreq.foldLeft(0.0)(_+_._2)
val collection_size_log = prun_htoken_collectfreq.foldLeft(0.0)(
    (res,value) => res + Math.log(1.0 + value._2.toDouble))

val pruned_token_set = prun_htoken_collectfreq.keys.toSet

// discards log scores
def unfold_name_time(score :(List[(String, Double)], Double)) = {
    (score._1.unzip._1, score._2)
}

// returns id's of docs in which the most query tokens appear in
def reduce_candidate_doc(query: (String, List[Int]), 
                         prun_htoken_id: MutHashMap[Int, List[Int]] = prun_htoken_id, 
                         take_k_results: Int = 100) = {
    
    // doc_id -> # of occurances
    val doc_occurance = query._2.flatMap(token => prun_htoken_id(token)).
        groupBy(identity).mapValues(_.size)
    
    // sorted iterator of # of occurances
    val intersect_value = doc_occurance.values.toSet.toList.sorted.reverse.toIterator
    var iter = intersect_value.next
    
    var cur_doc_occurance = doc_occurance.filter(_._2 > iter - 1)
    
    while((cur_doc_occurance.size < take_k_results) & (intersect_value.hasNext)) {
        iter = intersect_value.next
        cur_doc_occurance = doc_occurance.filter(_._2 > iter - 1)
    }
    cur_doc_occurance.keys.toList
}

val query_hash = query.map{ 
    case (id, str) => (id, str.
                       flatMap(x => token_hash.get(x)).filter(pruned_token_set.contains(_))
                      )}

val lambda = 0.01 // smoothing parameter

In [None]:
val query_hash = query.map{ 
    case (id, str) => (id, str.
                       flatMap(x => token_hash.get(x)).filter(pruned_token_set.contains(_))
                      )}

In [None]:
def lang_query(query: (String, List[Int]),
               method: String = "index", 
               log_opt: String = "tf",
               prun_htoken_collectfreq: MutHashMap[Int, Int] = prun_htoken_collectfreq,
               collection_size: Double = collection_size, 
               collection_size_log: Double = collection_size_log, 
               lambda: Double = lambda, 
               prun_id_htoken: MutHashMap[Int, List[Int]] = prun_id_htoken, 
               prun_htoken_id: MutHashMap[Int, List[Int]] = prun_htoken_id) = {
    
    // list of doc id's containing tokens in query
    def candidate_doc(): List[Int] = method match {
        case "index" => reduce_candidate_doc(query = query, take_k_results = take_k_results)
        case "no_index" => (0 to 100000-1).toList
//         case "test" => (1 to 2).toList
        case _ => throw new Exception("Please choose either 'index' or 'no_index'")
//    def candidate_doc(): List[Int] = {
// //        query._2.flatMap(token => prun_htoken_id(token)).distinct
//        List(1,2,6,14,36,37,50,65,68)
    }
        
    // map of tokens to frequency in a given doc
    def doc_tf_map(doc_id: Int) = log_opt match {
        case "tf" => prun_id_htoken(doc_id).
            groupBy(identity).mapValues(x => x.size.toDouble)
        case "log" => prun_id_htoken(doc_id).
            groupBy(identity).mapValues(x => Math.log(1.0+x.size))
        case _ => throw new Exception("Please choose either 'log' or 'tf'")
    }
    
    // number of tokens in doc
    def doc_size(doc_id: Int) = {
        doc_tf_map(doc_id).values.sum
    }

    // list of (relative) frequency of query tokens in a given doc
    def doc_query_tf(doc_id: Int) = {
        query._2.map(token => 
                     doc_tf_map(doc_id).getOrElse(token, 0.0) / doc_size(doc_id))
    }
    
    // list of (relative) frequency of query tokens in the collection
    def query_cf() = log_opt match {
        case "tf" => query._2.map(token => 
                                  prun_htoken_collectfreq(token).toDouble / collection_size)
        case "log" => query._2.map(token => 
                                  Math.log(1.0 + prun_htoken_collectfreq(token)) / collection_size_log)
    }
        
    // this only needs to be calculated once per query
    // (wasteful to call function multiple times)
    val cur_query_cf = query_cf()
    
    //
    def smooth_prob(doc_id: Int) = {
        doc_query_tf(doc_id).zip(cur_query_cf).
            map{case (x, y) => (1 - lambda) * x + lambda * y}
    }
    
    // sum log(x) elements of list
    def doc_lang_score(doc_id: Int) = {
        smooth_prob(doc_id).foldLeft(0.0)(_ + Math.log(_))
    }
    
    candidate_doc().map(doc => 
                        (query._1, doc, doc_lang_score(doc))
                       ).sortWith(_._3 > _._3)
    
}

In [None]:
// with raw log scores
val lang_model_rank_time = query_hash.map(query => 
                       (query._1, unfold_name_time(
                           lang_query(query, method = "index", log_opt = "tf"))
                       )
                                                 ).toMap

val lang_model_time = lang_model_rank_time.values.map(x => x._2)
val lang_model_time_average = average(lang_model_time)

// before reducing candidates
// Iterable[Double] = List(
//   0.3791437873666667,
//   0.5371110624833333,
//   9.883984977816668,
//   1.4138989739999999,
//   2.41249985935
// )

val lang_model_rank = lang_model_rank_time.mapValues(x => x._1)

// Example Usage
Inspector.badass2(lang_model_rank,truth.judgements,bounded=true) // answers are my predictions, truth.judgements from tinyIR

// Inspector.evaluate(lang_model_rank("51").map(_.replace("-", "")),truth.judgements("51"))

// lang_model_rank.keys.toList.map(q_id => q_id)
var precision = List[Double]()
var recall = List[Double]()

// (precision,recall)
for (key <- lang_model_rank.keys) {
//     println(key)
    var p_r = Inspector.evaluate(lang_model_rank(key).map(_.replace("-", "")),truth.judgements(key))
    precision ++= List(p_r._1)
    recall ++= List(p_r._2)
}

println(f"mean precision is ${average(precision)}%1.3f")
// average(mean_recall)

In [None]:
def lang_model_results_MAP(query_hash: List[(String, List[Int])] = query_hash, 
                           method: String, 
                           log_opt: String, 
                           lambda: Double = lambda, 
                           truth: TipsterGroundTruth = truth) = {
    
    val lang_model_rank_time = query_hash.map(query => 
                           (query._1, unfold_name_time(
                               lang_query(query, 
                                          method = method, 
                                          log_opt = log_opt, 
                                          lambda = lambda))
                           )
                                             ).toMap
    
    val lang_model_time = average(lang_model_rank_time.values.map(x => x._2))
    val lang_model_rank = lang_model_rank_time.mapValues(x => x._1)
    println(f"Average time per query is ${lang_model_time}%1.3f seconds")
    
    val MAP_score = Inspector.badass2(lang_model_rank,truth.judgements,bounded=true)
    println(f"MAP score is ${MAP_score}%1.3f")
    
    var precision = List[Double]()
    
    // (precision,recall)
    for (key <- lang_model_rank.keys) {
        var p_r = Inspector.evaluate(lang_model_rank(key).map(_.replace("-", "")),truth.judgements(key))
        precision ++= List(p_r._1)
    //     recall ++= List(p_r._2)
    }
    println(f"mean precision is ${average(precision)}%1.3f")
    
    MAP_score
}

In [None]:
lang_model_results_MAP(method = "index", 
                       log_opt = "log", 
                       lambda = 0.01)

# Testing ground

In [None]:
// Used Memory:  400
// Free Memory:  215
// Total Memory: 616
// Max Memory:   3641

In [None]:
val token_hm = MutHashMap[String, Int]()
List("word1", "word3").map(x => token_hm.getOrElseUpdate(x, token_hm.size))

In [None]:
classpath.addPath(tiny_path)

In [None]:
trait Result[T] extends Any {
    def id : Int
    def matches(that: T) : Int                 
    def isMatch(that: T) = matches(that)==0
    def matched(that: T) : T    
}

object InvertedIndex {
    // generic list intersection (does not require sorted lists)
    private def unsortedIntersect [A<% Result[A]](l1: List[A], l2: List[A]) = l1.intersect(l2)

    // optimized list intersection for sorted posting lists 
    // uses "matches" and "matched" methods to work for all posting types
    def sIntersect[A <% Result[A]] (l1: List[A], l2: List[A]) : List[A] = {
        @annotation.tailrec
        def iter (l1: List[A], l2: List[A], result: List[A]) : List[A] = {
            if (l1.isEmpty || l2.isEmpty) 
                result.reverse
            else (l1.head matches l2.head) match {
                case n if n>0 => iter(l1, l2.tail,result)  // advance list l2
                case n if n<0 => iter(l1.tail, l2,result)  // advance list l1
                case _        => iter(l1.tail, l2.tail, (l1.head matched l2.head)::result)	      
            }
        }    
        iter(l1,l2,Nil)      
    }
}

abstract class InvertedIndex[Res <% Result[Res]]  {
    def results (term: String) : List[Res] 
    def results (terms: Seq[String]) : List[Res] = {
        val resultLists      = terms.map(term => results(term))
        val shortToLongLists = resultLists.sortWith( _.length < _.length) 
        shortToLongLists.reduceLeft( (l1,l2) => InvertedIndex.sIntersect(l1,l2) )
    }
}

// import ch.ethz.dal.tinyir.indexing.InvertedIndex

In [None]:
import scala.math._

In [None]:
class Document(val id: Int, val tokens: List[Int])
//     def id: Int = this.id
//     def tokens: List[Int] = this.tokens

In [None]:
case class ProxResult(val id: Int, val lpos: Int, val rpos: Int) extends Result[ProxResult] {
    def matches(that: ProxResult) : Int = {    
        if (this.id != that.id) this.id - that.id
        else if ((max(rpos,that.rpos) - min(lpos,that.lpos)) <= ProxWindow.size) 0 // match
        else this.lpos-that.lpos  // advance in list with the minimal lpos
    }
    def matched(that: ProxResult) = 
        ProxResult(id, min(this.lpos,that.lpos), max(this.rpos,that.rpos))
}

object ProxWindow {
    var size = 1
    def setSize(w: Int) {assert(w>=1); size = w}
}

class PosIndex (docs: Stream[Document]) extends InvertedIndex[ProxResult] {

    case class PosPosting(val id: Int, val pos: Int) extends Ordered[PosPosting] {
        def this(t: PosTuple) = this(t.id, t.pos) 
//         def compare(that: PosPosting) = Ordering[Tuple2[Int, Int]].compare((this.id, this.pos), (that.id, that.pos) ) 
    }
    type PostList = List[PosPosting]
    val index : Map[String, PostList] = {
        val groupedPostings = postings(docs).groupBy(_.term)
        groupedPostings.mapValues(_.map(p => PosPosting(p.id,p.pos)).sorted)
    }
  
    case class PosTuple(term: String, id: Int, pos: Int) 
    def postings (s: Stream[Document]): List[PosTuple] =
        s.flatMap( d => d.tokens.zipWithIndex.map{ case (tk,pos) => PosTuple(tk,d.ID,pos) } ).toList

    override def results (word: String) : List[ProxResult] = 
        index.getOrElse(word,null).map(p => ProxResult(p.id, p.pos, p.pos))
    override def results (terms: Seq[String]) : List[ProxResult] = results(terms,1)
    def results (terms: Seq[String], win: Int) : List[ProxResult] = {
        val resultLists = terms.map(term => results(term))
        val shortToLongLists = resultLists.sortWith( _.length < _.length)   
        shortToLongLists.reduceLeft( (l1,l2) => InvertedIndex.sIntersect(l1,l2) )
    } 
}