## Import modules

Set your user name as a case, pointing to your path to documents and tinyir.jar

In [1]:
// set your case once
// doc_dir: location of xml documents
// files_path: all other files, including generated maps are stored / loaded from here
val (doc_dir: String, files_path: String) = System.getProperties().get("user.name").toString match {
    case "Yarden-"  => ("../documents", "../")
    case "Max"  => ("/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents", "/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/")
}

[36mdoc_dir[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents"[0m
[36mfiles_path[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/"[0m

In [2]:
classpath.addPath(files_path + "tinyir-1.1.jar")



In [3]:
import scala.xml.XML
import ch.ethz.dal.tinyir._
import ch.ethz.dal.tinyir.lectures._
import com.github.aztek.porterstemmer.PorterStemmer

[32mimport [36mscala.xml.XML[0m
[32mimport [36mch.ethz.dal.tinyir._[0m
[32mimport [36mch.ethz.dal.tinyir.lectures._[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m

In [4]:
import scala.io.Source  // for importing txt files
import java.io._  // for saving txt files

[32mimport [36mscala.io.Source[0m
[32mimport [36mjava.io._[0m

In [5]:
import scala.collection.mutable.{Map => MutMap, HashMap => MutHashMap}

[32mimport [36mscala.collection.mutable.{Map => MutMap, HashMap => MutHashMap}[0m

In [6]:
val timeit = new util.StopWatch

[36mtimeit[0m: [32mutil[0m.[32mStopWatch[0m = ch.ethz.dal.tinyir.util.StopWatch@6aaefab4

## Define classes and functions

In [7]:
def average[T]( ts: Iterable[T] )( implicit num: Numeric[T] ) = {
  num.toDouble( ts.sum ) / ts.size
}

defined [32mfunction [36maverage[0m

In [8]:
def token_filter(text_body: String) = {
    processing.StopWords.filterOutSW(
        processing.Tokenizer.tokenize(text_body.
                                      replaceAll("\\P{L}+", " "))
    ).
    map(x => PorterStemmer.stem(x)).filter(_.trim.nonEmpty).toList
}

defined [32mfunction [36mtoken_filter[0m

In [9]:
class xml_doc (file_path: String) {
    def get_doc(): xml.Elem = {
        XML.loadFile(file_path: String)
    }    
    
    def text() = {
        (get_doc() \\ "DOC" \\ "TEXT").text
    }
    
    def head() = {
        (get_doc() \\ "DOC" \\ "HEAD").text
    }

    def id() = {
        (get_doc() \\ "DOC" \\ "DOCNO").text.trim
    }
    
    def tokens() = {
        token_filter(head() ++ text())
    }
}

defined [32mclass [36mxml_doc[0m

In [10]:
def list_docs (path: String) = {
        new java.io.File(path).listFiles.map(x => x.toString())
    }
val numPattern = "[0-9]+".r

defined [32mfunction [36mlist_docs[0m
[36mnumPattern[0m: [32mscala[0m.[32mutil[0m.[32mmatching[0m.[32mRegex[0m = [0-9]+

In [11]:
// token -> hash (Int)
val token_hash = MutHashMap[String, Int]() // token -> hash

def create_hash_doc_subset(star_count: Int, end_count: Int,
                           file_list: Array[String],
                           token_hash_map: MutHashMap[String, Int] = token_hash) = {
    val id_htoken = MutHashMap[Int, List[Int]]() // forward index, docID to tokens
    val htoken_id = MutHashMap[Int, List[Int]]()  // inverse index, tokens to docID
    val id_name = MutHashMap[Int, String]()  // inverse index, tokens to docID
    val name_id = MutHashMap[String, Int]()  // inverse index, tokens to docID
    var counter = star_count
    while (counter < end_count){
        var cur_doc = new xml_doc(file_list(counter))
        // get token from XML, then hash, or create hashes "on the fly"
        var cur_htoken = cur_doc.tokens.map(x => token_hash_map.getOrElseUpdate(x, token_hash_map.size))
        id_htoken += counter -> cur_htoken
        
        // update the inverse mapping, from (hashed) tokens to docID
        cur_htoken.distinct.foreach(
            (token: Int) => htoken_id(token) = htoken_id.getOrElseUpdate(token, List[Int]()) ++ List(counter)
        )
        
        id_name(counter) = cur_doc.id
        name_id(cur_doc.id) = counter
        
        counter += 1
        if (counter % 100 == 0) println(s"iteration $counter")
    }
    (id_htoken, htoken_id, token_hash_map, id_name, name_id)
}

[36mtoken_hash[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()
defined [32mfunction [36mcreate_hash_doc_subset[0m

In [12]:
// writing to file
def write_int_to_intList(data: MutHashMap[Int, List[Int]], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        var values = data(elem).toList
//         if(values.length>0){
            bw.write(elem+" "+values.mkString(" "))
            bw.newLine
//         }    
    }   
    bw.close()
}

def write_int_string(data: MutHashMap[Int, String], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        var values = data(elem).toList
        if(values.length>0){
            bw.write(elem+" "+values.mkString(""))
            bw.newLine
        }    
    }   
    bw.close()
}

def write_string_int(data: MutHashMap[String, Int], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        bw.write(elem+" "+data(elem).toString)
        bw.newLine
    }   
    bw.close()
}

def write_int_to_int(data: MutHashMap[Int, Int], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        var value = data(elem)
            bw.write(elem+" "+value)
            bw.newLine
    }   
    bw.close()
}

def write_lang_model_search(data: List[(String, Double, Int, Double)], filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    
    data.foreach{
        case (log_opt, lambda, candidate_size, score) => bw.write(Seq(log_opt, lambda, candidate_size, score).mkString(" "))
        bw.newLine
    }
    bw.close()
}

// write results (ranking) to file
// model should be {"t", "l"}
def write_res(res: Map[String, List[String]],model: String="t") = {
    val file = new BufferedWriter(new FileWriter(new File("ranking-"+model+"-24.txt")))
    res.foreach{case (qId,doclist) => doclist.zipWithIndex // takes each qID, doclist pair to zip the list with an index
                .foreach{case(name,rank) => file.write(qId+" "+(rank+1)+" "+name+"\n")}} // self-explanatory
    file.close()
}

defined [32mfunction [36mwrite_int_to_intList[0m
defined [32mfunction [36mwrite_int_string[0m
defined [32mfunction [36mwrite_string_int[0m
defined [32mfunction [36mwrite_int_to_int[0m
defined [32mfunction [36mwrite_lang_model_search[0m
defined [32mfunction [36mwrite_res[0m

In [13]:
def load_mutmap_int_intList(path: String, mutmap: MutHashMap[Int, List[Int]]) = {
    val lines = Source.fromFile(path).getLines.toList
    for (line <- lines){
        val line_split = line.split(" ", -1).filter(_.trim.length > 0)
        mutmap(line_split.head.toInt) = 
            line_split.tail.map(x => x.toInt).toList
    }
}

def load_mutmap_int_string(path: String, mutmap: MutHashMap[Int, String]) = {
    val lines = Source.fromFile(path).getLines.toList
    for (line <- lines){
        val line_split = line.split(" ") // .filter(_.trim.length > 0)
        mutmap(line_split.head.toInt) = 
            line_split.last
    }
}

def load_mutmap_string_int(path: String, mutmap: MutHashMap[String, Int]) = {
    val lines = Source.fromFile(path).getLines.toList
    for (line <- lines){
//         val line_split = line.split(" ", -1)
        val line_split = line.split(" ") // .filter(_.trim.length > 0)
        mutmap(line_split.head) = 
            line_split.last.toInt
    }
}

def load_mutmap_int_int(path: String, mutmap: MutHashMap[Int, Int]) = {
    val lines = Source.fromFile(path).getLines.toList
    for (line <- lines){
        val line_split = line.split(" ", -1).filter(_.trim.length > 0)
        mutmap(line_split.head.toInt) = 
            line_split.last.toInt
    }
}

defined [32mfunction [36mload_mutmap_int_intList[0m
defined [32mfunction [36mload_mutmap_int_string[0m
defined [32mfunction [36mload_mutmap_string_int[0m
defined [32mfunction [36mload_mutmap_int_int[0m

In [14]:
val mb = 1024*1024
val runtime = Runtime.getRuntime
def print_memory() = {
    println(s"Used Memory:  " + (runtime.totalMemory - runtime.freeMemory) / mb)
    println(s"Free Memory:  " + runtime.freeMemory / mb)
    println(s"Total Memory: " + runtime.totalMemory / mb)
    println(s"Max Memory:   " + runtime.maxMemory / mb)
}

[36mmb[0m: [32mInt[0m = [32m1048576[0m
[36mruntime[0m: [32mRuntime[0m = java.lang.Runtime@63d4150d
defined [32mfunction [36mprint_memory[0m

In [15]:
val train_list = list_docs(doc_dir)

[36mtrain_list[0m: [32mArray[0m[[32mString[0m] = [33mArray[0m(
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0006"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0007"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0009"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0017"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0018"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0022"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0031"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0039"[0m,
  [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/documents/AP880212-0042"[0m,
  [32m"/Us

In [16]:
val PATH_id_htoken = files_path + "id_htoken.txt"
val PATH_htoken_id = files_path + "htoken_id.txt"
val PATH_id_name = files_path + "id_name.txt"
val PATH_name_id = files_path + "name_id.txt"
val PATH_token_hash = files_path + "token_hash.txt"

val PATH_prun_htoken_collectfreq = files_path + "prun_htoken_collectfreq.txt"
val PATH_prun_htoken_id = files_path + "prun_htoken_id.txt"
val PATH_prun_id_htoken = files_path + "prun_id_htoken.txt"

val PATH_lang_model_search = files_path + "lang_model_search.txt"

[36mPATH_id_htoken[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/id_htoken.txt"[0m
[36mPATH_htoken_id[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/htoken_id.txt"[0m
[36mPATH_id_name[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/id_name.txt"[0m
[36mPATH_name_id[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/name_id.txt"[0m
[36mPATH_token_hash[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/token_hash.txt"[0m
[36mPATH_prun_htoken_collectfreq[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/prun_htoken_collectfreq.txt"[0m
[36mPATH_prun_htoken_id[0m: [32mString[0m = [32m"/Users/Max/Coding/ETH/Information_Retrieval_AS16/scala_practice/files/prun_htoken_id.txt"[0m
[

# Importing data files and creating maps
# # not run

In [None]:
// time it
timeit.start

val (id_htoken, htoken_id, token_hash, 
     id_name, name_id) = create_hash_doc_subset(0, 100000, train_list)

In [None]:
// time it
timeit.uptonow / 60.0
// 87.56585954758334 , in minutes with 6GB

In [None]:
print_memory()

// Used Memory:  3981
// Free Memory:  1814
// Total Memory: 5796
// Max Memory:   5796

## Save to file

In [None]:
write_int_to_intList(id_htoken, PATH_id_htoken)

In [None]:
write_int_to_intList(htoken_id, PATH_htoken_id)

In [None]:
write_int_string(id_name, PATH_id_name)

In [None]:
write_string_int(name_id, PATH_name_id)

In [None]:
write_string_int(token_hash, PATH_token_hash)

## Load from file

In [None]:
// time it
timeit.start

val id_htoken: MutHashMap[Int, List[Int]] = MutHashMap[Int, List[Int]]()
val htoken_id: MutHashMap[Int, List[Int]] = MutHashMap[Int, List[Int]]()
val id_name: MutHashMap[Int, String] = MutHashMap[Int, String]()
val token_hash: MutHashMap[String, Int] = MutHashMap[String, Int]()
val name_id: MutHashMap[String, Int] = MutHashMap[String, Int]()

In [None]:
load_mutmap_int_intList(PATH_id_htoken, id_htoken)
load_mutmap_int_intList(PATH_htoken_id, htoken_id)
load_mutmap_int_string(PATH_id_name, id_name)
load_mutmap_string_int(PATH_token_hash, token_hash)
load_mutmap_string_int(PATH_name_id, name_id)

In [None]:
// time it
timeit.uptonow / 60.0
// 1.4485827969833334 , in minutes
// 1.3195113773833334 , in minutes

In [None]:
print_memory()

// Used Memory:  3468
// Free Memory:  649
// Total Memory: 4117
// Max Memory:   5461

## Prune vocabulary, collection and document frequencies

In [None]:
// htoken_id.mapValues(v => v.length).size
// 1356183
// htoken_id.mapValues(v => v.length).filter(_._2 > 5 - 1).size
// 176866
// reduction factor of ~7.67

val prun_threshold = 5
// prun tokens that have document freq >= 5
val pruned_token_set = htoken_id.mapValues(v => v.length).
    filter(_._2 > prun_threshold - 1).keys.toSet

In [None]:
// time it
timeit.start

val prun_htoken_collectfreq: MutHashMap[Int, Int] = 
    MutHashMap(
        id_htoken.flatMap{ case (k,v) => v.filter(pruned_token_set.contains(_)) }.
        groupBy(identity).mapValues(_.size)
        .toSeq:_*)

prun_htoken_collectfreq.size

timeit.uptonow / 60.0
// 7.0919255263  , in minutes
// 0.39257662956666667 , in minutes

In [None]:
// time it
timeit.start

val prun_htoken_id: MutHashMap[Int, List[Int]] = 
    MutHashMap(
        htoken_id.filterKeys(
            pruned_token_set.contains(_)
        ).toSeq:_*)

prun_htoken_id.size

timeit.uptonow / 60.0
// 0.012546176999999999 , in minutes

In [None]:
// time it
timeit.start

val prun_id_htoken: MutHashMap[Int, List[Int]] = 
    MutHashMap(
//         id_htoken.flatMap{ case (k,v) => (k, v.filter(pruned_token_set.contains(_))) }.
        id_htoken.mapValues{ v => v.filter(pruned_token_set.contains(_)) }.
        toSeq:_*)

prun_id_htoken.size

timeit.uptonow / 60.0
// 0.20076514550000002 , in minutes

## Save pruned results to file

In [None]:
write_int_to_int(prun_htoken_collectfreq, PATH_prun_htoken_collectfreq)

In [None]:
write_int_to_intList(prun_htoken_id, PATH_prun_htoken_id)

In [None]:
write_int_to_intList(prun_id_htoken, PATH_prun_id_htoken)

## Load maps (pruned)
## # start from here

In [17]:
// time it
timeit.start

val prun_htoken_collectfreq: MutHashMap[Int, Int] = MutHashMap[Int, Int]()
val prun_id_htoken: MutHashMap[Int, List[Int]] = MutHashMap[Int, List[Int]]()
val prun_htoken_id: MutHashMap[Int, List[Int]] = MutHashMap[Int, List[Int]]()
val id_name: MutHashMap[Int, String] = MutHashMap[Int, String]()
val token_hash: MutHashMap[String, Int] = MutHashMap[String, Int]()
val name_id: MutHashMap[String, Int] = MutHashMap[String, Int]()

[36mprun_htoken_collectfreq[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mInt[0m] = [33mMap[0m()
[36mprun_id_htoken[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m()
[36mprun_htoken_id[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m()
[36mid_name[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mString[0m] = [33mMap[0m()
[36mtoken_hash[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()
[36mname_id[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()

In [18]:
load_mutmap_int_int(PATH_prun_htoken_collectfreq, prun_htoken_collectfreq)
load_mutmap_int_intList(PATH_prun_id_htoken, prun_id_htoken)
load_mutmap_int_intList(PATH_prun_htoken_id, prun_htoken_id)
load_mutmap_int_string(PATH_id_name, id_name)
load_mutmap_string_int(PATH_token_hash, token_hash)
load_mutmap_string_int(PATH_name_id, name_id)



In [19]:
// time it
timeit.uptonow / 60.0
// 2.1530588158666664 , in minutes

[36mres18[0m: [32mDouble[0m = [32m1.0760687918333334[0m

In [20]:
print_memory()

// Used Memory:  2569
// Free Memory:  1782
// Total Memory: 4352
// Max Memory:   5461

Used Memory:  2939
Free Memory:  701
Total Memory: 3641
Max Memory:   3641




# Queries & Evaluation

In [21]:
// requires: having added tinyir to classpath, having added the qrels, i.e. "relevance-judgements.csv" in root 
// builds truth, an object, whose only method .judgements("query-ID") returns the set of all document-IDs deemed 
// relevant to that query, note that these document-IDs are provided as List[String]
// observe that query-ID is a string of an integer between 51 and 90 -> 40 queries in total
val truth = new TipsterGroundTruth(files_path + "/relevance-judgements.csv")

// how to use it, example:
truth.judgements("51")
// observe that the size of relevant documents varies between queries, with the minimum being 52 and the maximum 894
truth.judgements.values.map(x => x.size).min
truth.judgements.values.map(x => x.size).max

[36mtruth[0m: [32mTipsterGroundTruth[0m = ch.ethz.dal.tinyir.lectures.TipsterGroundTruth@7850977d
[36mres20_1[0m: [32mArray[0m[[32mString[0m] = [33mArray[0m(
  [32m"AP8803010271"[0m,
  [32m"AP8803020275"[0m,
  [32m"AP8803110301"[0m,
  [32m"AP8803160292"[0m,
  [32m"AP8803180287"[0m,
  [32m"AP8803250293"[0m,
  [32m"AP8804060267"[0m,
  [32m"AP8804070258"[0m,
  [32m"AP8804120268"[0m,
  [32m"AP8804280301"[0m,
  [32m"AP8806270045"[0m,
  [32m"AP8806270093"[0m,
  [32m"AP8806280097"[0m,
  [32m"AP8806280170"[0m,
  [32m"AP8806280310"[0m,
  [32m"AP8807060311"[0m,
  [32m"AP8807310085"[0m,
  [32m"AP8809220206"[0m,
  [32m"AP8809260235"[0m,
[33m...[0m
[36mres20_2[0m: [32mInt[0m = [32m52[0m
[36mres20_3[0m: [32mInt[0m = [32m894[0m

In [22]:
// requires: having added the file "questions-descriptions.txt" to source
// This cell will build a list (can be Stream if required) of query tokens. 
// Note that the 16 is hard-coded to ignore the first 15 characters of these <title> line, which all read 
// "<title> Topic: "

val title = Source.fromFile(files_path +"questions-descriptions.txt").getLines().filter(_.startsWith("<title>"))
                .map(_.substring(16).trim).map(x => token_filter(x)).toList

val num = Source.fromFile(files_path +"questions-descriptions.txt").getLines().filter(_.startsWith("<num>"))   
                .map(x => numPattern.findFirstIn(x.toString).get.substring(1)).toList

val query = num zip title
query.sortBy(_._1) // the sorted order remains inherent to the object query (nice!!)

[36mtitle[0m: [32mList[0m[[32mList[0m[[32mString[0m]] = [33mList[0m(
  [33mList[0m([32m"airbu"[0m, [32m"subsidi"[0m),
  [33mList[0m([32m"south"[0m, [32m"african"[0m, [32m"sanction"[0m),
  [33mList[0m([32m"leverag"[0m, [32m"buyout"[0m),
  [33mList[0m([32m"satellit"[0m, [32m"launch"[0m, [32m"contract"[0m),
  [33mList[0m([32m"insid"[0m, [32m"trade"[0m),
  [33mList[0m([32m"prime"[0m, [32m"lend"[0m, [32m"rate"[0m, [32m"move"[0m, [32m"predict"[0m),
  [33mList[0m([32m"mci"[0m),
  [33mList[0m([32m"rail"[0m, [32m"strike"[0m),
  [33mList[0m([32m"weather"[0m, [32m"relat"[0m, [32m"fatal"[0m),
  [33mList[0m([32m"merit"[0m, [32m"pai"[0m, [32m"senior"[0m),
  [33mList[0m([32m"isra"[0m, [32m"role"[0m, [32m"iran"[0m, [32m"contra"[0m, [32m"affair"[0m),
  [33mList[0m([32m"militari"[0m, [32m"coup"[0m, [32m"etat"[0m),
  [33mList[0m([32m"machin"[0m, [32m"translat"[0m),
  [33mList[0m([32m"hostag"[0m

In [23]:
// Load the test queries. 
val test_title = Source.fromFile(files_path +"test-questions.txt").getLines().filter(_.startsWith("<title>"))
                .map(_.substring(16).trim).map(x => token_filter(x)).toList

val test_num = Source.fromFile(files_path +"test-questions.txt").getLines().filter(_.startsWith("<num>"))   
                .map(x => numPattern.findFirstIn(x.toString).get.substring(1)).toList

val test_query = test_num zip test_title
test_query.sortBy(_._1) // the sorted order remains inherent to the object query (nice!!)

[36mtest_title[0m: [32mList[0m[[32mList[0m[[32mString[0m]] = [33mList[0m(
  [33mList[0m([32m"armi"[0m, [32m"acquisit"[0m, [32m"advanc"[0m, [32m"weapon"[0m, [32m"system"[0m),
  [33mList[0m([32m"nternat"[0m, [32m"militari"[0m, [32m"equip"[0m, [32m"sale"[0m),
  [33mList[0m([32m"hat"[0m, [32m"back"[0m, [32m"nation"[0m, [32m"rifl"[0m, [32m"associ"[0m),
  [33mList[0m([32m"omput"[0m, [32m"aid"[0m, [32m"crime"[0m),
  [33mList[0m([32m"omput"[0m, [32m"aid"[0m, [32m"crime"[0m, [32m"detect"[0m),
  [33mList[0m([32m"omput"[0m, [32m"aid"[0m, [32m"medic"[0m, [32m"diagnosi"[0m),
  [33mList[0m([32m"iber"[0m, [32m"optic"[0m, [32m"applic"[0m),
  [33mList[0m([32m"iber"[0m, [32m"optic"[0m, [32m"equip"[0m, [32m"manufactur"[0m),
  [33mList[0m([32m"ran"[0m, [32m"contra"[0m, [32m"affair"[0m),
  [33mList[0m([32m"ontrol"[0m, [32m"transfer"[0m, [32m"high"[0m, [32m"technologi"[0m)
)
[36mtest_num[0m: [32mLi

In [24]:
// The object Inspector contains all functions required to calculate the evaluation metrics (Precision, Recall, 
// F1-Score and MAP (mean average precision))

object Inspector
{
// calculates average precision for a given answer (returned result of query)
def badass1(retriev2: List[String], relev: Array[String], bounded: Boolean=false): Double ={
    val retriev = retriev2.map(_.replace("-", ""))
    // remember to remove the "-" hyphens from the prediction for comparison purposes
    (retriev.map(relev.contains(_)) // produces a boolean list with true where element belongs to relevant
        .scanLeft(0){case (sum, next) => if(next) sum + 1 else sum}.tail // creates cumulative count of the booleans
        .zipWithIndex.map(x => x._1.toDouble / (x._2 + 1)) // calculates average precision for each element
        .zip(retriev.map(relev.contains(_))) // combines average precision with the boolean list from the start
        .filter(_._2) // to filter out the ones that are not relevant 
        .map(_._1).sum // calculates the numerator (sums up the precision for all elements that are relevant)
        )/ (if (bounded) retriev.size else relev.size) // divides by numerator (depending on bounded or not)
    }

// calculates mean average precision over a set of queries. 
def badass2(retriev_all: Map[String, List[String]], relev_all: Map[String, Array[String]], 
            bounded: Boolean=false): Double = {
    (retriev_all.map(x => Inspector.badass1(x._2,relev_all(x._1),bounded)) // calculate average precision for each query
    .sum)/(retriev_all.size) // calculates mean average precision (average precision over all queries)    
}
// Classic Precision and Recall for a given query, not striclty necessary. 
def evaluate(retriev: List[String], relev: Array[String])={
    val TP = retriev.filter(relev.contains(_)).size.toDouble
    val precision = TP / retriev.size
    val recall = TP / relev.size
    (precision,recall)
}
def recall1(retriev2: List[String],relev: Array[String]): Double = {
        val retriev = retriev2.map(_.replace("-", ""))
        val TP = relev.filter(retriev.contains(_)).size.toDouble
        TP / relev.size
}

def recall2(retriev_all: Map[String, List[String]],relev_all: Map[String, Array[String]]) = {
        retriev_all.map(x => Inspector.recall1(x._2,relev_all(x._1)))
}
}

defined [32mobject [36mInspector[0m

## Term-Frequency Model

In [25]:
// Define auxillary functions for the term-frequency model


def hash_query(query: (String, List[String])) = {
    (query._1, query._2.map(x => token_hash.getOrElse(x,-1)).filter(prun_htoken_id.keys.toSet.contains(_)).toSet)
}

val corpus_size = prun_id_htoken.size
def get_idf(query: Set[Int]) = {
    query.map(x => x -> Math.log(corpus_size / prun_htoken_id(x).size)).toMap
}

// get term frequency in a specific document (doc)
def get_tf(query: Set[Int],doc: Int) = {
    prun_id_htoken(doc).filter(query.contains(_)).groupBy(identity).mapValues(_.size)
}

// get log_tf
def get_log_tf(query: Set[Int],doc: Int) = {
    prun_id_htoken(doc).filter(query.contains(_)).groupBy(identity).mapValues(x => Math.log(x.size))
}

// get tf-idf is defined as tf-idf = tf * idf
def get_tf_idf(query: Set[Int],doc:Int) = {
    get_tf(query,doc).map(x => x._1 -> x._2 * get_idf(query).getOrElse(x._1,0.toDouble))
}

val average_doc_size = (prun_id_htoken.mapValues(x => x.size).foldLeft(0)(_+_._2).toDouble / prun_id_htoken.size)

def get_tf_okapi(query: Set[Int],doc: Int,k: Double=1.2,b: Double=0.75) = {
    val dsize = prun_id_htoken(doc).size
    val avdsize = average_doc_size
    get_tf(query,doc).mapValues(x => (x * (k+1)) / (x + k*(1 - b + (b*(dsize/avdsize)))))
}

def get_idf_okapi(query: Set[Int]) = {
    query.map(x => x -> Math.log((corpus_size - prun_htoken_id(x).size + 0.5)/(prun_htoken_id(x).size+0.5))).toMap
}

defined [32mfunction [36mhash_query[0m
[36mcorpus_size[0m: [32mInt[0m = [32m100000[0m
defined [32mfunction [36mget_idf[0m
defined [32mfunction [36mget_tf[0m
defined [32mfunction [36mget_log_tf[0m
defined [32mfunction [36mget_tf_idf[0m
[36maverage_doc_size[0m: [32mDouble[0m = [32m413.69858[0m
defined [32mfunction [36mget_tf_okapi[0m
defined [32mfunction [36mget_idf_okapi[0m

In [26]:
// Simple tf-idf model
// Handle a Query --> take in a query, produce a ranking
def handle(query: (String, List[String])) = {
    val hashed_query = hash_query(query)
    val doc_set = hashed_query._2.map(x => prun_htoken_id(x)).flatten.toSet
    val ranking = doc_set.map(x => x -> get_tf_idf(hashed_query._2,x).values.sum).toSeq.sortBy(-_._2)
                    .take(100).map(x => x._1).toList    
    (query._1,ranking)
}

defined [32mfunction [36mhandle[0m

In [27]:
// Okapi BM25 model 
// https://en.wikipedia.org/wiki/Okapi_BM25
// Handle a Query --> take in a query, produce a ranking
def handle_okapi(query: (String, List[String]),k: Double=1.2,b: Double=0.75) = {
    val hashed_query = hash_query(query)
    val idf = get_idf_okapi(hashed_query._2)
    val doc_set = hashed_query._2.map(x => prun_htoken_id(x)).flatten.toSet
    
    val ranking = doc_set.map(x => x -> (get_tf_okapi(hashed_query._2,x,k,b).map(x => x._1 -> x._2 * idf(x._1))).values.sum)
                    .toSeq.sortBy(-_._2)
                    .take(100).map(x => x._1).toList    
    (query._1,ranking)
}

// This is a modification of that function to faciliate the comparison of retrieval time when an inverted index is 
// used versus no inverted index is used // the results this model returns are the same obviuosly. 
def handle_okapi_no_index(query: (String, List[String]),k: Double=1.2,b: Double=0.75) = {
    val hashed_query = hash_query(query)
    val idf = get_idf_okapi(hashed_query._2)
    val doc_set = prun_id_htoken.keySet
    
    val ranking = doc_set.map(x => x -> (get_tf_okapi(hashed_query._2,x,k,b).map(x => x._1 -> x._2 * idf(x._1))).values.sum)
                    .toSeq.sortBy(-_._2)
                    .take(100).map(x => x._1).toList    
    (query._1,ranking)
}

defined [32mfunction [36mhandle_okapi[0m
defined [32mfunction [36mhandle_okapi_no_index[0m

In [28]:
// Consider the training set performance
timeit.start
val answers_train = query.map(x => handle_okapi(x)).toMap.mapValues(_.map(x => id_name(x)))

println("TF: The average time per query on the training set is: " + ((timeit.uptonow / 60.0)/answers_train.size))
println("TF: The bounded MAP on the training set is: " + Inspector.badass2(answers_train,truth.judgements,bounded=true))

// TF: The average time per query on the training set is: 0.07255479411458333
// TF: The bounded MAP on the training set is: 0.3212110318053528

TF: The average time per query on the training set is: 0.07255479411458333
TF: The bounded MAP on the training set is: 0.3212110318053528


[36manswers_train[0m: [32mMap[0m[[32mString[0m, [32mList[0m[[32mString[0m]] = [33mMap[0m(
  [32m"67"[0m -> [33mList[0m(
    [32m"WSJ880727-0154"[0m,
    [32m"WSJ870908-0085"[0m,
    [32m"AP891013-0071"[0m,
    [32m"WSJ911014-0122"[0m,
    [32m"WSJ891101-0074"[0m,
    [32m"AP880607-0214"[0m,
    [32m"AP900116-0109"[0m,
    [32m"AP881220-0009"[0m,
    [32m"WSJ880413-0114"[0m,
    [32m"WSJ880517-0073"[0m,
    [32m"AP880427-0078"[0m,
    [32m"WSJ890802-0116"[0m,
    [32m"AP890531-0190"[0m,
    [32m"AP881220-0046"[0m,
    [32m"WSJ880425-0077"[0m,
    [32m"AP880614-0123"[0m,
    [32m"WSJ900518-0117"[0m,
    [32m"WSJ861212-0091"[0m,
[33m...[0m

## Language model score

In [None]:
// // collection tf
val collection_size = prun_htoken_collectfreq.foldLeft(0.0)(_+_._2)
val collection_size_log = prun_htoken_collectfreq.foldLeft(0.0)(
    (res,value) => res + Math.log(1.0 + value._2.toDouble))

val pruned_token_set = prun_htoken_collectfreq.keys.toSet

// discards log scores (leaves ranked ID's and time)
def unfold_name_time(score :(List[(String, Double)], Double)) = {
    (score._1.unzip._1, score._2)
}

// returns id's of docs in which the most query tokens appear in
def reduce_candidate_doc(query: (String, List[Int]), 
                         prun_htoken_id: MutHashMap[Int, List[Int]] = prun_htoken_id, 
                         candidate_size: Int = 100) = {
    
    // doc_id -> # of occurances
    val doc_occurance = query._2.flatMap(token => prun_htoken_id(token)).
        groupBy(identity).mapValues(_.size)
    
    // sorted iterator of # of occurances
    val intersect_value = doc_occurance.values.toSet.toList.sorted.reverse.toIterator
    var iter = intersect_value.next
    
    var cur_doc_occurance = doc_occurance.filter(_._2 > iter - 1)
    
    // looks for minimal set that satisfies minimal set size
    while((cur_doc_occurance.size < candidate_size) & (intersect_value.hasNext)) {
        iter = intersect_value.next
        cur_doc_occurance = doc_occurance.filter(_._2 > iter - 1)
    }
    cur_doc_occurance.keys.toList
}

// (id, List(hashes_tokens))
val query_hash = query.map{ 
    case (id, str) => (id, str.
                       flatMap(x => token_hash.get(x)).filter(pruned_token_set.contains(_))
                      )}

val lambda = 0.01 // smoothing parameter

// set of all non-empty documents (i.e. that contain tokens)
val non_empty_id = prun_id_htoken.filter(_._2.size > 0).keys.toList

In [None]:
// operate on a single query
def lang_query(query: (String, List[Int]),
               method: String = "index", 
               log_opt: String = "tf",
               prun_htoken_collectfreq: MutHashMap[Int, Int] = prun_htoken_collectfreq,
               collection_size: Double = collection_size, 
               collection_size_log: Double = collection_size_log, 
               lambda: Double = lambda, 
               prun_id_htoken: MutHashMap[Int, List[Int]] = prun_id_htoken, 
               prun_htoken_id: MutHashMap[Int, List[Int]] = prun_htoken_id, 
               candidate_size: Int = 100, 
               take_k_results: Int = 100) = {
    
    // list of doc id's containing tokens in query
    def candidate_doc(): List[Int] = method match {
        case "index" => reduce_candidate_doc(query = query, candidate_size = candidate_size)
        case "no_index" => non_empty_id
//         case "test" => (1 to 2).toList
        case _ => throw new Exception("Please choose either 'index' or 'no_index'")
    }
        
    // map of tokens to frequency in a given doc
    def doc_tf_map(doc_id: Int) = log_opt match {
        case "tf" => prun_id_htoken(doc_id).
            groupBy(identity).mapValues(x => x.size.toDouble)
        case "log" => prun_id_htoken(doc_id).
            groupBy(identity).mapValues(x => Math.log(1.0+x.size))
        case _ => throw new Exception("Please choose either 'log' or 'tf'")
    }
    
    // number of tokens in doc
    def doc_size(doc_id: Int) = {
        doc_tf_map(doc_id).values.sum
    }

    // list of (relative) frequency of query tokens in a given doc
    def doc_query_tf(doc_id: Int) = {
        query._2.map(token => 
                     doc_tf_map(doc_id).getOrElse(token, 0.0) / doc_size(doc_id))
    }
    
    // list of (relative) frequency of query tokens in the collection
    def query_cf() = log_opt match {
        case "tf" => query._2.map(token => 
                                  prun_htoken_collectfreq(token).toDouble / collection_size)
        case "log" => query._2.map(token => 
                                  Math.log(1.0 + prun_htoken_collectfreq(token)) / collection_size_log)
    }
        
    // this only needs to be calculated once per query
    // (wasteful to call function multiple times)
    val cur_query_cf = query_cf()
    
    //
    def smooth_prob(doc_id: Int) = {
        doc_query_tf(doc_id).zip(cur_query_cf).
            map{case (x, y) => (1 - lambda) * x + lambda * y}
    }
    
    // sum log(x) elements of list
    def doc_lang_score(doc_id: Int) = {
        smooth_prob(doc_id).foldLeft(0.0)(_ + Math.log(_))
    }
    
    val log_scores = candidate_doc().map(doc => 
                                         (id_name(doc), doc_lang_score(doc))
                                         ).sortWith(_._2 > _._2)
    
    (log_scores.take(take_k_results), timeit.uptonow / 60.0)
    
}

In [None]:
// applies lang_query to all queries in the "training set"
// reports MAP and precision scores
def lang_model_results_MAP(query_hash: List[(String, List[Int])] = query_hash, 
                           method: String, 
                           log_opt: String, 
                           candidate_size: Int = 100, 
                           lambda: Double = lambda, 
                           truth: TipsterGroundTruth = truth) = {
    
    val lang_model_rank_time = query_hash.map(query => 
                           (query._1, unfold_name_time(
                               lang_query(query, 
                                          method = method, 
                                          log_opt = log_opt, 
                                          candidate_size = candidate_size, 
                                          lambda = lambda))
                           )
                                             ).toMap
    
    val lang_model_time = average(lang_model_rank_time.values.map(x => x._2))
    val lang_model_rank = lang_model_rank_time.mapValues(x => x._1)
    println(f"Average time per query is ${lang_model_time}%1.3f minutes")
    
    val MAP_score = Inspector.badass2(lang_model_rank,truth.judgements,bounded=true)
    println(f"MAP score is ${MAP_score}%1.3f")
    
    var precision = List[Double]()
    
    // (precision,recall)
    for (key <- lang_model_rank.keys) {
        var p_r = Inspector.evaluate(lang_model_rank(key).map(_.replace("-", "")),truth.judgements(key))
        precision ++= List(p_r._1)
    //     recall ++= List(p_r._2)
    }
    println(f"mean precision is ${average(precision)}%1.3f")
    
    MAP_score
}

In [None]:
// use example, with configurations chosen (after running the hyper parameter search)
lang_model_results_MAP(method = "index", 
                       log_opt = "tf", 
                       candidate_size = 1000, 
                       lambda = 0.1)

In [None]:
// searching through hyper-parameters and model configurations
var lang_model_search = List[(String, Double, Int, Double)]() // method, lambda, candidate_size, MAP_score

for (opt_iter <- Seq("log", "tf")){
    for (candidate_size_iter <- Seq(100, 1000)){
        for (lambda_iter <- (0 to 4).map(x => Math.pow(10, -x))){
            println(f"applying $opt_iter method with candidate size of $candidate_size_iter, lambda is set to ${lambda_iter}")
            lang_model_search ++= List((opt_iter, 
                                        lambda_iter, 
                                        candidate_size_iter, 
                                        lang_model_results_MAP(query_hash = query_hash, 
                                                               method = "index", 
                                                               log_opt = opt_iter, 
                                                               candidate_size = candidate_size_iter, 
                                                               lambda = lambda_iter)
                                        ))
        }
    }
}

// write results to file
write_lang_model_search(lang_model_search, PATH_lang_model_search)

# Rank test queries

In [None]:
// load test queries
val PATH_test_questions = "test-questions.txt"
val test_title = Source.fromFile(files_path + PATH_test_questions).getLines().filter(_.startsWith("<title>"))
                .map(_.substring(16).trim).map(x => token_filter(x)).toList

val test_num = Source.fromFile(files_path + PATH_test_questions).getLines().filter(_.startsWith("<num>"))   
                .map(x => numPattern.findFirstIn(x.toString).get.substring(1)).toList

val test_query = test_num zip test_title

In [35]:
// tf model ranking

// compare running time with and w/o indexing

// w indexing
timeit.start
val answers_test = test_query.map(x => handle_okapi(x)).toMap.mapValues(_.map(x => id_name(x)))
println("TF: Average time per query w indexing is: " + ((timeit.uptonow / 60.0)/answers_test.size))

// w/o indexing 
timeit.start
val answers_test_no_index = test_query.map(x => handle_okapi_no_index(x)).toMap.mapValues(_.map(x => id_name(x)))
println("TF: Average time per query w/o indexing is: " + ((timeit.uptonow / 60.0)/answers_test_no_index.size))

// TF: Average time per query w indexing is: 0.14082328266833335
// TF: Average time per query w indexing is: 0.19855570774

// TF: Average time per query w/o indexing is: 0.21297428135666668
// TF: Average time per query w/o indexing is: 0.29179896611333334


TF: Average time per query w indexing is: 0.14082328266833335
TF: Average time per query w/o indexing is: 0.21297428135666668


[36manswers_test[0m: [32mMap[0m[[32mString[0m, [32mList[0m[[32mString[0m]] = [33mMap[0m(
  [32m"98"[0m -> [33mList[0m(
    [32m"ZF109-452-641"[0m,
    [32m"DOE1-25-0952"[0m,
    [32m"ZF109-720-277"[0m,
    [32m"ZF207-519-494"[0m,
    [32m"ZF108-104-486"[0m,
    [32m"FR88915-0002"[0m,
    [32m"WSJ910528-0183"[0m,
    [32m"WSJ880610-0075"[0m,
    [32m"ZF108-719-392"[0m,
    [32m"DOE2-48-0109"[0m,
    [32m"FR89501-0011"[0m,
    [32m"WSJ910715-0071"[0m,
    [32m"WSJ890920-0035"[0m,
    [32m"DOE1-23-0542"[0m,
    [32m"ZF207-488-516"[0m,
    [32m"WSJ870304-0091"[0m,
    [32m"WSJ880607-0144"[0m,
    [32m"ZF207-827-034"[0m,
[33m...[0m
[36manswers_test_no_index[0m: [32mMap[0m[[32mString[0m, [32mList[0m[[32mString[0m]] = [33mMap[0m(
  [32m"98"[0m -> [33mList[0m(
    [32m"ZF109-452-641"[0m,
    [32m"DOE1-25-0952"[0m,
    [32m"ZF109-720-277"[0m,
    [32m"ZF207-519-494"[0m,
    [32m"ZF108-104-486"[0m,
    [32m"FR88915-

In [36]:
// saving to file
// Produce test-set ranking for tf-model
write_res(answers_test)



In [None]:
val test_query_hash = test_query.map{ 
    case (id, str) => (id, str.
                       flatMap(x => token_hash.get(x)).filter(pruned_token_set.contains(_))
                      )}

In [None]:
// language model ranking
// setting optimal config based on search
// compare running time with and w/o indexing
val lm_test_INDEX_rank_time = test_query_hash.map(query => 
                                                  (query._1, unfold_name_time(
                                                      lang_query(query, 
                                                                 method = "index", 
                                                                 log_opt = "tf", 
                                                                 candidate_size = 1000, 
                                                                 lambda = 0.1))
                                                  )
                                                 ).toMap

val lm_test_INDEX_time = lm_test_INDEX_rank_time.values.map(x => x._2)
val lm_test_INDEX_rank = lm_test_INDEX_rank_time.mapValues(x => x._1)
println(f"Average time per query is ${average(lm_test_INDEX_time)}%1.3f seconds")

// w/o indexing
val lm_test_NOINDEX_rank_time = test_query_hash.map(query => 
                                                  (query._1, unfold_name_time(
                                                      lang_query(query, 
                                                                 method = "no_index", 
                                                                 log_opt = "tf", 
                                                                 candidate_size = 1000, 
                                                                 lambda = 0.1))
                                                  )
                                                 ).toMap

val lm_test_NOINDEX_time = lm_test_NOINDEX_rank_time.values.map(x => x._2)
val lm_test_NOINDEX_rank = lm_test_NOINDEX_rank_time.mapValues(x => x._1)
println(f"Average time per query is ${average(lm_test_NOINDEX_time)}%1.3f seconds")

// Average time per query is 14.816 seconds
// Average time per query is 24.109 seconds

In [None]:
// saving to file
write_res(lm_test_INDEX_rank,"l")