## Import modules

Add a `case` for your user name in the cell below. It must return the path to your documents directory and the directory prefix (ending in `/`) that contains `tinyir-1.1.jar`.

In [1]:
// Per-user configuration: add a case for your own user name once.
//   doc_dir    - directory containing the XML documents
//   files_path - directory prefix (ending in "/"); later cells append file names
//                such as "tinyir-1.1.jar" and "id_htoken.txt" to it
val (doc_dir: String, files_path: String) = System.getProperties().get("user.name").toString match {
    case "Yarden-"  => ("../documents", "../")
    // files_path must be a directory prefix: the classpath cell appends "tinyir-1.1.jar" itself
    case "Max"  => ("../MAXPATH", "../MAXPATH/")
    // fail with an actionable message instead of a bare MatchError
    case other => sys.error(s"No paths configured for user '$other'; add a case for this user name")
}

[36mdoc_dir[0m: [32mString[0m = [32m"../documents"[0m
[36mfiles_path[0m: [32mString[0m = [32m"../"[0m

In [2]:
// Register the tinyir jar (expected directly under files_path) with the kernel's classpath helper.
classpath.addPath(files_path + "tinyir-1.1.jar")



In [3]:
import scala.xml.XML
import ch.ethz.dal.tinyir._
import com.github.aztek.porterstemmer.PorterStemmer

[32mimport [36mscala.xml.XML[0m
[32mimport [36mch.ethz.dal.tinyir._[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m

In [4]:
import scala.io.Source  // for importing txt files
import java.io._  // for saving txt files
// import scala.collection.mutable.HashMap  //HashMap used for counting elements in linear time

[32mimport [36mscala.io.Source[0m
[32mimport [36mjava.io._[0m

In [5]:
// import scala.util.Random
import scala.collection.mutable.{Map => MutMap, HashMap => MutHashMap}
// enables "mutable lists"
// import scala.collection.mutable.ListBuffer  
import scala.collection.mutable.{Set => MutSet}

[32mimport [36mscala.collection.mutable.{Map => MutMap, HashMap => MutHashMap}[0m
[32mimport [36mscala.collection.mutable.{Set => MutSet}[0m

## Define classes and functions

In [6]:
/** Turns raw text into a list of stemmed tokens.
  *
  * Steps, in order: every run of non-letter characters is replaced by one space,
  * the result goes through tinyir's `Tokenizer.tokenize` and `StopWords.filterOutSW`,
  * each surviving token is passed to `PorterStemmer.stem`, and tokens that are
  * blank after stemming are dropped.
  *
  * @param text_body raw text to tokenize
  * @return the processed tokens as a List
  */
def token_filter(text_body: String) = {
    val lettersOnly = text_body.replaceAll("\\P{L}+", " ")
    val rawTokens = processing.Tokenizer.tokenize(lettersOnly)
    val withoutStopWords = processing.StopWords.filterOutSW(rawTokens)
    val stemmed = withoutStopWords.map(token => PorterStemmer.stem(token))
    stemmed.filter(_.trim.nonEmpty).toList
}

defined [32mfunction [36mtoken_filter[0m

In [7]:
/** Accessors for one XML document file on disk.
  *
  * The file is parsed lazily on first access and the parsed tree is reused by
  * every accessor, so reading id, head, text and tokens of the same document
  * costs a single parse instead of one parse per call.
  *
  * @param file_path path of the XML file to load
  */
class xml_doc (file_path: String) {
    // Parsed once on first use; XML.loadFile throws if the file is missing or malformed.
    private lazy val parsed: xml.Elem = XML.loadFile(file_path)

    /** The parsed XML tree of the file. */
    def get_doc(): xml.Elem = parsed

    // NodeSeq.text concatenates the text of all matched nodes with no separator,
    // which fuses the last word of one element with the first word of the next.
    // Joining with a space keeps them apart; it is harmless for tokenization,
    // which splits on non-letters anyway.
    private def joined_text(tag: String): String =
        (get_doc() \\ "DOC" \\ tag).map(_.text).mkString(" ")

    /** Text of all TEXT elements under DOC, space-separated. */
    def text() = {
        joined_text("TEXT")
    }

    /** Text of all HEAD elements under DOC, space-separated. */
    def head() = {
        joined_text("HEAD")
    }

    /** Trimmed text of the DOCNO element(s) under DOC. */
    def id() = {
        (get_doc() \\ "DOC" \\ "DOCNO").text.trim
    }

    /** Stemmed, stop-word-filtered tokens of the headline followed by the body. */
    def tokens() = {
        // explicit space so the headline's last word cannot merge with the body's first
        token_filter(head() + " " + text())
    }

    /** String.hashCode of every token, in token order. */
    def hash_tokens() = {
        tokens().map(x => x.hashCode())
    }
}

defined [32mclass [36mxml_doc[0m

In [8]:
/** Lists the entries of directory `path` as path strings, sorted lexicographically.
  *
  * `File.listFiles` returns null when `path` does not exist or is not a directory
  * and makes no ordering guarantee. An empty array is returned in the first case,
  * and the result is sorted so that positions in the array (used as document IDs
  * by later cells) are reproducible across runs and platforms.
  *
  * @param path directory to list
  * @return sorted path strings, empty if the directory cannot be listed
  */
def list_docs (path: String) = {  // : Array[String]
    val entries = Option(new java.io.File(path).listFiles).getOrElse(Array.empty[java.io.File])
    entries.map(x => x.toString()).sorted
}
// Regex matching one or more ASCII digits; not referenced by the cells visible here.
val numPattern = "[0-9]+".r

defined [32mfunction [36mlist_docs[0m
[36mnumPattern[0m: [32mscala[0m.[32mutil[0m.[32mmatching[0m.[32mRegex[0m = [0-9]+

In [39]:
/** Writes an Int -> List[Int] map as text, one entry per line: `key v1 v2 ...`.
  *
  * Entries whose list is empty are skipped. Line order follows the map's
  * iteration order. The file is overwritten if it exists.
  *
  * @param data     map to persist
  * @param filename path of the output file
  */
def write_int_to_intList(data: MutHashMap[Int, List[Int]], filename: String) = {
    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    // try/finally so the file handle is released even when a write fails
    try {
        for ((key, values) <- data if values.nonEmpty) {
            bw.write(s"$key ${values.mkString(" ")}")
            bw.newLine()
        }
    } finally {
        bw.close()
    }
}

/** Writes an Int -> String map as text, one entry per line: `key value`.
  *
  * Entries whose value is the empty string are skipped. Line order follows the
  * map's iteration order. The file is overwritten if it exists.
  *
  * @param data     map to persist
  * @param filename path of the output file
  */
def write_int_string(data: MutHashMap[Int, String], filename: String) = {
    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    // try/finally so the file handle is released even when a write fails
    try {
        for ((key, value) <- data if value.nonEmpty) {
            bw.write(s"$key $value")
            bw.newLine()
        }
    } finally {
        bw.close()
    }
}

/** Writes a String -> Int map as text, one entry per line: `key value`.
  *
  * Line order follows the map's iteration order. The file is overwritten if it
  * exists.
  *
  * @param data     map to persist
  * @param filename path of the output file
  */
def write_string_int(data: MutHashMap[String, Int], filename: String) = {
    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    // try/finally so the file handle is released even when a write fails
    try {
        for ((key, value) <- data) {
            bw.write(s"$key $value")
            bw.newLine()
        }
    } finally {
        bw.close()
    }
}

defined [32mfunction [36mwrite_int_to_intList[0m
defined [32mfunction [36mwrite_int_string[0m
defined [32mfunction [36mwrite_string_int[0m

# Importing data files

In [10]:
// Paths of all entries in the documents directory; later cells use a file's position
// in this array as its document ID.
val train_list = list_docs(doc_dir)

[36mtrain_list[0m: [32mArray[0m[[32mString[0m] = [33mArray[0m(
  [32m"""
..\documents\AP880212-0006
  """[0m,
  [32m"""
..\documents\AP880212-0007
  """[0m,
  [32m"""
..\documents\AP880212-0009
  """[0m,
  [32m"""
..\documents\AP880212-0017
  """[0m,
  [32m"""
..\documents\AP880212-0018
  """[0m,
  [32m"""
..\documents\AP880212-0022
  """[0m,
  [32m"""
[33m...[0m

In [11]:
// Shared token dictionary, initially empty. It is the default `token_hash_map` argument of
// create_hash_doc_subset, which assigns each new token the map's current size as its ID.
val token_hash = MutHashMap[String, Int]() // token -> hash

// List("word1", "word3").map(x => token_hm.getOrElseUpdate(x, token_hm.size))

[36mtoken_hash[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()

In [12]:
/** Indexes the files `file_list(star_count)` up to (excluding) `file_list(end_count)`.
  *
  * The position of a file in `file_list` is its document ID. Each token is mapped
  * to an integer ID through `token_hash_map`; an unseen token receives the map's
  * current size as its ID, and the map is updated in place.
  *
  * @param star_count     index of the first file to process (inclusive)
  * @param end_count      index after the last file to process; clamped to `file_list.length`
  * @param file_list      paths of the XML document files
  * @param token_hash_map token -> token ID dictionary, extended in place
  * @return (forward index docID -> token IDs,
  *          inverted index token ID -> ascending docIDs,
  *          the token dictionary that was passed in,
  *          docID -> DOCNO, DOCNO -> docID)
  */
def create_hash_doc_subset(star_count: Int, end_count: Int,
                           file_list: Array[String],
                           token_hash_map: MutHashMap[String, Int] = token_hash) = {
    val id_htoken = MutHashMap[Int, List[Int]]() // forward index: docID -> token IDs in document order
    val htoken_id = MutHashMap[Int, List[Int]]() // inverted index: token ID -> docIDs containing it
    val id_name = MutHashMap[Int, String]()      // docID -> DOCNO
    val name_id = MutHashMap[String, Int]()      // DOCNO -> docID
    // never read past the end of file_list when end_count exceeds the number of files
    val last = math.min(end_count, file_list.length)
    var counter = star_count
    while (counter < last){
        val cur_doc = new xml_doc(file_list(counter))
        // map every token to its ID, creating IDs "on the fly" for unseen tokens
        val cur_htoken = cur_doc.tokens.map(x => token_hash_map.getOrElseUpdate(x, token_hash_map.size))
        id_htoken += counter -> cur_htoken

        // update the inverted index; prepending is O(1) whereas appending copies the
        // whole posting list each time. The lists are reversed once after the loop.
        cur_htoken.distinct.foreach(
            (token: Int) => htoken_id(token) = counter :: htoken_id.getOrElse(token, Nil)
        )

        // read the DOCNO once and reuse it for both directions
        val doc_name = cur_doc.id
        id_name(counter) = doc_name
        name_id(doc_name) = counter

        counter += 1
        if (counter % 100 == 0) println(s"iteration $counter")
    }
    // restore ascending docID order in every posting list
    for (token <- htoken_id.keys.toList) htoken_id(token) = htoken_id(token).reverse
    (id_htoken, htoken_id, token_hash_map, id_name, name_id)
}

defined [32mfunction [36mcreate_hash_doc_subset[0m

In [13]:
// Index files 0 until 10000 of train_list. The third result is the token dictionary the
// function was given (by default the token_hash defined earlier), so token_hash is rebound
// here to that same, now populated, map.
val (id_htoken, htoken_id, token_hash, 
     id_name, name_id) = create_hash_doc_subset(0, 10000, train_list)

iteration 100
iteration 200
iteration 300
iteration 400
iteration 500
iteration 600
iteration 700
iteration 800
iteration 900
iteration 1000
iteration 1100
iteration 1200
iteration 1300
iteration 1400
iteration 1500
iteration 1600
iteration 1700
iteration 1800
iteration 1900
iteration 2000
iteration 2100
iteration 2200
iteration 2300
iteration 2400
iteration 2500
iteration 2600
iteration 2700
iteration 2800
iteration 2900
iteration 3000
iteration 3100
iteration 3200
iteration 3300
iteration 3400
iteration 3500
iteration 3600
iteration 3700
iteration 3800
iteration 3900
iteration 4000
iteration 4100
iteration 4200
iteration 4300
iteration 4400
iteration 4500
iteration 4600
iteration 4700
iteration 4800
iteration 4900
iteration 5000
iteration 5100
iteration 5200
iteration 5300
iteration 5400
iteration 5500
iteration 5600
iteration 5700
iteration 5800
iteration 5900
iteration 6000
iteration 6100
iteration 6200
iteration 6300
iteration 6400
iteration 6500
iteration 6600
iteration 6700
iter

[36mid_htoken[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m(
  [32m6938[0m -> [33mList[0m(
    [32m1552[0m,
    [32m576[0m,
    [32m4841[0m,
    [32m30596[0m,
    [32m5265[0m,
    [32m324[0m,
    [32m432[0m,
    [32m519[0m,
    [32m8376[0m,
    [32m159573[0m,
    [32m144[0m,
    [32m2954[0m,
    [32m1071[0m,
    [32m1498[0m,
    [32m25[0m,
    [32m725[0m,
    [32m2431[0m,
    [32m297[0m,
[33m...[0m
[36mhtoken_id[0m: [32mcollection[0m.[32mmutable[0m.[32mHashMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m(
  [32m67471[0m -> [33mList[0m([32m2155[0m),
  [32m41623[0m -> [33mList[0m([32m1098[0m, [32m2642[0m, [32m4156[0m, [32m6968[0m, [32m7596[0m, [32m7653[0m),
  [32m178569[0m -> [33mList[0m([32m8070[0m),
  [32m12572[0m -> [33mList[0m(
    [32m205[0m,
    [32m496[0m,
    [32m527[0m,
    [32m991[0m,
    [32m1128[0m,

## Save to file

In [42]:
// Output locations: every index file is written directly under files_path.
val PATH_id_htoken = files_path + "id_htoken.txt"
val PATH_htoken_id = files_path + "htoken_id.txt"
val PATH_id_name = files_path + "id_name.txt"
val PATH_name_id = files_path + "name_id.txt"
val PATH_token_hash = files_path + "token_hash.txt"

// Persist the forward and inverted indexes, then the two name maps and the token dictionary.
write_int_to_intList(id_htoken, PATH_id_htoken)
write_int_to_intList(htoken_id, PATH_htoken_id)
write_int_string(id_name, PATH_id_name)
write_string_int(name_id, PATH_name_id)
write_string_int(token_hash, PATH_token_hash)

[36mPATH_id_htoken[0m: [32mString[0m = [32m"../id_htoken.txt"[0m
[36mPATH_htoken_id[0m: [32mString[0m = [32m"../htoken_id.txt"[0m
[36mPATH_id_name[0m: [32mString[0m = [32m"../id_name.txt"[0m
[36mPATH_name_id[0m: [32mString[0m = [32m"../name_id.txt"[0m
[36mPATH_token_hash[0m: [32mString[0m = [32m"../token_hash.txt"[0m

## Load from file

In [43]:
/** Reads lines of the form `key v1 v2 ...` into `mutmap` (key -> List(v1, v2, ...)).
  *
  * This is the inverse of write_int_to_intList. Existing entries with the same key
  * are overwritten. Throws NumberFormatException on a field that is not an Int.
  *
  * @param path   file to read
  * @param mutmap map that receives the entries
  */
def load_mutmap_int_intList(path: String, mutmap: MutMap[Int, List[Int]]) = {
    val source = Source.fromFile(path)
    // try/finally so the file handle is released even when a line fails to parse
    try {
        for (line <- source.getLines()){
            val fields = line.split(" ", -1)
            mutmap(fields.head.toInt) = fields.tail.map(x => x.toInt).toList
        }
    } finally {
        source.close()
    }
}

/** Reads lines of the form `key value` into `mutmap` (Int key -> String value).
  *
  * This is the inverse of write_int_string. The line is split at the first space
  * only, so a value that itself contains spaces is restored in full. Existing
  * entries with the same key are overwritten. Throws NumberFormatException when
  * the key is not an Int.
  *
  * @param path   file to read
  * @param mutmap map that receives the entries
  */
def load_mutmap_int_string(path: String, mutmap: MutMap[Int, String]) = {
    val source = Source.fromFile(path)
    // try/finally so the file handle is released even when a line fails to parse
    try {
        for (line <- source.getLines()){
            // limit 2: everything after the first space belongs to the value
            val fields = line.split(" ", 2)
            mutmap(fields.head.toInt) = fields.last
        }
    } finally {
        source.close()
    }
}

/** Reads lines of the form `key value` into `mutmap` (String key -> Int value).
  *
  * This is the inverse of write_string_int. The line is split at the last space,
  * so a key that itself contains spaces is restored in full. Existing entries with
  * the same key are overwritten. Throws NumberFormatException when the value is
  * not an Int.
  *
  * @param path   file to read
  * @param mutmap map that receives the entries
  */
def load_mutmap_string_int(path: String, mutmap: MutMap[String, Int]) = {
    val source = Source.fromFile(path)
    // try/finally so the file handle is released even when a line fails to parse
    try {
        for (line <- source.getLines()){
            val cut = line.lastIndexOf(' ')
            // a line without any space is used as both key and value, as before
            val key = if (cut < 0) line else line.substring(0, cut)
            mutmap(key) = line.substring(cut + 1).toInt
        }
    } finally {
        source.close()
    }
}

defined [32mfunction [36mload_mutmap_int_intList[0m
defined [32mfunction [36mload_mutmap_int_string[0m
defined [32mfunction [36mload_mutmap_string_int[0m

In [44]:
// Empty target maps, one per saved file, to be filled by the load_mutmap_* functions
// for a save/load round-trip check.
val test_load_mutmap_id_htoken: MutMap[Int, List[Int]] = MutMap[Int, List[Int]]()
val test_load_mutmap_htoken_id: MutMap[Int, List[Int]] = MutMap[Int, List[Int]]()
val test_load_mutmap_id_name: MutMap[Int, String] = MutMap[Int, String]()
val test_load_mutmap_token_hash: MutMap[String, Int] = MutMap[String, Int]()
val test_load_mutmap_name_id: MutMap[String, Int] = MutMap[String, Int]()

[36mtest_load_mutmap_id_htoken[0m: [32mcollection[0m.[32mmutable[0m.[32mMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m()
[36mtest_load_mutmap_htoken_id[0m: [32mcollection[0m.[32mmutable[0m.[32mMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m()
[36mtest_load_mutmap_id_name[0m: [32mcollection[0m.[32mmutable[0m.[32mMap[0m[[32mInt[0m, [32mString[0m] = [33mMap[0m()
[36mtest_load_mutmap_token_hash[0m: [32mcollection[0m.[32mmutable[0m.[32mMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()
[36mtest_load_mutmap_name_id[0m: [32mcollection[0m.[32mmutable[0m.[32mMap[0m[[32mString[0m, [32mInt[0m] = [33mMap[0m()

In [45]:
// Read every saved file back into its empty target map.
load_mutmap_int_intList(PATH_id_htoken, test_load_mutmap_id_htoken)
load_mutmap_int_intList(PATH_htoken_id, test_load_mutmap_htoken_id)
load_mutmap_int_string(PATH_id_name, test_load_mutmap_id_name)
load_mutmap_string_int(PATH_token_hash, test_load_mutmap_token_hash)
load_mutmap_string_int(PATH_name_id, test_load_mutmap_name_id)
// confirm load successful
// (each comparison evaluates to true when the loaded map equals the in-memory original)
test_load_mutmap_id_htoken == id_htoken
test_load_mutmap_htoken_id == htoken_id
test_load_mutmap_id_name == id_name
test_load_mutmap_token_hash == token_hash
test_load_mutmap_name_id == name_id

[36mres44_5[0m: [32mBoolean[0m = [32mtrue[0m
[36mres44_6[0m: [32mBoolean[0m = [32mtrue[0m
[36mres44_7[0m: [32mBoolean[0m = [32mtrue[0m
[36mres44_8[0m: [32mBoolean[0m = [32mtrue[0m
[36mres44_9[0m: [32mBoolean[0m = [32mtrue[0m

In [None]:
// Scratch check of List.distinct: duplicates are removed and first-occurrence order is kept.
List(1,1,4,5,6).distinct

In [None]:
/** Tokenizes the files `file_list(star_count)` up to (excluding) `file_list(end_count)`
  * and wraps each one in a tinyir StringDocument.
  *
  * The document's ID is the file's position in `file_list`; its content is the
  * document's tokens joined by single spaces. Progress is printed every 50 files.
  *
  * @param star_count index of the first file to process (inclusive)
  * @param end_count  index after the last file to process (exclusive)
  * @param file_list  paths of the XML document files
  * @return the collected documents as a Stream, in the mutable set's iteration order
  */
def create_doc_subset(star_count: Int, end_count: Int,
                     file_list: Array[String]) = {
    val doc_set = MutSet[processing.StringDocument]()
    for (doc_index <- star_count until end_count) {
        val parsed = new xml_doc(file_list(doc_index))
        val content = parsed.tokens.mkString(" ")
        doc_set += new processing.StringDocument(doc_index, content)
        // number of files handled so far, counted from index 0 as in the progress message
        val counter = doc_index + 1
        if (counter % 50 == 0) println(s"iteration $counter")
    }
    doc_set.toStream
}

In [None]:
// Tokenize files 0 until 50000 of train_list into a stream of StringDocuments.
val doc_sub_stream = create_doc_subset(0, 50000, train_list)

In [None]:
// Build a tinyir PosIndex over the 50000-document stream.
val test_PosIndex = new indexing.PosIndex(doc_sub_stream)

In [None]:
// start at 17:20

In [None]:
// Batch 1 of 5: files 0 until 20000.
val doc_sub_stream_1 = create_doc_subset(0, 20000, train_list)

In [None]:
// Batch 2 of 5: files 20000 until 40000.
val doc_sub_stream_2 = create_doc_subset(20000, 40000, train_list)

In [None]:
// Batch 3 of 5: files 40000 until 60000.
val doc_sub_stream_3 = create_doc_subset(40000, 60000, train_list)

In [None]:
// Batch 4 of 5: files 60000 until 80000.
val doc_sub_stream_4 = create_doc_subset(60000, 80000, train_list)

In [None]:
// Batch 5 of 5: files 80000 until 100000.
val doc_sub_stream_5 = create_doc_subset(80000, 100000, train_list)

In [None]:
// started at 10:30
// finished at 11:51
// Total Memory: 2701 

In [None]:
// Build one PosIndex over the concatenation of the five batch streams.
val test_PosIndex = new indexing.PosIndex(doc_sub_stream_1 ++ 
                                          doc_sub_stream_2 ++ 
                                          doc_sub_stream_3 ++ 
                                          doc_sub_stream_4 ++ 
                                          doc_sub_stream_5)

In [None]:




// Small-scale trial: wrap the first 100 files as StringDocuments whose ID is a running counter.
val doc_set = MutSet[processing.StringDocument]()
var counter = 0
// to process the whole corpus, iterate over train_list instead of train_list.take(100)
train_list.take(100).foreach { path =>
    val parsed = new xml_doc(path)
    val content = parsed.tokens.mkString(" ")
    doc_set += new processing.StringDocument(counter, content)
    counter += 1
    if (counter % 50 == 0) println(s"iteration $counter")
}

In [None]:
// View the collected documents as a Stream, the input type passed to the index constructors below.
val doc_stream = doc_set.toStream

In [None]:
// Build a tinyir PosIndex over the 100-document trial stream.
val test_PosIndex = new indexing.PosIndex(doc_stream)

In [None]:
// Inspect the index structure built above.
test_PosIndex.index
// test_pos_index.postings(doc_stream)

In [None]:
// String.hashCode of a sample term, i.e. the value xml_doc.hash_tokens would produce for it.
"iowa".hashCode()

In [None]:
// Query the index using the decimal string of the term's hash code as the term.
test_PosIndex.results("iowa".hashCode().toString)

In [None]:
// Single-term query with an already-stemmed term.
test_PosIndex.results("coupl")

In [None]:
// Show tinyir's current proximity-window size.
indexing.ProxWindow.size

In [None]:
// Two-term query with the current proximity window.
test_PosIndex.results(Seq("iowa", "coupl"))

In [None]:
// Set the proximity window to 2 before the next multi-term query.
indexing.ProxWindow.setSize(2)

In [None]:
// Two-term query run after the window was set to 2.
test_PosIndex.results(Seq("iowa", "withkemp"))

In [None]:
// Build a tinyir SimpleIndex over the same trial stream for comparison.
val test_SimpleIndex = new indexing.SimpleIndex(doc_stream)

In [None]:
// Inspect the index, then the length of the value stored for each key.
test_SimpleIndex.index
test_SimpleIndex.index.mapValues(x => x.length)

In [None]:
// Build a tinyir FreqIndex over the same trial stream.
val test_FreqIndex = new indexing.FreqIndex(doc_stream)

In [None]:
// Inspect the index, then run a single-term query against it.
test_FreqIndex.index
test_FreqIndex.results("iowa")

In [None]:
// NOTE(review): this binds tinyir's InvertedIndex object itself; no index is constructed here.
val test_InvertedIndex = indexing.InvertedIndex

#### creating list of files
#### indexing => creating inverse index
#### lectures(?) => look at scoring algorithms

# Testing ground

In [19]:
// Report JVM memory figures in whole megabytes (integer division by 1024*1024).
val mb = 1024*1024
val runtime = Runtime.getRuntime
// used = total memory currently held by the JVM minus the free part of it
println(s"Used Memory:  " + (runtime.totalMemory - runtime.freeMemory) / mb)
println(s"Free Memory:  " + runtime.freeMemory / mb)
println(s"Total Memory: " + runtime.totalMemory / mb)
// max = upper bound the JVM will attempt to use
println(s"Max Memory:   " + runtime.maxMemory / mb)

Used Memory:  1063
Free Memory:  1260
Total Memory: 2323
Max Memory:   5461


[36mmb[0m: [32mInt[0m = [32m1048576[0m
[36mruntime[0m: [32mRuntime[0m = java.lang.Runtime@1566daf1

In [None]:
// Used Memory:  400
// Free Memory:  215
// Total Memory: 616
// Max Memory:   3641

In [None]:
// Scratch demo of the ID-assignment idiom: an unseen key gets the map's current size as its value.
val token_hm = MutHashMap[String, Int]()
List("word1", "word3").map(x => token_hm.getOrElseUpdate(x, token_hm.size))

In [None]:
// NOTE(review): tiny_path is not defined in any cell visible here; the earlier classpath cell
// uses files_path + "tinyir-1.1.jar" instead. Confirm before running.
classpath.addPath(tiny_path)

In [None]:
// Contract for a posting/result type T that InvertedIndex.sIntersect can merge.
trait Result[T] extends Any {
    // ID carried by this result
    def id : Int
    // Three-way comparison against `that`: 0 means the two results match; in sIntersect a
    // positive value advances the list `that` came from and a negative value advances this one's.
    def matches(that: T) : Int                 
    // true exactly when matches(that) is 0
    def isMatch(that: T) = matches(that)==0
    // The combined result that sIntersect emits when the two results match
    def matched(that: T) : T    
}

/** List-intersection helpers shared by the inverted-index classes. */
object InvertedIndex {
    // Plain intersection via List.intersect; makes no assumption about ordering.
    private def unsortedIntersect [A<% Result[A]](l1: List[A], l2: List[A]) = l1.intersect(l2)

    /** Merge-style intersection of two posting lists that are sorted consistently
      * with `matches`.
      *
      * The heads are compared with `matches`: a positive value drops the head of
      * `l2`, a negative value drops the head of `l1`, and zero emits
      * `head1 matched head2` and drops both. Results keep the input order.
      */
    def sIntersect[A <% Result[A]] (l1: List[A], l2: List[A]) : List[A] = {
        @annotation.tailrec
        def merge(left: List[A], right: List[A], acc: List[A]): List[A] = (left, right) match {
            case (lh :: lt, rh :: rt) =>
                val cmp = lh matches rh
                if (cmp > 0) merge(left, rt, acc)        // advance the right-hand list
                else if (cmp < 0) merge(lt, right, acc)  // advance the left-hand list
                else merge(lt, rt, (lh matched rh) :: acc)
            // one side is exhausted: acc was built back to front
            case _ => acc.reverse
        }
        merge(l1, l2, Nil)
    }
}

/** Base class for inverted indexes whose postings are of type `Res`.
  *
  * Subclasses supply the single-term lookup; the multi-term query intersects
  * the per-term result lists with InvertedIndex.sIntersect.
  */
abstract class InvertedIndex[Res <% Result[Res]]  {
    /** Results for a single term. */
    def results (term: String) : List[Res] 

    /** Results matching all `terms`.
      *
      * @return the intersection of the per-term result lists; Nil when `terms` is empty
      */
    def results (terms: Seq[String]) : List[Res] = {
        val resultLists      = terms.map(term => results(term))
        // shortest list first — presumably to keep the intermediate intersections small
        val shortToLongLists = resultLists.sortWith( _.length < _.length) 
        // reduceLeft throws on an empty sequence; an empty query simply has no results
        shortToLongLists.reduceLeftOption( (l1,l2) => InvertedIndex.sIntersect(l1,l2) ).getOrElse(Nil)
    }
}

// import ch.ethz.dal.tinyir.indexing.InvertedIndex

In [None]:
import scala.math._

In [None]:
// Minimal document for the notebook-local index: an integer ID plus the document's tokens
// as integers; both are exposed as public vals.
class Document(val id: Int, val tokens: List[Int])
//     def id: Int = this.id
//     def tokens: List[Int] = this.tokens

In [None]:
/** A match in document `id` covering token positions `lpos` to `rpos`. */
case class ProxResult(val id: Int, val lpos: Int, val rpos: Int) extends Result[ProxResult] {
    /** Compares with `that`: the difference of the IDs when they differ; otherwise 0 when
      * the combined span of both results is at most ProxWindow.size, else the difference
      * of the left positions.
      */
    def matches(that: ProxResult) : Int = {
        // width of the smallest span covering both results
        val span = max(rpos, that.rpos) - min(lpos, that.lpos)
        if (id != that.id) id - that.id
        else if (span <= ProxWindow.size) 0  // match
        else lpos - that.lpos                // advance in the list with the minimal lpos
    }

    /** The smallest span covering both results, in this result's document. */
    def matched(that: ProxResult) = {
        val left = min(this.lpos, that.lpos)
        val right = max(this.rpos, that.rpos)
        ProxResult(id, left, right)
    }
}

/** Global proximity-window size read by ProxResult.matches; starts at 1. */
object ProxWindow {
    var size = 1

    /** Sets the window size; asserts that `w` is at least 1. */
    def setSize(w: Int): Unit = {
        assert(w>=1)
        size = w
    }
}

/** Positional inverted index over notebook-local Documents.
  *
  * Every token occurrence is recorded as (term, document ID, position). Document
  * tokens are Ints while the index is keyed by String, so each token is stored
  * under its decimal string, the form used by the scratch query
  * `results("iowa".hashCode().toString)`.
  * NOTE(review): confirm this is the intended key format.
  *
  * @param docs documents to index
  */
class PosIndex (docs: Stream[Document]) extends InvertedIndex[ProxResult] {

    /** One occurrence of a term: document ID and token position, ordered by (id, pos). */
    case class PosPosting(val id: Int, val pos: Int) extends Ordered[PosPosting] {
        def this(t: PosTuple) = this(t.id, t.pos) 
        // Ordered is abstract without compare, and `.sorted` below needs this ordering
        def compare(that: PosPosting) =
            Ordering[Tuple2[Int, Int]].compare((this.id, this.pos), (that.id, that.pos))
    }
    type PostList = List[PosPosting]

    /** term -> postings sorted by (document ID, position). */
    val index : Map[String, PostList] = {
        val groupedPostings = postings(docs).groupBy(_.term)
        // strict map instead of mapValues: mapValues yields a lazy view that would
        // rebuild and re-sort the posting list on every lookup
        groupedPostings.map { case (term, tuples) =>
            term -> tuples.map(p => PosPosting(p.id, p.pos)).sorted
        }
    }
  
    case class PosTuple(term: String, id: Int, pos: Int) 

    /** All (term, document ID, position) tuples of the given documents. */
    def postings (s: Stream[Document]): List[PosTuple] =
        s.flatMap( d => d.tokens.zipWithIndex.map{ case (tk,pos) => PosTuple(tk.toString,d.id,pos) } ).toList

    /** Occurrences of `word` as zero-width spans; Nil when the word is not indexed. */
    override def results (word: String) : List[ProxResult] = 
        index.getOrElse(word, Nil).map(p => ProxResult(p.id, p.pos, p.pos))

    override def results (terms: Seq[String]) : List[ProxResult] = results(terms,1)

    /** Spans in which all `terms` occur close together; Nil when `terms` is empty.
      *
      * NOTE: `win` is not used by this method; the window that decides a match is
      * read from ProxWindow.size inside ProxResult.matches.
      */
    def results (terms: Seq[String], win: Int) : List[ProxResult] = {
        val resultLists = terms.map(term => results(term))
        val shortToLongLists = resultLists.sortWith( _.length < _.length)   
        // reduceLeft throws on an empty sequence; an empty query simply has no results
        shortToLongLists.reduceLeftOption( (l1,l2) => InvertedIndex.sIntersect(l1,l2) ).getOrElse(Nil)
    } 
}