## Import modules

Set your user name as a case, pointing to your path to documents and tinyir.jar

In [1]:
// set your a case once
val (doc_dir: String, tiny_path: String) = System.getProperties().get("user.name").toString match {
    case "Yarden-"  => ("../documents", "../tinyir-1.1.jar")
    case "Max"  => ("../MAXPATH", "../MAXPATH/tinyir-1.1.jar")
}

[36mdoc_dir[0m: [32mString[0m = [32m"../documents"[0m
[36mtiny_path[0m: [32mString[0m = [32m"../tinyir-1.1.jar"[0m

In [2]:
classpath.addPath(tiny_path)



In [3]:
import scala.xml.XML
import ch.ethz.dal.tinyir._
import com.github.aztek.porterstemmer.PorterStemmer

[32mimport [36mscala.xml.XML[0m
[32mimport [36mch.ethz.dal.tinyir._[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m

In [4]:
// import scala.io.Source  // for importing txt files
import java.io._  // for saving txt files
// import scala.collection.mutable.HashMap  //HashMap used for counting elements in linear time

[32mimport [36mjava.io._[0m

In [5]:
// import scala.util.Random
import scala.collection.mutable.{Map => MutMap}
// enables "mutable lists"
import scala.collection.mutable.ListBuffer  

[32mimport [36mscala.collection.mutable.{Map => MutMap}[0m
[32mimport [36mscala.collection.mutable.ListBuffer[0m

## Define classes and functions

In [6]:
def token_filter(text_body: String) = {
    processing.StopWords.filterOutSW(
        processing.Tokenizer.tokenize(text_body.
                                      replaceAll("\\P{L}+", " "))
    ).
    map(x => PorterStemmer.stem(x)).filter(_.trim.nonEmpty)
}

defined [32mfunction [36mtoken_filter[0m

In [7]:
class xml_doc (file_path: String) {
    def get_doc(): xml.Elem = {
        XML.loadFile(file_path: String)
    }    
    
    def text() = {
        (get_doc() \\ "DOC" \\ "TEXT").text
    }
    
    def head() = {
        (get_doc() \\ "DOC" \\ "HEAD").text
    }

    def id() = {
        (get_doc() \\ "DOC" \\ "DOCNO").text.trim
    }
    
    def tokens() = {
        token_filter(head() ++ text())
    }
}

defined [32mclass [36mxml_doc[0m

In [8]:
def list_docs (path: String) = {  // : Array[java.io.File]
        new java.io.File(path).listFiles.map(x => x.toString())
    }
val numPattern = "[0-9]+".r

defined [32mfunction [36mlist_docs[0m
[36mnumPattern[0m: [32mscala[0m.[32mutil[0m.[32mmatching[0m.[32mRegex[0m = [0-9]+

In [9]:
def write(data: MutMap[Double, (Double, Double, Double)],filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        var values = data(elem).productIterator.toList
        if(values.length>0){
            bw.write(elem+" "+values.mkString(" "))
            bw.newLine
        }    
    }   
    bw.close()
}

def write_prediction(data: MutMap[Int, Set[String]],filename: String) = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    val iter = data.keys.iterator
    while(iter.hasNext){
        var elem = iter.next()
        var values = data(elem).toList
        if(values.length>0){
            bw.write(elem+" "+values.mkString(" "))
            bw.newLine
        }    
    }   
    bw.close()
}

defined [32mfunction [36mwrite[0m
defined [32mfunction [36mwrite_prediction[0m

# Importing data files

In [10]:
val train_list = list_docs(doc_dir)

[36mtrain_list[0m: [32mArray[0m[[32mString[0m] = [33mArray[0m(
  [32m"""
..\documents\AP880212-0006
  """[0m,
  [32m"""
..\documents\AP880212-0007
  """[0m,
  [32m"""
..\documents\AP880212-0009
  """[0m,
  [32m"""
..\documents\AP880212-0017
  """[0m,
  [32m"""
..\documents\AP880212-0018
  """[0m,
  [32m"""
..\documents\AP880212-0022
  """[0m,
  [32m"""
[33m...[0m

In [11]:
val doc_list = new ListBuffer[processing.StringDocument]()
var counter = 0
for (path <- train_list.take(1000)){
// for (path <- train_list){
    var cur_doc = new xml_doc(path)
    doc_list += (new processing.StringDocument(counter, cur_doc.tokens.mkString(" ")))
    counter += 1
    if (counter % 50 == 0) println(s"iteration $counter")
}

iteration 50
iteration 100
iteration 150
iteration 200
iteration 250
iteration 300
iteration 350
iteration 400
iteration 450
iteration 500
iteration 550
iteration 600
iteration 650
iteration 700
iteration 750
iteration 800
iteration 850
iteration 900
iteration 950
iteration 1000


[36mdoc_list[0m: [32mListBuffer[0m[[32mprocessing[0m.[32mStringDocument[0m] = [33mListBuffer[0m(
  ch.ethz.dal.tinyir.processing.StringDocument@7ff295fb,
  ch.ethz.dal.tinyir.processing.StringDocument@370fb10f,
  ch.ethz.dal.tinyir.processing.StringDocument@45eb6a0e,
  ch.ethz.dal.tinyir.processing.StringDocument@500fc29d,
  ch.ethz.dal.tinyir.processing.StringDocument@3b39c82a,
  ch.ethz.dal.tinyir.processing.StringDocument@71f4918b,
  ch.ethz.dal.tinyir.processing.StringDocument@6fd12ea1,
  ch.ethz.dal.tinyir.processing.StringDocument@416412ec,
  ch.ethz.dal.tinyir.processing.StringDocument@3469ea72,
  ch.ethz.dal.tinyir.processing.StringDocument@61444dbd,
  ch.ethz.dal.tinyir.processing.StringDocument@706d312,
  ch.ethz.dal.tinyir.processing.StringDocument@6614526c,
  ch.ethz.dal.tinyir.processing.StringDocument@1dd50ea6,
  ch.ethz.dal.tinyir.processing.StringDocument@5e3f8104,
  ch.ethz.dal.tinyir.processing.StringDocument@6f904792,
  ch.ethz.dal.tinyir.processing.StringD

In [12]:
val doc_stream = doc_list.toStream

[36mdoc_stream[0m: [32mStream[0m[[32mprocessing[0m.[32mStringDocument[0m] = [33mStream[0m(
  ch.ethz.dal.tinyir.processing.StringDocument@7ff295fb,
  ch.ethz.dal.tinyir.processing.StringDocument@370fb10f,
  ch.ethz.dal.tinyir.processing.StringDocument@45eb6a0e,
  ch.ethz.dal.tinyir.processing.StringDocument@500fc29d,
  ch.ethz.dal.tinyir.processing.StringDocument@3b39c82a,
  ch.ethz.dal.tinyir.processing.StringDocument@71f4918b,
  ch.ethz.dal.tinyir.processing.StringDocument@6fd12ea1,
  ch.ethz.dal.tinyir.processing.StringDocument@416412ec,
  ch.ethz.dal.tinyir.processing.StringDocument@3469ea72,
  ch.ethz.dal.tinyir.processing.StringDocument@61444dbd,
  ch.ethz.dal.tinyir.processing.StringDocument@706d312,
  ch.ethz.dal.tinyir.processing.StringDocument@6614526c,
  ch.ethz.dal.tinyir.processing.StringDocument@1dd50ea6,
  ch.ethz.dal.tinyir.processing.StringDocument@5e3f8104,
  ch.ethz.dal.tinyir.processing.StringDocument@6f904792,
  ch.ethz.dal.tinyir.processing.StringDocumen

In [13]:
val test_PosIndex = new indexing.PosIndex(doc_stream)

[36mtest_PosIndex[0m: [32mindexing[0m.[32mPosIndex[0m = ch.ethz.dal.tinyir.indexing.PosIndex@7055e6e3

In [14]:
test_PosIndex.index
// test_pos_index.postings(doc_stream)

[36mres13[0m: [32mMap[0m[[32mString[0m, [32mtest_PosIndex[0m.[32mPostList[0m] = [33mMap[0m(
  [32m"inshort"[0m -> [33mList[0m([33mPosPosting[0m([32m744[0m, [32m302[0m)),
  [32m"wednesdayaft"[0m -> [33mList[0m([33mPosPosting[0m([32m758[0m, [32m15[0m)),
  [32m"andlithuania"[0m -> [33mList[0m([33mPosPosting[0m([32m41[0m, [32m61[0m)),
  [32m"forlett"[0m -> [33mList[0m([33mPosPosting[0m([32m861[0m, [32m211[0m)),
  [32m"incident"[0m -> [33mList[0m([33mPosPosting[0m([32m4[0m, [32m243[0m), [33mPosPosting[0m([32m330[0m, [32m194[0m)),
  [32m"meteorologist"[0m -> [33mList[0m([33mPosPosting[0m([32m296[0m, [32m307[0m), [33mPosPosting[0m([32m303[0m, [32m299[0m)),
  [32m"kidnei"[0m -> [33mList[0m(
    [33mPosPosting[0m([32m197[0m, [32m371[0m),
    [33mPosPosting[0m([32m223[0m, [32m371[0m),
    [33mPosPosting[0m([32m478[0m, [32m50[0m),
    [33mPosPosting[0m([32m684[0m, [32m31[0m)
  ),
  [32

In [15]:
test_PosIndex.results("iowa")

[36mres14[0m: [32mList[0m[[32mindexing[0m.[32mProxResult[0m] = [33mList[0m(
  [33mProxResult[0m([32m1[0m, [32m16[0m, [32m16[0m),
  [33mProxResult[0m([32m1[0m, [32m29[0m, [32m29[0m),
  [33mProxResult[0m([32m1[0m, [32m60[0m, [32m60[0m),
  [33mProxResult[0m([32m1[0m, [32m164[0m, [32m164[0m),
  [33mProxResult[0m([32m1[0m, [32m212[0m, [32m212[0m),
  [33mProxResult[0m([32m13[0m, [32m24[0m, [32m24[0m),
  [33mProxResult[0m([32m13[0m, [32m41[0m, [32m41[0m),
  [33mProxResult[0m([32m40[0m, [32m106[0m, [32m106[0m),
  [33mProxResult[0m([32m46[0m, [32m25[0m, [32m25[0m),
  [33mProxResult[0m([32m46[0m, [32m106[0m, [32m106[0m),
  [33mProxResult[0m([32m46[0m, [32m197[0m, [32m197[0m),
  [33mProxResult[0m([32m46[0m, [32m359[0m, [32m359[0m),
  [33mProxResult[0m([32m56[0m, [32m509[0m, [32m509[0m),
  [33mProxResult[0m([32m56[0m, [32m558[0m, [32m558[0m),
  [33mProxResult[0m([32m59[0m, 

In [16]:
test_PosIndex.results("coupl")

[36mres15[0m: [32mList[0m[[32mindexing[0m.[32mProxResult[0m] = [33mList[0m(
  [33mProxResult[0m([32m1[0m, [32m17[0m, [32m17[0m),
  [33mProxResult[0m([32m15[0m, [32m85[0m, [32m85[0m),
  [33mProxResult[0m([32m35[0m, [32m39[0m, [32m39[0m),
  [33mProxResult[0m([32m44[0m, [32m28[0m, [32m28[0m),
  [33mProxResult[0m([32m49[0m, [32m6[0m, [32m6[0m),
  [33mProxResult[0m([32m56[0m, [32m510[0m, [32m510[0m),
  [33mProxResult[0m([32m130[0m, [32m172[0m, [32m172[0m),
  [33mProxResult[0m([32m134[0m, [32m178[0m, [32m178[0m),
  [33mProxResult[0m([32m151[0m, [32m129[0m, [32m129[0m),
  [33mProxResult[0m([32m167[0m, [32m149[0m, [32m149[0m),
  [33mProxResult[0m([32m168[0m, [32m53[0m, [32m53[0m),
  [33mProxResult[0m([32m175[0m, [32m16[0m, [32m16[0m),
  [33mProxResult[0m([32m175[0m, [32m144[0m, [32m144[0m),
  [33mProxResult[0m([32m175[0m, [32m278[0m, [32m278[0m),
  [33mProxResult[0m([32m

In [17]:
indexing.ProxWindow.size

[36mres16[0m: [32mInt[0m = [32m1[0m

In [18]:
test_PosIndex.results(Seq("iowa", "coupl"))

[36mres17[0m: [32mList[0m[[32mindexing[0m.[32mProxResult[0m] = [33mList[0m([33mProxResult[0m([32m1[0m, [32m16[0m, [32m17[0m), [33mProxResult[0m([32m56[0m, [32m509[0m, [32m510[0m))

In [19]:
indexing.ProxWindow.setSize(2)



In [20]:
test_PosIndex.results(Seq("iowa", "withkemp"))

[36mres19[0m: [32mList[0m[[32mindexing[0m.[32mProxResult[0m] = [33mList[0m([33mProxResult[0m([32m1[0m, [32m16[0m, [32m18[0m))

In [21]:
val test_SimpleIndex = new indexing.SimpleIndex(doc_stream)

[36mtest_SimpleIndex[0m: [32mindexing[0m.[32mSimpleIndex[0m = ch.ethz.dal.tinyir.indexing.SimpleIndex@424a6dea

In [22]:
test_SimpleIndex.index
test_SimpleIndex.index.mapValues(x => x.length)

[36mres21_0[0m: [32mMap[0m[[32mString[0m, [32mtest_SimpleIndex[0m.[32mPostList[0m] = [33mMap[0m(
  [32m"inshort"[0m -> [33mList[0m([33mSimplePosting[0m([32m744[0m)),
  [32m"wednesdayaft"[0m -> [33mList[0m([33mSimplePosting[0m([32m758[0m)),
  [32m"andlithuania"[0m -> [33mList[0m([33mSimplePosting[0m([32m41[0m)),
  [32m"forlett"[0m -> [33mList[0m([33mSimplePosting[0m([32m861[0m)),
  [32m"incident"[0m -> [33mList[0m([33mSimplePosting[0m([32m4[0m), [33mSimplePosting[0m([32m330[0m)),
  [32m"meteorologist"[0m -> [33mList[0m([33mSimplePosting[0m([32m296[0m), [33mSimplePosting[0m([32m303[0m)),
  [32m"kidnei"[0m -> [33mList[0m(
    [33mSimplePosting[0m([32m197[0m),
    [33mSimplePosting[0m([32m223[0m),
    [33mSimplePosting[0m([32m478[0m),
    [33mSimplePosting[0m([32m684[0m)
  ),
  [32m"serious"[0m -> [33mList[0m([33mSimplePosting[0m([32m710[0m), [33mSimplePosting[0m([32m980[0m)),
  [32m"brink"

In [23]:
val test_FreqIndex = new indexing.FreqIndex(doc_stream)

[36mtest_FreqIndex[0m: [32mindexing[0m.[32mFreqIndex[0m = ch.ethz.dal.tinyir.indexing.FreqIndex@176f786a

In [24]:
test_FreqIndex.index
test_FreqIndex.results("iowa")

[36mres23_0[0m: [32mMap[0m[[32mString[0m, [32mtest_FreqIndex[0m.[32mPostList[0m] = [33mMap[0m(
  [32m"inshort"[0m -> [33mList[0m([33mFreqPosting[0m([32m744[0m, [32m1[0m)),
  [32m"wednesdayaft"[0m -> [33mList[0m([33mFreqPosting[0m([32m758[0m, [32m1[0m)),
  [32m"andlithuania"[0m -> [33mList[0m([33mFreqPosting[0m([32m41[0m, [32m1[0m)),
  [32m"forlett"[0m -> [33mList[0m([33mFreqPosting[0m([32m861[0m, [32m1[0m)),
  [32m"incident"[0m -> [33mList[0m([33mFreqPosting[0m([32m4[0m, [32m1[0m), [33mFreqPosting[0m([32m330[0m, [32m1[0m)),
  [32m"meteorologist"[0m -> [33mList[0m([33mFreqPosting[0m([32m296[0m, [32m1[0m), [33mFreqPosting[0m([32m303[0m, [32m1[0m)),
  [32m"kidnei"[0m -> [33mList[0m(
    [33mFreqPosting[0m([32m197[0m, [32m1[0m),
    [33mFreqPosting[0m([32m223[0m, [32m1[0m),
    [33mFreqPosting[0m([32m478[0m, [32m1[0m),
    [33mFreqPosting[0m([32m684[0m, [32m1[0m)
  ),
  [32m"ser

In [25]:
val test_InvertedIndex = indexing.InvertedIndex

[36mtest_InvertedIndex[0m: [32mindexing[0m.[32mInvertedIndex[0m.type = ch.ethz.dal.tinyir.indexing.InvertedIndex$@2d78cb8d

#### creating list of files
#### indexing => creating inverse index
#### lectures(?) => look at scoring algorithms

# Testing ground

In [26]:
val mb = 1024*1024
val runtime = Runtime.getRuntime
println(s"Used Memory:  " + (runtime.totalMemory - runtime.freeMemory) / mb)
println(s"Free Memory:  " + runtime.freeMemory / mb)
println(s"Total Memory: " + runtime.totalMemory / mb)
println(s"Max Memory:   " + runtime.maxMemory / mb)

Used Memory:  466
Free Memory:  373
Total Memory: 839
Max Memory:   3641


[36mmb[0m: [32mInt[0m = [32m1048576[0m
[36mruntime[0m: [32mRuntime[0m = java.lang.Runtime@1e0eed7b