## Import modules

In [1]:
// Put the TinyIR jar on the notebook session's classpath so that the
// ch.ethz.dal.tinyir imports in the next cell resolve.
// NOTE(review): the relative path is presumably resolved against the kernel's
// working directory — confirm.
classpath.addPath("tinyir-1.1.jar")



In [2]:
import scala.xml.XML
import ch.ethz.dal.tinyir._
import com.github.aztek.porterstemmer.PorterStemmer

[32mimport [36mscala.xml.XML[0m
[32mimport [36mch.ethz.dal.tinyir._[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m

In [3]:
// import scala.io.Source  // for importing txt files
import java.io._  // for saving txt files
// import scala.collection.mutable.HashMap  //HashMap used for counting elements in linear time

[32mimport [36mjava.io._[0m

In [4]:
// import scala.util.Random
import scala.collection.mutable.{Map => MutMap}
// enables "mutable lists"
import scala.collection.mutable.ListBuffer  

[32mimport [36mscala.collection.mutable.{Map => MutMap}[0m
[32mimport [36mscala.collection.mutable.ListBuffer[0m

## Define classes and functions

In [5]:
/**
 * Turns raw text into a sequence of stemmed tokens.
 *
 * Steps, in order:
 *  1. every run of non-letter characters (`\P{L}+`) is replaced by one space;
 *  2. the result is split by `processing.Tokenizer.tokenize`;
 *  3. the tokens are passed through `processing.StopWords.filterOutSW`;
 *  4. each remaining token is stemmed with `PorterStemmer.stem`;
 *  5. stems that are empty or whitespace-only are dropped.
 *
 * @param text_body the raw text to tokenize
 * @return the stemmed tokens that survive the filters
 */
def token_filter(text_body: String) = {
    // Only letters are kept as token material; digits and punctuation become separators.
    val lettersOnly = text_body.replaceAll("\\P{L}+", " ")
    val rawTokens = processing.Tokenizer.tokenize(lettersOnly)
    val withoutStopWords = processing.StopWords.filterOutSW(rawTokens)
    withoutStopWords
        .map(PorterStemmer.stem(_))
        .filter(stem => stem.trim.nonEmpty)
}

defined [32mfunction [36mtoken_filter[0m

In [6]:
/**
 * Accessor over a single XML document file.
 *
 * The file is parsed at most once per instance, on first access, and the parsed
 * tree is reused by every accessor. (Previously each accessor re-read and
 * re-parsed the file, so `tokens()` parsed it twice.)
 *
 * @param file_path path of the XML file to load
 */
class xml_doc (file_path: String) {
    // Parsed lazily so constructing an xml_doc stays cheap and performs no I/O.
    private lazy val parsed: xml.Elem = XML.loadFile(file_path)

    /** @return the parsed XML tree of the file (loaded on first call, then cached) */
    def get_doc(): xml.Elem = parsed

    /** @return the concatenated text of all `TEXT` elements found under `DOC` */
    def text() = {
        (get_doc() \\ "DOC" \\ "TEXT").text
    }

    /** @return the concatenated text of all `HEAD` elements found under `DOC` */
    def head() = {
        (get_doc() \\ "DOC" \\ "HEAD").text
    }

    /** @return the text of the `DOCNO` element(s) under `DOC`, with surrounding whitespace trimmed */
    def id() = {
        (get_doc() \\ "DOC" \\ "DOCNO").text.trim
    }

    /**
     * Tokenizes the headline(s) followed by the body text via `token_filter`.
     *
     * The individual HEAD/TEXT node texts are joined with a space: plain string
     * concatenation would glue the last word of one element to the first word of
     * the next and produce bogus merged tokens. The extra spaces are harmless
     * because `token_filter` treats any non-letter run as a separator anyway.
     *
     * @return the filtered, stemmed tokens of the document
     */
    def tokens() = {
        val headParts = (get_doc() \\ "DOC" \\ "HEAD").map(_.text)
        val textParts = (get_doc() \\ "DOC" \\ "TEXT").map(_.text)
        token_filter((headParts ++ textParts).mkString(" "))
    }
}

defined [32mclass [36mxml_doc[0m

In [7]:
/**
 * Lists the entries of a directory as path strings.
 *
 * @param path directory to list
 * @return one string per directory entry (as produced by `File.toString`), in the
 *         order returned by `File.listFiles`; an empty array when `path` does not
 *         denote a listable directory
 */
def list_docs (path: String): Array[String] = {
    // File.listFiles returns null (not an empty array) when the path is missing,
    // is not a directory, or cannot be read; guard against it to avoid an NPE.
    Option(new java.io.File(path).listFiles)
        .getOrElse(Array.empty[java.io.File])
        .map(x => x.toString())
}
val numPattern = "[0-9]+".r

defined [32mfunction [36mlist_docs[0m
[36mnumPattern[0m: [32mscala[0m.[32mutil[0m.[32mmatching[0m.[32mRegex[0m = [0-9]+

In [8]:
/**
 * Writes one line per map entry to a text file: the key followed by the three
 * tuple components, all separated by single spaces.
 *
 * Lines appear in the map's iteration order. An existing file is overwritten.
 *
 * @param data     entries to write (key -> triple of values)
 * @param filename path of the output file
 */
def write(data: MutMap[Double, (Double, Double, Double)],filename: String): Unit = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    // try/finally so the file handle is released even if a write fails.
    try {
        data.foreach { case (key, triple) =>
            // A Tuple3 always has three components, so no emptiness check is needed.
            bw.write(s"$key ${triple.productIterator.mkString(" ")}")
            bw.newLine()
        }
    } finally {
        bw.close()
    }
}

/**
 * Writes one line per map entry to a text file: the integer key followed by the
 * members of its set, all separated by single spaces.
 *
 * Entries whose set is empty are skipped. Lines appear in the map's iteration
 * order and members in the set's iteration order. An existing file is overwritten.
 *
 * @param data     entries to write (key -> set of strings)
 * @param filename path of the output file
 */
def write_prediction(data: MutMap[Int, Set[String]],filename: String): Unit = {

    val bw = new BufferedWriter(new FileWriter(new File(filename)))
    // try/finally so the file handle is released even if a write fails.
    try {
        for ((key, members) <- data if members.nonEmpty) {
            bw.write(s"$key ${members.mkString(" ")}")
            bw.newLine()
        }
    } finally {
        bw.close()
    }
}

defined [32mfunction [36mwrite[0m
defined [32mfunction [36mwrite_prediction[0m

# Importing data files

In [9]:
// Paths of all entries in the "documents" directory (relative to the working
// directory); each one is parsed as an XML document in the loading cell below.
val train_list = list_docs("documents")

[36mtrain_list[0m: [32mArray[0m[[32mString[0m] = [33mArray[0m(
  [32m"""
documents\AP880212-0006
  """[0m,
  [32m"""
documents\AP880212-0007
  """[0m,
  [32m"""
documents\AP880212-0009
  """[0m,
  [32m"""
documents\AP880212-0017
  """[0m,
  [32m"""
documents\AP880212-0018
  """[0m,
  [32m"""
documents\AP880212-0022
  """[0m,
  [32m"""
[33m...[0m

In [None]:
// Parse every listed file, tokenize it, and collect the results as tinyir
// StringDocuments whose ids are the running counter (0, 1, 2, ...) and whose
// content is the space-joined token sequence.
val doc_list = new ListBuffer[processing.StringDocument]()
var counter = 0
// For a quick trial run, iterate over train_list.take(1000) instead.
train_list.foreach { path =>
    val parsed = new xml_doc(path)
    val content = parsed.tokens.mkString(" ")
    doc_list += new processing.StringDocument(counter, content)
    counter += 1
    // Progress report every 50 documents.
    if (counter % 50 == 0) {
        println(s"iteration $counter")
    }
}

iteration 50
iteration 100
iteration 150
iteration 200
iteration 250
iteration 300
iteration 350
iteration 400
iteration 450
iteration 500
iteration 550
iteration 600
iteration 650
iteration 700
iteration 750
iteration 800
iteration 850
iteration 900
iteration 950
iteration 1000
iteration 1050
iteration 1100
iteration 1150
iteration 1200
iteration 1250
iteration 1300
iteration 1350
iteration 1400
iteration 1450
iteration 1500
iteration 1550
iteration 1600
iteration 1650
iteration 1700
iteration 1750
iteration 1800
iteration 1850
iteration 1900
iteration 1950
iteration 2000
iteration 2050
iteration 2100
iteration 2150
iteration 2200
iteration 2250
iteration 2300
iteration 2350
iteration 2400
iteration 2450
iteration 2500
iteration 2550
iteration 2600
iteration 2650
iteration 2700
iteration 2750
iteration 2800
iteration 2850
iteration 2900
iteration 2950
iteration 3000
iteration 3050
iteration 3100
iteration 3150
iteration 3200
iteration 3250
iteration 3300
iteration 3350
iteration 3400


In [None]:
// Stream over the collected documents; this is what the tinyir index
// constructors in the following cells are given.
val doc_stream = doc_list.toStream

In [None]:
// Build a tinyir PosIndex over the document stream
// (presumably a positional index — confirm against the tinyir sources).
val test_PosIndex = new indexing.PosIndex(doc_stream)

In [None]:
// Evaluate the PosIndex's `index` member so the notebook shows its contents.
test_PosIndex.index
// test_pos_index.postings(doc_stream)

In [None]:
// Single-term lookup of "iowa" in the PosIndex.
test_PosIndex.results("iowa")

In [None]:
// Single-term lookup of "coupl"; query terms must be given in the same stemmed
// form that token_filter produced when the documents were loaded.
test_PosIndex.results("coupl")

In [None]:
// Show the current ProxWindow size setting
// (presumably the proximity window used by multi-term queries — confirm).
indexing.ProxWindow.size

In [None]:
// Multi-term lookup of "iowa" and "coupl" in the PosIndex.
test_PosIndex.results(Seq("iowa", "coupl"))

In [None]:
// Set the ProxWindow size to 2 before re-running a multi-term query.
// NOTE(review): ProxWindow looks like a shared object, so this presumably
// affects every later query in the session — confirm.
indexing.ProxWindow.setSize(2)

In [None]:
// Multi-term lookup of "iowa" and "withkemp" in the PosIndex.
// NOTE(review): "withkemp" looks like two words fused together during
// tokenization ("with" + "kemp") rather than a real term — verify.
test_PosIndex.results(Seq("iowa", "withkemp"))

In [None]:
// Build a tinyir SimpleIndex over the same document stream.
val test_SimpleIndex = new indexing.SimpleIndex(doc_stream)

In [None]:
// Inspect the SimpleIndex: first the raw `index` member, then the same map with
// each value replaced by its length (presumably the number of postings per
// term — confirm against tinyir).
test_SimpleIndex.index
test_SimpleIndex.index.mapValues(x => x.length)

In [None]:
// Build a tinyir FreqIndex over the same document stream.
val test_FreqIndex = new indexing.FreqIndex(doc_stream)

In [None]:
// Inspect the FreqIndex: its `index` member, then a single-term lookup of "iowa".
test_FreqIndex.index
test_FreqIndex.results("iowa")

In [None]:
// NOTE(review): unlike the indexes above, this binds the name to the
// `indexing.InvertedIndex` reference itself (no `new`, no document stream);
// nothing is built here — confirm this is intended.
val test_InvertedIndex = indexing.InvertedIndex

#### creating list of files
#### indexing => creating inverted index
#### lectures(?) => look at scoring algorithms

# Testing ground

In [None]:
// Report the JVM's memory figures, each divided by `mb` (1024 * 1024 bytes)
// using integer division.
val mb = 1024*1024
val runtime = Runtime.getRuntime
// "Used" is derived as total minus free.
println(s"Used Memory:  ${(runtime.totalMemory - runtime.freeMemory) / mb}")
println(s"Free Memory:  ${runtime.freeMemory / mb}")
println(s"Total Memory: ${runtime.totalMemory / mb}")
println(s"Max Memory:   ${runtime.maxMemory / mb}")