## Import modules

In [1]:
// Put the TinyIR jar on the notebook kernel's classpath so the imports below resolve.
classpath.addPath("tinyir-1.1.jar")



In [2]:
import scala.xml.XML
import ch.ethz.dal.tinyir._
import com.github.aztek.porterstemmer.PorterStemmer

[32mimport [36mscala.xml.XML[0m
[32mimport [36mch.ethz.dal.tinyir._[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m

In [3]:
// import scala.io.Source  // for importing txt files
import java.io._  // for saving txt files
// import scala.collection.mutable.HashMap  //HashMap used for counting elements in linear time

[32mimport [36mjava.io._[0m

In [4]:
// import scala.util.Random
import scala.collection.mutable.{Map => MutMap}
// enables "mutable lists"
import scala.collection.mutable.ListBuffer  

[32mimport [36mscala.collection.mutable.{Map => MutMap}[0m
[32mimport [36mscala.collection.mutable.ListBuffer[0m

## Define classes and functions

In [5]:
/** Turns raw document text into a list of stemmed tokens.
  *
  * Steps, in order: every run of non-letter characters (`\P{L}+`) is replaced
  * by a single space, the result is tokenized by `processing.Tokenizer`, passed
  * through `processing.StopWords.filterOutSW`, each surviving token is stemmed
  * with `PorterStemmer`, and stems that are blank after trimming are dropped.
  *
  * @param text_body raw text to tokenize
  * @return the remaining stems as a `List`
  */
def token_filter(text_body: String) = {
  val lettersOnly = text_body.replaceAll("\\P{L}+", " ")
  val rawTokens = processing.Tokenizer.tokenize(lettersOnly)
  val withoutStopWords = processing.StopWords.filterOutSW(rawTokens)
  withoutStopWords
    .map(token => PorterStemmer.stem(token))
    .filter(stem => !stem.trim.isEmpty)
    .toList
}

defined [32mfunction [36mtoken_filter[0m

In [15]:
// Create YMParse for customized tokenization

/** `processing.TipsterParse` variant that tokenizes with `token_filter` and
  * additionally exposes hash-code views of the document name and its tokens.
  *
  * @param is input stream handed unchanged to `processing.TipsterParse`
  */
class YMParse(is: InputStream) extends processing.TipsterParse(is) {

  /** Document content run through `token_filter`; recomputed on every call. */
  override def tokens: List[String] = token_filter(content)

  /** `hashCode` of the document name (distinct names may share a hash code). */
  def hname: Int = name.hashCode

  /** `hashCode` of every filtered token, in token order; recomputed on every call. */
  def htokens: List[Int] = token_filter(content).map(_.hashCode)

  // TODO: override def ID: String = name - need to fix the ID.
}

defined [32mclass [36mYMParse[0m

In [16]:
import ch.ethz.dal.tinyir.processing.TipsterParse
import ch.ethz.dal.tinyir.processing.Tokenizer
import ch.ethz.dal.tinyir.processing.XMLDocument

// Create YMStream for customized stream using YMParse
/** Parsed document stream that yields [[YMParse]] documents instead of the
  * library's default parser.
  *
  * @param path directory handed to `io.ZipDirStream`
  * @param ext  second argument forwarded to `io.ZipDirStream` (presumably a
  *             file-extension filter -- confirm against the TinyIR source).
  *             Previously this parameter was accepted but silently ignored;
  *             the default `""` reproduces the old behaviour exactly.
  */
class YMStream (path: String, ext: String = "") extends io.ParsedXMLStream(new io.ZipDirStream(path, ext)){
  /** One `YMParse` per raw input stream, created lazily as the stream is walked. */
  def stream : Stream[YMParse] = unparsed.stream.map(is => new YMParse(is))

  /** Number of documents as reported by the underlying unparsed stream. */
  def length = unparsed.length
}



[32mimport [36mch.ethz.dal.tinyir.processing.TipsterParse[0m
[32mimport [36mch.ethz.dal.tinyir.processing.Tokenizer[0m
[32mimport [36mch.ethz.dal.tinyir.processing.XMLDocument[0m
defined [32mclass [36mYMStream[0m

In [17]:
// Documents under "resources" parsed as YMParse, i.e. tokenized with token_filter.
val collection = new YMStream("resources").stream
// The same "resources" path read through the library's own TipsterStream.
val collection2 = new io.TipsterStream("resources").stream

[36mcollection[0m: [32mStream[0m[[32mYMParse[0m] = [33mStream[0m(
  cmd14$$user$YMParse@19d6dbea,
  cmd14$$user$YMParse@4199c280,
  cmd14$$user$YMParse@6fa1e781,
  cmd14$$user$YMParse@41a8f641,
  cmd14$$user$YMParse@58a9dbf7,
  cmd14$$user$YMParse@1b9fd9f9,
  cmd14$$user$YMParse@47999f00,
  cmd14$$user$YMParse@57e8d3f1,
  cmd14$$user$YMParse@ec73470,
  cmd14$$user$YMParse@1fb5a518,
  cmd14$$user$YMParse@7dd6c3cf,
  cmd14$$user$YMParse@2b4bacab,
  cmd14$$user$YMParse@1cbfa1fe,
  cmd14$$user$YMParse@15cc211d,
  cmd14$$user$YMParse@d0bcd5f,
  cmd14$$user$YMParse@17a12e9d,
  cmd14$$user$YMParse@62e786ef,
  cmd14$$user$YMParse@76ea2dd3,
  cmd14$$user$YMParse@16b179ea,
[33m...[0m
[36mcollection2[0m: [32mStream[0m[[32mXMLDocument[0m] = [33mStream[0m(
  ch.ethz.dal.tinyir.processing.TipsterParse@6d324779,
  ch.ethz.dal.tinyir.processing.TipsterParse@3ebfb480,
  ch.ethz.dal.tinyir.processing.TipsterParse@5d504670,
  ch.ethz.dal.tinyir.processing.TipsterParse@61ee3bfc,
  ch.eth

In [27]:
// Number of hashed tokens in the document at index 84493.
// NOTE: `collection` is a val holding the head of a memoizing Stream, so indexing
// this deep forces every earlier element and keeps them all reachable.
collection(84493).htokens.size

[36mres26[0m: [32mInt[0m = [32m287[0m

In [26]:
// Number of string tokens in the document at index 84493; htokens maps tokens
// one-to-one, so this should equal htokens.size for that document.
collection(84493).tokens.size

[36mres25[0m: [32mInt[0m = [32m287[0m

In [33]:
import ch.ethz.dal.tinyir.processing._
/** Forward stream: pairs each document's hashed name with its hashed tokens.
  *
  * @param docs documents to convert
  * @return one `(hname, htokens)` pair per document, produced lazily
  */
def fwStream(docs: Stream[YMParse]): Stream[(Int, List[Int])] =
  for (doc <- docs) yield (doc.hname, doc.htokens)

[32mimport [36mch.ethz.dal.tinyir.processing._[0m
defined [32mfunction [36mfwStream[0m

In [34]:
// Preview the (hname, htokens) pairs produced for the custom-tokenized collection.
fwStream(collection)

[36mres33[0m: [32mStream[0m[([32mInt[0m, [32mList[0m[[32mInt[0m])] = [33mStream[0m(
  [33m[0m(
    [32m-2143258984[0m,
    [33mList[0m(
      [32m94851467[0m,
      [32m3512060[0m,
      [32m-1266285233[0m,
      [32m1490282602[0m,
      [32m103185[0m,
      [32m109548807[0m,
      [32m3045820[0m,
      [32m-1234877242[0m,
      [32m-342511989[0m,
      [32m2998801[0m,
      [32m92905987[0m,
      [32m2998801[0m,
      [32m108511772[0m,
      [32m-1655966997[0m,
      [32m351160793[0m,
      [32m98619021[0m,
[33m...[0m

In [35]:
/** Forward index: hashed document name -> that document's hashed tokens.
  *
  * Materializes `fwStream(docs)` into a `Map`. If two documents share the same
  * `hname`, the later one in the stream replaces the earlier entry.
  *
  * @param docs documents to index; the whole stream is forced
  */
def fwIndex(docs: Stream[YMParse]): Map[Int, List[Int]] = {
  fwStream(docs).toMap
}

defined [32mfunction [36mfwIndex[0m

In [None]:
// Build the forward index for the first 50000 documents only.
fwIndex(collection.take(50000))

In [None]:
/** Flattens documents into `(hashed token, hashed document name)` postings.
  *
  * One pair is emitted per token occurrence, so a token that appears several
  * times in a document yields several identical pairs.
  *
  * @param s documents to expand
  * @return lazily produced postings, document by document
  */
def postings(s: Stream[YMParse]): Stream[(Int, Int)] =
  for {
    d <- s
    token <- d.htokens
  } yield (token, d.hname)

In [None]:
// Inverted index: hashed token -> sorted, de-duplicated hashed document names.
// A strict `map` replaces `mapValues`: in Scala 2.x `mapValues` returns a lazy
// view that would re-run `distinct.sorted` on every lookup of a key.
postings(collection)
  .groupBy { case (token, _) => token }
  .map { case (token, hits) => token -> hits.map { case (_, doc) => doc }.distinct.sorted }

In [None]:
// Frequency Index

/** One (term, document) pair together with the term's count in that document. */
case class TfTuple(term: String, doc: String, count: Int)

/** Emits one [[TfTuple]] per distinct token of every document.
  *
  * @param docs documents to count terms in
  * @return lazily produced tuples; `count` is the number of occurrences of
  *         `term` in the document named `doc`
  */
def tfTuples (docs: Stream[Document]) : Stream[TfTuple] =
  docs.flatMap( d =>
    d.tokens.groupBy(identity)
      .map { case (tk, lst) => TfTuple(tk, d.name, lst.length) }
  ) // this closing parenthesis was missing, so the cell could not compile

// Frequency index: term -> (document name, count) pairs sorted by name, then count.
// Fixes relative to the previous version of this cell:
//  - `stream` was never defined in this notebook; `collection` (the custom-tokenized
//    document stream) is used instead -- swap in another stream if that was intended;
//  - grouping a Stream yields Stream values, so `.toList` is required to match the
//    declared `List` value type;
//  - a strict `map` replaces the lazy `mapValues` view, so the sort runs only once.
val fqIndex : Map[String,List[(String,Int)]] =
  tfTuples(collection).groupBy(_.term)
    .map { case (term, tuples) => term -> tuples.map(tfT => (tfT.doc, tfT.count)).toList.sorted }

In [32]:
"b".getBytes

[36mres31[0m: [32mArray[0m[[32mByte[0m] = [33mArray[0m([32m98[0m)

In [None]:
// `::` prepends an element to a List and is right-associative, so the chain must
// end in a List (`Nil`). Ending it in the String "c" does not compile because
// String has no `::` method.
var test = "a" :: "b" :: "c" :: Nil

In [None]:
// indexing.PosIndex built over the first 100 documents of the custom-tokenized collection.
val pos_i = new indexing.PosIndex(collection.take(100))

In [None]:
// indexing.SimpleIndex built over the first 50000 documents of the library-parsed collection2.
val simple_i = new indexing.SimpleIndex(collection2.take(50000))

In [None]:
// Inspect the ID of the document at index 10 of the library-parsed stream.
collection2(10).ID

In [None]:
// Display the `index` member of the SimpleIndex built from collection2.
simple_i.index

In [None]:
// Print the JVM heap figures in whole mebibytes (integer division truncates).
val mb = 1024 * 1024
val runtime = Runtime.getRuntime
// Each figure is read from the runtime at the moment its line is printed, so the
// lines are not a single consistent snapshot.
println(s"Used Memory:  ${(runtime.totalMemory - runtime.freeMemory) / mb}")
println(s"Free Memory:  ${runtime.freeMemory / mb}")
println(s"Total Memory: ${runtime.totalMemory / mb}")
println(s"Max Memory:   ${runtime.maxMemory / mb}")