## Import modules

In [1]:
classpath.addPath("tinyir-1.1.jar")



In [2]:
import scala.xml.XML
import ch.ethz.dal.tinyir._
import com.github.aztek.porterstemmer.PorterStemmer

[32mimport [36mscala.xml.XML[0m
[32mimport [36mch.ethz.dal.tinyir._[0m
[32mimport [36mcom.github.aztek.porterstemmer.PorterStemmer[0m

In [3]:
// import scala.io.Source  // for importing txt files
import java.io._  // for saving txt files
// import scala.collection.mutable.HashMap  //HashMap used for counting elements in linear time

[32mimport [36mjava.io._[0m

In [4]:
// import scala.util.Random
import scala.collection.mutable.{Map => MutMap}
// enables "mutable lists"
import scala.collection.mutable.ListBuffer  

[32mimport [36mscala.collection.mutable.{Map => MutMap}[0m
[32mimport [36mscala.collection.mutable.ListBuffer[0m

## Define classes and functions

In [5]:
/** Turns raw document text into a list of stemmed tokens.
  *
  * Steps, in order: every run of non-letter characters is replaced by a single
  * space, the text is split by tinyir's `Tokenizer.tokenize`, the tokens are
  * filtered through tinyir's `StopWords.filterOutSW`, each survivor is passed to
  * `PorterStemmer.stem`, and stems that are blank after trimming are dropped.
  *
  * @param text_body raw text to tokenize
  * @return the remaining stems as a `List`
  */
def token_filter(text_body: String) = {
    // "\\P{L}+" matches one or more characters that are NOT Unicode letters.
    val lettersOnly = text_body.replaceAll("\\P{L}+", " ")
    val rawTokens = processing.Tokenizer.tokenize(lettersOnly)
    val withoutStopWords = processing.StopWords.filterOutSW(rawTokens)
    val stems = withoutStopWords.map(word => PorterStemmer.stem(word))
    stems.filter(stem => stem.trim.nonEmpty).toList
}

defined [32mfunction [36mtoken_filter[0m

In [6]:
// Create YMParse for customized tokenization

/** `TipsterParse` variant that replaces the default tokenization with `token_filter`
  * and additionally exposes hash-code views of the document name and tokens.
  *
  * @param is input stream passed straight through to `processing.TipsterParse`
  */
class YMParse(is: InputStream) extends processing.TipsterParse(is) {
    // TODO: the ID still needs fixing; the idea was `override def ID: String = name`.

    /** Tokens of `content` as produced by `token_filter`. */
    override def tokens: List[String] = token_filter(content)

    /** The document `name` reduced to its `hashCode`. */
    def hname: Int = name.hashCode

    /** The `token_filter` output for `content`, each token reduced to its `hashCode`. */
    def htokens: List[Int] = token_filter(content).map(_.hashCode)
}

defined [32mclass [36mYMParse[0m

In [7]:
import ch.ethz.dal.tinyir.processing.TipsterParse
import ch.ethz.dal.tinyir.processing.Tokenizer
import ch.ethz.dal.tinyir.processing.XMLDocument

// Create YMStream for customized stream using YMParse
/** Parsed document stream that yields [[YMParse]] documents (custom tokenization)
  * from the raw input streams of an `io.ZipDirStream`.
  *
  * @param path directory handed to `io.ZipDirStream`
  * @param ext  second argument of `io.ZipDirStream` (presumably a file-extension
  *             filter -- confirm against tinyir). It used to be ignored because
  *             `""` was hard-coded; it is now forwarded. The default `""` keeps
  *             the behaviour of every existing `new YMStream(path)` call.
  */
class YMStream (path: String, ext: String = "") extends io.ParsedXMLStream(new io.ZipDirStream(path, ext)){
  /** One `YMParse` per raw input stream of the underlying unparsed stream. */
  def stream : Stream[YMParse] = unparsed.stream.map(is => new YMParse(is))
  /** Length as reported by the underlying unparsed stream. */
  def length = unparsed.length 
}



[32mimport [36mch.ethz.dal.tinyir.processing.TipsterParse[0m
[32mimport [36mch.ethz.dal.tinyir.processing.Tokenizer[0m
[32mimport [36mch.ethz.dal.tinyir.processing.XMLDocument[0m
defined [32mclass [36mYMStream[0m

In [9]:
// Two streams over the same "resources" directory:
//  - `collection`  parses documents with YMParse (custom token_filter tokenization),
//  - `collection2` uses tinyir's stock TipsterStream, kept for comparison.
// Both are Scala Streams, so documents are only parsed as the stream is traversed.
val collection = new YMStream("resources").stream
val collection2 = new io.TipsterStream("resources").stream

[36mcollection[0m: [32mStream[0m[[32mYMParse[0m] = [33mStream[0m(
  cmd5$$user$YMParse@34530c36,
  cmd5$$user$YMParse@2398123c,
  cmd5$$user$YMParse@64cae4d8,
  cmd5$$user$YMParse@3e78ec13,
  cmd5$$user$YMParse@3de035b6,
  cmd5$$user$YMParse@1955077a,
  cmd5$$user$YMParse@45e2eeba,
  cmd5$$user$YMParse@5c2e5c04,
  cmd5$$user$YMParse@29e1cde3,
  cmd5$$user$YMParse@6f0443a1,
  cmd5$$user$YMParse@4b0d621a,
  cmd5$$user$YMParse@4eb6856a,
  cmd5$$user$YMParse@7be0e7dd,
  cmd5$$user$YMParse@1f8c82aa,
  cmd5$$user$YMParse@f7a8b7a,
  cmd5$$user$YMParse@5b20df9d,
  cmd5$$user$YMParse@debc803,
  cmd5$$user$YMParse@7793579d,
  cmd5$$user$YMParse@3f4c0e28,
[33m...[0m
[36mcollection2[0m: [32mStream[0m[[32mXMLDocument[0m] = [33mStream[0m(
  ch.ethz.dal.tinyir.processing.TipsterParse@4ccdac19,
  ch.ethz.dal.tinyir.processing.TipsterParse@6bdbbabe,
  ch.ethz.dal.tinyir.processing.TipsterParse@3bc0b731,
  ch.ethz.dal.tinyir.processing.TipsterParse@17ad9b0e,
  ch.ethz.dal.tinyir.proces

In [None]:
import ch.ethz.dal.tinyir.processing._
/** Forward-index stream: pairs each document's hashed name (`hname`) with its
  * list of hashed tokens (`htokens`).
  *
  * @param docs documents to convert
  * @return one `(hname, htokens)` pair per document
  */
def fwStream (docs: Stream[YMParse]) : Stream[(Int,List[Int])] =
   for (doc <- docs) yield (doc.hname, doc.htokens)

In [None]:
/** Forward index as a map from hashed document name to hashed token list.
  *
  * Built by collecting `fwStream(docs)` into a `Map`, which traverses the whole
  * stream. Documents whose names share a hash code end up under a single key.
  *
  * @param docs documents to index
  * @return map keyed by `hname` with the matching `htokens` as value
  */
def fwIndex (docs: Stream[YMParse]) : Map[Int,List[Int]] = {
   val hashedPairs = fwStream(docs)
   hashedPairs.toMap
}

In [8]:
// hashed version
/** Flattens documents into `(hashed token, hashed document name)` postings,
  * one pair per token occurrence.
  *
  * @param s documents to expand
  * @return stream of `(token hash, document-name hash)` pairs
  */
def postings (s: Stream[YMParse]): Stream[(Int,Int)] =
  for {
    doc   <- s
    token <- doc.htokens
  } yield (token, doc.hname)
// Inverted index under construction: hashed token -> list of hashed document names.
// Starts empty and is filled in place by the chunked indexing loop further down.
var invIndex = scala.collection.mutable.Map[Int, List[Int]]()

defined [32mfunction [36mpostings[0m
[36minvIndex[0m: [32mcollection[0m.[32mmutable[0m.[32mMap[0m[[32mInt[0m, [32mList[0m[[32mInt[0m]] = [33mMap[0m()

In [None]:
// unhashed version
/** Flattens documents into `(token, document name)` postings, one pair per
  * token occurrence, keeping the original strings instead of their hash codes.
  *
  * @param s documents to expand
  * @return stream of `(token, document name)` pairs
  */
def postings (s: Stream[YMParse]): Stream[(String,String)] =
  for {
    doc   <- s
    token <- doc.tokens
  } yield (token, doc.name)
// Inverted index under construction: token -> list of document names.
// Starts empty and is filled in place by the chunked indexing loop further down.
var invIndex = scala.collection.mutable.Map[String, List[String]]()

In [None]:
// Postings of the first 100 documents, grouped by token; every token maps to
// the distinct, sorted document identifiers it occurs in.
val posting1 = postings(collection.take(100))
  .groupBy { case (token, _) => token }
  .mapValues(pairs => pairs.map(_._2).distinct.sorted)

In [None]:
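// Plan: split the original collection into twenty chunks (or some other number),
// build the postings map for each chunk separately,
// then merge each chunk into the inverted-index dictionary: for every token, look up
// its current list with getOrElse (defaulting to an empty list) and append the chunk's documents.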

In [None]:
// Build the inverted index incrementally: take the collection in fixed-size
// chunks, group each chunk's postings by token, and append the chunk's distinct,
// sorted document identifiers to whatever `invIndex` already holds for that token.
//
// NOTE(review): `collection` is a memoized Stream bound to a val, so every
// document parsed so far stays reachable while this loop runs -- confirm the
// heap is large enough for `totalDocs` documents.
val chunkSize = 5000     // documents indexed per iteration
val totalDocs = 100000   // upper bound (exclusive slice end) of documents to index
for (upper <- chunkSize to totalDocs by chunkSize)
{
    // Immutable per-chunk postings map: token -> distinct, sorted document identifiers.
    val chunkPostings = postings(collection.slice(upper - chunkSize, upper))
        .groupBy(_._1)
        .mapValues(_.map(_._2).distinct.sorted)
    // Merge the chunk into the global index; unseen tokens start from an empty list.
    chunkPostings.foreach { case (key, docIds) =>
        invIndex(key) = invIndex.getOrElse(key, List()) ++ docIds
    }
    // Progress marker, one value per line (plain `print` ran the numbers together).
    println(upper)
    //invIndex = (invIndex.toSeq ++ imd_posting.toSeq)
    //.groupBy{case(token,id) => token}
    //.mapValues(item => item.flatMap{case(token,id) => id}.toList)
}

500010000

In [None]:
// Build the inverted index incrementally: take the collection in fixed-size
// chunks, group each chunk's postings by token, and append the chunk's distinct,
// sorted document identifiers to whatever `invIndex` already holds for that token.
//
// CAUTION: this cell repeats the chunked indexing loop of the preceding cell.
// The merge appends to existing lists, so running both cells (or this one twice)
// against the same `invIndex` stores every document identifier more than once.
// Re-create `invIndex` before rerunning.
//
// NOTE(review): `collection` is a memoized Stream bound to a val, so every
// document parsed so far stays reachable while this loop runs -- confirm the
// heap is large enough for `totalDocs` documents.
val chunkSize = 5000     // documents indexed per iteration
val totalDocs = 100000   // upper bound (exclusive slice end) of documents to index
for (upper <- chunkSize to totalDocs by chunkSize)
{
    // Immutable per-chunk postings map: token -> distinct, sorted document identifiers.
    val chunkPostings = postings(collection.slice(upper - chunkSize, upper))
        .groupBy(_._1)
        .mapValues(_.map(_._2).distinct.sorted)
    // Merge the chunk into the global index; unseen tokens start from an empty list.
    chunkPostings.foreach { case (key, docIds) =>
        invIndex(key) = invIndex.getOrElse(key, List()) ++ docIds
    }
    // Progress marker, one value per line (plain `print` ran the numbers together).
    println(upper)
    //invIndex = (invIndex.toSeq ++ imd_posting.toSeq)
    //.groupBy{case(token,id) => token}
    //.mapValues(item => item.flatMap{case(token,id) => id}.toList)
}

In [None]:
// Flatten every posting list of the inverted index into one List.
// The original line ended in a dangling `.`, which does not parse; the intended
// follow-up call (if any) is unknown, so only the stray dot was removed.
invIndex.values.flatten.toList

In [None]:
// Preview: postings of the first 1000 documents grouped by token, each token
// mapped to the distinct, sorted document identifiers it occurs in.
postings(collection.take(1000))
  .groupBy { case (token, _) => token }
  .mapValues(pairs => pairs.map(_._2).distinct.sorted)

In [None]:
// Merge two postings maps: concatenate their entries, regroup by token and
// flatten the per-map identifier collections of each token into a single List.
// NOTE(review): `imd_posting1` / `imd_posting2` are not defined in any visible
// cell -- presumably two chunk-level postings maps; confirm before running.
val mergedMap2 = (imd_posting1.toSeq ++ imd_posting2.toSeq)
  .groupBy(_._1)
  .mapValues(entries => entries.flatMap { case (_, ids) => ids }.toList)

In [None]:
// Display `mergedMap3`.
// NOTE(review): no visible cell defines `mergedMap3` -- presumably left over from
// an earlier session; confirm or remove.
mergedMap3

In [None]:
// Display the merged map built in the cell that defines `mergedMap2`.
mergedMap2

In [None]:
// Frequency Index

/** One term-frequency record.
  *
  * @param term  the token
  * @param doc   name of the document the token occurs in
  * @param count number of occurrences of `term` in `doc`
  */
case class TfTuple(
    term: String,
    doc: String,
    count: Int
)
/** Expands documents into term-frequency records: for every document, one
  * `TfTuple` per distinct token, carrying the document name and the number of
  * times that token occurs in the document.
  *
  * (The original was missing the closing parenthesis of `flatMap(` and did not parse.)
  *
  * @param docs documents to count tokens in
  * @return stream of `TfTuple(term, doc, count)` records
  */
def tfTuples (docs: Stream[Document]) : Stream[TfTuple] =
    docs.flatMap { d =>
        // groupBy(identity) buckets equal tokens together; the bucket size is the term count.
        d.tokens.groupBy(identity).map {
            case (tk, lst) => TfTuple(tk, d.name, lst.length)
        }
    }
// Frequency index: term -> sorted list of (document name, occurrence count).
// `groupBy` on a Stream yields Stream-valued groups and `.sorted` keeps that
// type, so `.toList` is required to match the declared `List` value type
// (the original omitted it and did not type-check).
// NOTE(review): `stream` is not defined in any visible cell -- presumably a
// document stream such as `collection`; confirm before running.
val fqIndex : Map[String,List[(String,Int)]] =
  tfTuples(stream).groupBy(_.term)
  .mapValues(_.map(tfT => (tfT.doc, tfT.count)).toList.sorted)

In [None]:
// Scratch check: byte array of the one-character string "b"
// (the no-argument getBytes encodes with the platform default charset).
"b".getBytes

In [None]:
// Scratch list built with cons. `::` is right-associative and is a method of
// List, so the chain must end in a list (`Nil`); the original ended in the
// String "c" and did not compile.
var test = "a" :: "b" :: "c" :: Nil

In [None]:
classpath.add("org.scalaz %% scalaz-core % 7.0.6")

In [None]:
classpath.add("org.scalaz" %% "scalaz-core" % "7.0.6")



In [None]:
import scala.collection.immutable.IntMap 

In [None]:
import scalaz.Scalaz._
// Sample data: each map associates a person's name with a count.
val names = Map("Sidney" -> 1, "Paul" -> 1, "Jacob" -> 7)
val moreNames = Map("Sidney" -> 1, "Paul" -> 5, "Nick" -> 2)
// Wrap every count in a one-element List, then combine the two maps with scalaz's `|+|`.
val mergedMap4 = {
  val namesAsLists = names.map { case (name, count) => name -> List(count) }
  val moreNamesAsLists = moreNames.map { case (name, count) => name -> List(count) }
  namesAsLists |+| moreNamesAsLists
}

In [None]:
 // Sample data: each map associates a person's name with a count.
 // "Sidney" and "Paul" appear in both maps; "Jacob" and "Nick" in only one of them.
 val names = Map("Sidney" -> 1, "Paul" -> 1, "Jacob" -> 7)
 val moreNames = Map("Sidney" -> 1, "Paul" -> 5, "Nick" -> 2)

In [None]:
// Merge the two sample maps without scalaz: concatenate their entries, regroup
// by name and collect each name's counts into a List.
val mergedMap2 = (names.toSeq ++ moreNames.toSeq)
  .groupBy(_._1)
  .mapValues(entries => entries.map(_._2).toList)

In [None]:
// Build tinyir's PosIndex over the first 100 documents of the custom-tokenized collection.
val pos_i = new indexing.PosIndex(collection.take(100))

In [None]:
// Build tinyir's SimpleIndex over the first 50000 documents of the stock Tipster stream.
val simple_i = new indexing.SimpleIndex(collection2.take(50000))

In [None]:
// Inspect the ID of the document at index 10 of the stock Tipster stream
// (indexing into a Stream evaluates it up to that element).
collection2(10).ID

In [None]:
// Display the `index` member of the SimpleIndex built in the cell that defines `simple_i`.
simple_i.index

In [None]:
// Print the JVM's current memory figures in whole mebibytes
// (integer division of the byte counts by 1024*1024).
val mb = 1024*1024
val runtime = Runtime.getRuntime
println(s"Used Memory:  ${(runtime.totalMemory - runtime.freeMemory) / mb}")
println(s"Free Memory:  ${runtime.freeMemory / mb}")
println(s"Total Memory: ${runtime.totalMemory / mb}")
println(s"Max Memory:   ${runtime.maxMemory / mb}")