## Install the PyTerrier library

In [88]:
#install the Pyterrier framework
!pip install python-terrier
import pyterrier as pt

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

if not pt.started():
  pt.init()



  if not pt.started():


## Import Dataframe


In [89]:
import pandas as pd
# Widen the column display so the document text is not truncated in the notebook.
pd.set_option('display.max_colwidth', 150)

# Toy collection of five short documents: an identifier column plus the raw text.
docs_df = pd.DataFrame({
    "docno": ['d0', 'd1', 'd2', 'd3', 'd4'],
    "raw_text": [
        'The sun sets in the west, painting the sky with hues of orange and pink filled.',
        'Birds chirped merrily in the lush green forest, creating a symphony of nature.',
        'The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.',
        'Waves crashed against the rocky shore, sending sprays of salty water into the air.',
        'Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.',
    ],
})

## Data Cleaning

Before indexing our data we need to apply the following processing steps (the order in which they are applied matters, as the cells below demonstrate):


1.   **Remove stopwords.**
2.   **Normalization.**
3.   **Stemming.**


In [90]:
# Cleaning functions for English

import re
from snowballstemmer import stemmer
from sklearn.feature_extraction import _stop_words as stp


In [91]:
stp.ENGLISH_STOP_WORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [92]:
len(stp.ENGLISH_STOP_WORDS)

318

In [93]:
#removing Stop Words function
def remove_stopWords(sentence):
    """Return ``sentence`` without the tokens found in sklearn's English stop-word list.

    The sentence is split on whitespace and each token is compared to the list
    exactly as written (no lower-casing, no punctuation stripping). The
    surviving tokens are joined back with single spaces.
    """
    stop_set = set(stp.ENGLISH_STOP_WORDS)
    kept_tokens = [token for token in sentence.split() if token not in stop_set]
    return " ".join(kept_tokens)




def normalize(text, verbose=True):
    """Lower-case ``text`` and keep only ASCII letters and whitespace.

    Args:
        text: the string to normalize.
        verbose: when True (the default, which keeps the original behaviour)
            the lower-cased and the cleaned intermediate strings are printed.
            Pass False to normalize silently, e.g. when the function is
            applied to every row of a DataFrame.

    Returns:
        The normalized string: lower-cased, with every character other than
        a-z and whitespace removed, runs of whitespace collapsed to a single
        space, and no leading or trailing whitespace.
    """
    lower_string = text.lower()
    if verbose:
        print("the text in lower case: ", lower_string)
    # Remove punctuation and numbers (anything that is not a letter or whitespace)
    cleaned_string = re.sub(r'[^a-zA-Z\s]', '', lower_string)
    if verbose:
        print(" the text cleaned: ", cleaned_string)
    # split() + join collapses repeated whitespace and trims both ends
    normalized_string = ' '.join(cleaned_string.split())
    return normalized_string


# Build the Snowball stemmer; the language name passed below selects English

stemmerObj = stemmer("english")  # Use "english" or another supported language

#define the stemming function
def stem(sentence):
    """Stem every whitespace-separated token of ``sentence`` with ``stemmerObj``.

    Returns the stemmed tokens joined by single spaces.
    """
    stemmed_tokens = map(stemmerObj.stemWord, sentence.split())
    return " ".join(stemmed_tokens)


################################################## test the functions ######################################

# Apply each cleaning step on its own, always starting from the raw text,
# so the effect of every step can be compared side by side in the table below.

# stemming only
docs_df['text_STEM']=docs_df['raw_text'].apply(stem)

# normalization only (normalize also prints its intermediate strings for every row)
docs_df["NOR_text"]=docs_df["raw_text"].apply(normalize)

# stop-word removal only
docs_df["nostop_text"]=docs_df["raw_text"].apply(remove_stopWords)


print("***************************************************************************documents after stemming, normalizing, removing stopwords*********************************************************************")
display(docs_df)


the text in lower case:  the sun sets in the west, painting the sky with hues of orange and pink filled.
 the text cleaned:  the sun sets in the west painting the sky with hues of orange and pink filled
the text in lower case:  birds chirped merrily in the lush green forest, creating a symphony of nature.
 the text cleaned:  birds chirped merrily in the lush green forest creating a symphony of nature
the text in lower case:  the aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.
 the text cleaned:  the aroma of freshly brewed coffee filled the cozy cafe welcoming patrons with its warmth
the text in lower case:  waves crashed against the rocky shore, sending sprays of salty water into the air.
 the text cleaned:  waves crashed against the rocky shore sending sprays of salty water into the air
the text in lower case:  laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.
 the text cleaned: 

Unnamed: 0,docno,raw_text,text_STEM,NOR_text,nostop_text
0,d0,"The sun sets in the west, painting the sky with hues of orange and pink filled.","The sun set in the west, paint the sky with hue of orang and pink filled.",the sun sets in the west painting the sky with hues of orange and pink filled,"The sun sets west, painting sky hues orange pink filled."
1,d1,"Birds chirped merrily in the lush green forest, creating a symphony of nature.","Bird chirp merrili in the lush green forest, creat a symphoni of nature.",birds chirped merrily in the lush green forest creating a symphony of nature,"Birds chirped merrily lush green forest, creating symphony nature."
2,d2,"The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.","The aroma of fresh brew coffe fill the cozi cafe, welcom patron with it warmth.",the aroma of freshly brewed coffee filled the cozy cafe welcoming patrons with its warmth,"The aroma freshly brewed coffee filled cozy cafe, welcoming patrons warmth."
3,d3,"Waves crashed against the rocky shore, sending sprays of salty water into the air.","Wave crash against the rocki shore, send spray of salti water into the air.",waves crashed against the rocky shore sending sprays of salty water into the air,"Waves crashed rocky shore, sending sprays salty water air."
4,d4,Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.,Laughter echo through the hall as friend gather around a crackl bonfir on a starri night paint filled.,laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled,Laughter echoed halls friends gathered crackling bonfire starry night painting filled.


In [94]:
# Chain the three steps in the order: remove stop words -> normalize -> stem.
# Stop words are removed from the raw (not yet lower-cased) text here, so a
# capitalised "The" is not matched and survives into the final text
# (see rows 0 and 2 of the output, which still start with "the").
docs_df['text_NS'] = docs_df["raw_text"].apply(remove_stopWords)

# normalize the stop-word-free text
docs_df["NOR_text"] = docs_df["text_NS"].apply(normalize)

# stem the normalized text; this is the column that gets indexed later
docs_df["text"] = docs_df['NOR_text'].apply(stem)

display(docs_df[['raw_text','text']])

the text in lower case:  the sun sets west, painting sky hues orange pink filled.
 the text cleaned:  the sun sets west painting sky hues orange pink filled
the text in lower case:  birds chirped merrily lush green forest, creating symphony nature.
 the text cleaned:  birds chirped merrily lush green forest creating symphony nature
the text in lower case:  the aroma freshly brewed coffee filled cozy cafe, welcoming patrons warmth.
 the text cleaned:  the aroma freshly brewed coffee filled cozy cafe welcoming patrons warmth
the text in lower case:  waves crashed rocky shore, sending sprays salty water air.
 the text cleaned:  waves crashed rocky shore sending sprays salty water air
the text in lower case:  laughter echoed halls friends gathered crackling bonfire starry night painting filled.
 the text cleaned:  laughter echoed halls friends gathered crackling bonfire starry night painting filled


Unnamed: 0,raw_text,text
0,"The sun sets in the west, painting the sky with hues of orange and pink filled.",the sun set west paint sky hue orang pink fill
1,"Birds chirped merrily in the lush green forest, creating a symphony of nature.",bird chirp merrili lush green forest creat symphoni natur
2,"The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.",the aroma fresh brew coffe fill cozi cafe welcom patron warmth
3,"Waves crashed against the rocky shore, sending sprays of salty water into the air.",wave crash rocki shore send spray salti water air
4,Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.,laughter echo hall friend gather crackl bonfir starri night paint fill


In [95]:
# Chain the three steps in the order: normalize -> remove stop words -> stem.
# Lower-casing first lets capitalised stop words such as "The" match the
# lower-case stop-word list. Each intermediate column is named after the step
# that produced it: NOR_text = normalized, text_NS = no stop words.
docs_df["NOR_text"] = docs_df["raw_text"].apply(normalize)

docs_df["text_NS"] = docs_df["NOR_text"].apply(remove_stopWords)

# final, fully preprocessed text; this is the column that gets indexed
docs_df["text"] = docs_df["text_NS"].apply(stem)

display(docs_df[['raw_text','text']])

the text in lower case:  the sun sets in the west, painting the sky with hues of orange and pink filled.
 the text cleaned:  the sun sets in the west painting the sky with hues of orange and pink filled
the text in lower case:  birds chirped merrily in the lush green forest, creating a symphony of nature.
 the text cleaned:  birds chirped merrily in the lush green forest creating a symphony of nature
the text in lower case:  the aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.
 the text cleaned:  the aroma of freshly brewed coffee filled the cozy cafe welcoming patrons with its warmth
the text in lower case:  waves crashed against the rocky shore, sending sprays of salty water into the air.
 the text cleaned:  waves crashed against the rocky shore sending sprays of salty water into the air
the text in lower case:  laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.
 the text cleaned: 

Unnamed: 0,raw_text,text
0,"The sun sets in the west, painting the sky with hues of orange and pink filled.",sun set west paint sky hue orang pink fill
1,"Birds chirped merrily in the lush green forest, creating a symphony of nature.",bird chirp merrili lush green forest creat symphoni natur
2,"The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.",aroma fresh brew coffe fill cozi cafe welcom patron warmth
3,"Waves crashed against the rocky shore, sending sprays of salty water into the air.",wave crash rocki shore send spray salti water air
4,Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.,laughter echo hall friend gather crackl bonfir starri night paint fill


## Build Index

In [96]:

# Create a DataFrame indexer that writes its files under ./myFirstIndex.
# With blocks= True the collection statistics printed further down report "Positions: true".
# NOTE(review): confirm that DFIndexer honours the `IndexingType` keyword and what the value 2 selects;
# the index listing below contains a data.direct.bf file, and a direct index is read later in the notebook.
# NOTE(review): the lexicon printed further down contains "coff" and "sprai" while docs_df["text"] holds
# "coffe" and "spray", which suggests the indexer applies its own term processing on top of ours - TODO confirm.
indexer = pt.DFIndexer("./myFirstIndex",  IndexingType= 2 , blocks= True, overwrite=True)

# index the text, record the docnos as metadata
index_ref = indexer.index(docs_df["text"], docs_df["docno"])
# show where the index was written (displayed as the cell output)
index_ref.toString()

  indexer = pt.DFIndexer("./myFirstIndex",  IndexingType= 2 , blocks= True, overwrite=True)


'./myFirstIndex/data.properties'

### **Explore the index**
An index has several data structures:

*    **the CollectionStatistics**- the salient global statistics of the index.
*    **the Lexicon** - the vocabulary of the index, including statistics of the terms, and a pointer into the inverted index.

* **the inverted index (a PostingIndex)** - contains the posting list for each term, detailing the frequency with which a term appears in each document.
* **the DocumentIndex** - contains the length of the document (and other field lengths).  
* **the MetaIndex** - contains document metadata, such as the docno, and optionally the raw text and the URL of each document.
* **the direct index (also a PostingIndex)** - contains a posting list for each document, detailing which terms occur in that document and with which frequency. The presence of the direct index depends on the IndexingType that has been applied - single-pass and some memory indices do not provide a direct index.

Let's check the files that the indexer created.

In [97]:
!ls -lh myFirstIndex/

total 44K
-rw-r--r-- 1 root root   57 Sep 30 17:20 data.direct.bf
-rw-r--r-- 1 root root   85 Sep 30 17:20 data.document.fsarrayfile
-rw-r--r-- 1 root root   66 Sep 30 17:20 data.inverted.bf
-rw-r--r-- 1 root root 3.7K Sep 30 17:20 data.lexicon.fsomapfile
-rw-r--r-- 1 root root  513 Sep 30 17:20 data.lexicon.fsomaphash
-rw-r--r-- 1 root root  176 Sep 30 17:20 data.lexicon.fsomapid
-rw-r--r-- 1 root root   55 Sep 30 17:20 data.meta-0.fsomapfile
-rw-r--r-- 1 root root   40 Sep 30 17:20 data.meta.idx
-rw-r--r-- 1 root root   80 Sep 30 17:20 data.meta.zdata
-rw-r--r-- 1 root root 4.1K Sep 30 17:20 data.properties


Let's check the statistics about the index we created.

In [98]:
# Print the location of the index that was just built
print(index_ref.toString())
# Load the index from its reference; `index` is reused by the cells that follow
index = pt.IndexFactory.of(index_ref)
# Print the global statistics of the index (documents, terms, postings, tokens, positions)
print(index.getCollectionStatistics().toString())

./myFirstIndex/data.properties
Number of documents: 5
Number of terms: 44
Number of postings: 47
Number of fields: 0
Number of tokens: 47
Field names: []
Positions:   true



We can check the lexicon which is the **vocabulary** of the collection.

* Nt is the number of unique documents that each term occurs in.
* TF is the total number of occurrences – some weighting models use this instead of Nt.
* The numbers in the @{} are a pointer – they tell Terrier where the postings are for that term in the inverted index data structure.


In [99]:
# Walk the whole vocabulary and print every term next to its lexicon entry.
for kv in index.getLexicon():
  print(f"{kv.getKey()} -> {kv.getValue().toString()} ")


air -> term28 Nt=1 TF=1 maxTF=1 @{0 0 0} 
aroma -> term22 Nt=1 TF=1 maxTF=1 @{0 1 7} 
bird -> term15 Nt=1 TF=1 maxTF=1 @{0 2 6} 
bonfir -> term36 Nt=1 TF=1 maxTF=1 @{0 3 5} 
brew -> term26 Nt=1 TF=1 maxTF=1 @{0 5 2} 
cafe -> term18 Nt=1 TF=1 maxTF=1 @{0 6 3} 
chirp -> term11 Nt=1 TF=1 maxTF=1 @{0 7 6} 
coff -> term23 Nt=1 TF=1 maxTF=1 @{0 8 7} 
cozi -> term20 Nt=1 TF=1 maxTF=1 @{0 10 2} 
crackl -> term35 Nt=1 TF=1 maxTF=1 @{0 11 5} 
crash -> term33 Nt=1 TF=1 maxTF=1 @{0 13 2} 
creat -> term14 Nt=1 TF=1 maxTF=1 @{0 14 5} 
echo -> term37 Nt=1 TF=1 maxTF=1 @{0 16 0} 
fill -> term2 Nt=3 TF=3 maxTF=1 @{0 17 3} 
forest -> term16 Nt=1 TF=1 maxTF=1 @{0 21 6} 
fresh -> term21 Nt=1 TF=1 maxTF=1 @{0 23 1} 
friend -> term38 Nt=1 TF=1 maxTF=1 @{0 24 2} 
gather -> term39 Nt=1 TF=1 maxTF=1 @{0 25 7} 
green -> term9 Nt=1 TF=1 maxTF=1 @{0 27 4} 
hall -> term42 Nt=1 TF=1 maxTF=1 @{0 28 7} 
hue -> term8 Nt=1 TF=1 maxTF=1 @{0 30 2} 
laughter -> term43 Nt=1 TF=1 maxTF=1 @{0 31 3} 
lush -> term12 Nt=1 TF=1 

We can also look up a term in PyTerrier's lexicon:

In [100]:
index.getLexicon()["brew"].toString()

'term26 Nt=1 TF=1 maxTF=1 @{0 5 2}'

**The inverted index** tells us which documents each term occurs in.
The LexiconEntry is the pointer that tells us where to find the postings for that term in the inverted index.

Let's look at which documents the word "painting" occurs in and its frequency in each document.

**Note:** we need to preprocess each search term with the same preprocessing steps we performed on the collection.

In [101]:
#preprocess the search term
term="PainTing"
print("the term before normalization and stemming:", term)
#normalize the word
term = normalize(term)
#stem the word
term = remove_stopWords(term)
term = stem(term)


print("the term after normalization and stemming:", term)
#search the term
try:
 pointer = index.getLexicon()[term]
 for posting in index.getInvertedIndex().getPostings(pointer):
    print(posting.toString() + " doclen=%d" % posting.getDocumentLength())
except:
    print("term %s not found"%term)

the term before normalization and stemming: PainTing
the text in lower case:  painting
 the text cleaned:  painting
the term after normalization and stemming: paint
(0,1,B[3]) doclen=9
(4,1,B[9]) doclen=11


How many documents does the term occur in? (`term` still holds the preprocessed query from the previous cell, i.e. "paint".)

In [102]:
index.getLexicon()[term].getDocumentFrequency()

2

In [103]:
index.getLexicon()[term].getFrequency()

2

What terms occur in the 4th document?

In [104]:
# List the terms of one document by reading its posting list from the direct index.
di = index.getDirectIndex()      # per-document posting lists
doi = index.getDocumentIndex()   # gives the document entry for a docid
lex = index.getLexicon()         # used to map term ids back to term strings
docid = 3 #docids are 0-based #note: postings will be null if the document is empty
for posting in di.getPostings(doi.getDocumentEntry(docid)):
    # in this posting list the id of each posting is used as a term id for the lexicon lookup
    termid = posting.getId()
    lee = lex.getLexiconEntry(termid)
    print("%s with frequency %d" % (lee.getKey(),posting.getFrequency()))


rocki with frequency 1
air with frequency 1
water with frequency 1
shore with frequency 1
salti with frequency 1
sprai with frequency 1
crash with frequency 1
wave with frequency 1


### **Exercise 1**
How can we update our index to include the positions of the terms in the index? Hint: you can use [PyTerrier documentation](https://pyterrier.readthedocs.io/_/downloads/en/latest/pdf/) as a reference.

In [105]:
# Exercise 1: rebuild the index with blocks= True so that term positions are stored
# (postings printed from such an index look like "(0,1,B[3])").
# NOTE(review): the `index` object loaded earlier is not reloaded after this rebuild; later cells keep
# using it - consider calling pt.IndexFactory.of(index_ref) again. TODO confirm.
indexer = pt.DFIndexer("./myFirstIndex",  IndexingType= 2, blocks= True, overwrite=True)

# index the text, record the docnos as metadata
index_ref = indexer.index(docs_df["text"], docs_df["docno"])
# show where the index was written (displayed as the cell output)
index_ref.toString()

  indexer = pt.DFIndexer("./myFirstIndex",  IndexingType= 2, blocks= True, overwrite=True)


'./myFirstIndex/data.properties'

### **Exercise 2**
Retrieve the postings for the term "filled" and specify the position of the term in each posting.

In [134]:
# Preprocess the search term with the same steps that were applied to the documents
term="filled"
print("the term before normalization and stemming:", term)
# 1. normalize the word (lower-case, drop punctuation and digits)
term= normalize(term)
# 2. remove stop words (leaves an empty string if the word itself is a stop word)
term=remove_stopWords(term)
# 3. stem the word
term=stem(term)


print("the term after normalization and stemming:", term)
# Look the term up in the lexicon. Only the lookup is guarded: a missing term is
# reported, while any real error raised when reading the postings is no longer
# swallowed and mis-reported as "term not found".
try:
    pointer = index.getLexicon()[term]
except KeyError:
    print("term %s not found"%term)
else:
    for posting in index.getInvertedIndex().getPostings(pointer):
        # String form of a posting, e.g. "(0,1,B[3])"; the position sits between the brackets
        x= posting.toString()
        # Find the index of "[" and "]"
        start_pos = x.find("[")
        end_pos = x.find("]")
        # Extract the text between '[' and ']'
        number = x[start_pos + 1:end_pos]
        print("the term position is (start count from 0):", number," in the docID = %d" % posting.getId())
        print(" where the doc len is %d "% posting.getDocumentLength())

the term before normalization and stemming: filled
the text in lower case:  filled
 the text cleaned:  filled
the term after normalization and stemming: fill
the term position is (start count from 0): 8  in the docID = 0
 where the doc len is 9 
the term position is (start count from 0): 4  in the docID = 2
 where the doc len is 10 
the term position is (start count from 0): 10  in the docID = 4
 where the doc len is 11 


In [108]:
display(docs_df["text"])

Unnamed: 0,text
0,sun set west paint sky hue orang pink fill
1,bird chirp merrili lush green forest creat symphoni natur
2,aroma fresh brew coffe fill cozi cafe welcom patron warmth
3,wave crash rocki shore send spray salti water air
4,laughter echo hall friend gather crackl bonfir starri night paint fill
