### Load Libraries  

In [1]:
#install the Pyterrier framework
!pip install python-terrier



In [2]:
import pyterrier as pt
if not pt.started():
  pt.init()

  if not pt.started():
Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.10 (build: craigm 2024-08-22 17:33), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [3]:
#install the Arabic stop words library
!pip install Arabic-Stopwords



In [4]:
#we need to import the following libraries.
import pandas as pd
#to display the full text on the notebook without truncation
pd.set_option('display.max_colwidth', 150)
import numpy as np
import re
from snowballstemmer import stemmer
from tqdm import tqdm
import arabicstopwords.arabicstopwords as stp

### Data Cleaning

In [5]:
######################### removing Stop Words function ####################################

def remove_stop_words(sentence):
    terms=[]
    stopWords= set(stp.stopwords_list())
    for term in sentence.split() :
        if term not in stopWords :
           terms.append(term)
    return " ".join(terms)

############################# normalize function ###########################################

def normalize(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    return(text)

############################# stemming function ############################################

ar_stemmer = stemmer("arabic")
def stem(sentence):
    return " ".join([ar_stemmer.stemWord(i) for i in sentence.split()])


############################# perform first group of preprocessing steps ###############################


def preprocess1(sentence):
  sentence =remove_stop_words(sentence)
  sentence =normalize(sentence)
  return sentence


############################# normalize_tweets function ############################################


def normalize_tweets(text):
   if text is np.nan:
    return ""
   else:
    text = re.sub(r"http\S+", " ", text) # remove urls
    text = re.sub(r"RT ", " ", text) # remove rt
    text = re.sub(r"@[\w]*", " ", text) # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text) # remove special characters
    text = re.sub(r'\t', ' ', text) # remove tabs
    text = re.sub(r'\n', ' ', text) # remove line jump
    text = re.sub(r"\s+", " ", text) # remove extra white space
    accents = re.compile(r'[\u064b-\u0652\u0640]') # harakaat and tatweel (kashida) to remove

    arabic_punc= re.compile(r'[\u0621-\u063A\u0641-\u064A\d+]+') # Keep only Arabic letters/do not remove numbers
    text=' '.join(arabic_punc.findall(accents.sub('',text)))
    text = text.strip()
    return text

############################# perform Second group of preprocessing steps ###############################

def preprocess2(sentence):
  sentence =remove_stop_words(sentence)
  sentence =normalize_tweets(sentence)
  return sentence

### Load Dataset

In [6]:

dataset_links=["https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-01.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-02.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-03.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-04.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-05.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-06.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-07.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-08.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-09.txt",
               "https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/tweets/evetar-q-10.txt"]

full_data=pd.DataFrame()
for i in tqdm(range(len(dataset_links))):
    tweets=pd.read_csv(dataset_links[i], sep='\t')
    full_data=pd.concat([full_data,tweets],ignore_index=True)
full_data.reset_index(inplace=True,drop=True)

#the docno will be our tweetID
full_data["docno"]=full_data["tweetID"].astype(str)


100%|██████████| 10/10 [00:02<00:00,  4.71it/s]


In [7]:
# preparing the data for index
full_data['Cleaned_text']=full_data['tweetText'].apply(preprocess2)

full_data

Unnamed: 0,tweetID,tweetText,docno,Cleaned_text
0,549679192804061184,"الاعدام لعامل مطعم قتل زميله طعناً في ""البيادر"" أيدت محكمة التمييز الحكم الصادر عن محكمة الجنايات الكبرى والقاضي... http://t.co/H0txdjv3Kn",549679192804061184,الاعدام لعامل مطعم قتل زميله طعنا البيادر أيدت محكمة التمييز الحكم الصادر محكمة الجنايات الكبرى والقاضي
1,549699343666532352,#الأخبار ▪ تأجيل محاكمة 7 إرهابيين بسبب غياب الدفاع: أجلت محكمة الجنايات بالعاصمة إلى تاريخ لاحق محاكمة سبعة إ... http://t.co/GM4jmpAWbR,549699343666532352,الأخبار تأجيل محاكمة 7 إرهابيين بسبب غياب الدفاع أجلت محكمة الجنايات بالعاصمة تاريخ لاحق محاكمة سبعة إ
2,549711593487888387,@helale9999 عشآن أعطيتك وحده صميم صرت ترمي أعذار ...حقق العالميةة و أرجع كلمني يَ الأياب الانتحاري,549711593487888387,عشآن أعطيتك وحده صميم صرت ترمي أعذار حقق العالميةة أرجع كلمني ي الأياب الانتحاري
3,549719610459967488,#النهدي ثمانية قتلى في تفجير انتحاري بسيارة مفخخة أمام معملين للغاز في ريف حمص - شبكة الصين http://t.co/r5zFEuzAPu,549719610459967488,النهدي ثمانية قتلى تفجير انتحاري بسيارة مفخخة معملين للغاز ريف حمص شبكة الصين
4,549720880717508608,البحرين: ضبط مطلوبين متورطين في التفجير بالعكر الشرقي بقية الموضوع اضغط هنا http://t.co/t4A5bNrqyh,549720880717508608,البحرين ضبط مطلوبين متورطين التفجير بالعكر الشرقي بقية الموضوع اضغط
...,...,...,...,...
49995,561985373048299520,مواسيا الشعب السعودي..حاكم دبي يبدأ جلسة مجلس الوزراء بقراءة الفاتحة على الملك عبدالله #الخبر #السعودية #saudi #ksa,561985373048299520,مواسيا الشعب السعودي حاكم دبي يبدأ جلسة مجلس الوزراء بقراءة الفاتحة الملك عبدالله الخبر السعودية
49996,561987332878766081,@al_shalal @F_D_A82 تم تفجير صماخنا,561987332878766081,تم تفجير صماخنا
49997,561988825186971650,@aubyazid123 جزاك الله ألف خير ❌ جزاك الله خير ✔️ - كلمة ألف فيها تحجير لخير الله.,561988825186971650,جزاك الله ألف خير جزاك الله خير كلمة ألف تحجير لخير الله
49998,561991173360091136,كيف نفّذت «النصرة» عمليّة تفجير الحافلة اللبنانية في دمشق؟ http://t.co/TEmP1Dso1v,561991173360091136,نفذت النصرة عملية تفجير الحافلة اللبنانية دمشق


load queries (topics titles) that are already defined and released with EveTAR dataset

In [8]:
#read the topics file from Github and use the titles as queries
topics=pd.read_csv("https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/topics.txt", sep='\t',names=['data'])
queries=[]
qid=[]
#we will get the queries and their ids from the topics file
for i in range(len(topics)):
    splitted=topics["data"][i:i+1][i].split(" ")
    if splitted[0]=="<title>":
       queries.append(' '.join(splitted[1:]))
    if splitted[0]=="<num>":
       qid.append(splitted[2])

queriesDF=pd.DataFrame()
queriesDF["qid"]=qid
queriesDF["query"]=queries

queriesDF

Unnamed: 0,qid,query
0,E01,تفجير انتحاري في اب
1,E02,ليتوانيا تستخدم اليورو بدل الليتاس
2,E03,فلسطين تطلب الانضمام للمحكمة الجنائية الدولية
3,E04,وفاة أبو أنس الليبي في نيويورك
4,E05,اختراق كوريا الشمالية حسابات سوني
5,E06,بناء أول كنيسة في إسطنبول منذ قرن
6,E07,باباندريو يؤسس حزب جديد
7,E08,بوكو حرام تخطف شباب في نيجريا
8,E09,سيطرة بوكو حرام على قاعدة عسكرية في نيجيريا
9,E10,مقتل مسلحين في غارات في باكستان


In [9]:
# apply the same prepration steps on the queries
queriesDF["query"]=queriesDF["query"].apply(preprocess2)
queriesDF

Unnamed: 0,qid,query
0,E01,تفجير انتحاري اب
1,E02,ليتوانيا تستخدم اليورو بدل الليتاس
2,E03,فلسطين تطلب الانضمام للمحكمة الجنائية الدولية
3,E04,وفاة أنس الليبي نيويورك
4,E05,اختراق كوريا الشمالية حسابات سوني
5,E06,بناء كنيسة إسطنبول قرن
6,E07,باباندريو يؤسس حزب جديد
7,E08,بوكو حرام تخطف شباب نيجريا
8,E09,سيطرة بوكو حرام قاعدة عسكرية نيجيريا
9,E10,مقتل مسلحين غارات باكستان


### Create index

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
indexer = pt.DFIndexer("/content/drive/MyDrive/myFirstIndex", overwrite=True)
indexer.setProperty("tokeniser", "UTFTokeniser")

index_ref = indexer.index(full_data['Cleaned_text'], full_data["docno"])
index_ref.toString()

  indexer = pt.DFIndexer("/content/drive/MyDrive/myFirstIndex", overwrite=True)


20:09:49.399 [main] WARN org.terrier.structures.indexing.Indexer -- Indexed 40660 empty documents


'/content/drive/MyDrive/myFirstIndex/data.properties'

In [13]:
# we can use this command to load the index after creation on our drive
#index_ref = pt.IndexRef.of("/content/drive/MyDrive/myFirstIndex/data.properties")

index = pt.IndexFactory.of(index_ref)

### Retrieval Models


In [14]:
#set up our retieval model by specifing Hiemstra Language Model as wmodel and limiting the number of results for each query top 100 documents

JM_retr = pt.BatchRetrieve(index,wmodel="Hiemstra_LM",num_results=1000)


# set up Lambda to 0.95

JM_retr_highLambda = pt.BatchRetrieve(index,wmodel="Hiemstra_LM",controls ={"c":0.95},num_results=1000)


#set up our retieval model by specifing TF_IDF as wmodel and limiting the number of retrieved results for each query top 100 documents

tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"},num_results=1000)


#specify BM25 as wmodel
bm25_retr = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"},num_results=1000)




  JM_retr = pt.BatchRetrieve(index,wmodel="Hiemstra_LM",num_results=1000)
  JM_retr_highLambda = pt.BatchRetrieve(index,wmodel="Hiemstra_LM",controls ={"c":0.95},num_results=1000)
  tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"},num_results=1000)
  bm25_retr = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"},num_results=1000)


### Testing


In [15]:
#RetrIEve using the Jelinek-Mercer smoothing where lambda=0.15 (default)
JM_res=JM_retr.transform(queriesDF)


#RetrIEve using the Jelinek-Mercer smoothing where lambda= 0.95 (default)
JM_res_Lambda=JM_retr_highLambda.transform(queriesDF)
JM_res_Lambda[:3]


Unnamed: 0,qid,docid,docno,rank,score,query
0,E39,127,550045141746987008,0,3.711443,فوز رونالدو بجائزة الكرة الذهبية 2014
1,E39,213,550202293598294016,1,3.711443,فوز رونالدو بجائزة الكرة الذهبية 2014
2,E39,943,550421966239170560,2,3.711443,فوز رونالدو بجائزة الكرة الذهبية 2014


In [16]:
type(JM_res_Lambda.at[0,'qid'])

str

In [17]:
#the queries dataframe should have qid and query columns
tfidf_res=tfidf_retr.transform(queriesDF)
tfidf_res[:10]

Unnamed: 0,qid,docid,docno,rank,score,query
0,E39,994,550439545091543040,0,2.235234,فوز رونالدو بجائزة الكرة الذهبية 2014
1,E39,6364,551738982724157440,1,2.235234,فوز رونالدو بجائزة الكرة الذهبية 2014
2,E39,26204,554715041652436992,2,2.235234,فوز رونالدو بجائزة الكرة الذهبية 2014
3,E39,26731,554789416011857920,3,2.235234,فوز رونالدو بجائزة الكرة الذهبية 2014
4,E39,26897,554858240844918784,4,2.235234,فوز رونالدو بجائزة الكرة الذهبية 2014
5,E39,28000,555027643918082049,5,2.235234,فوز رونالدو بجائزة الكرة الذهبية 2014
6,E39,127,550045141746987008,6,2.159215,فوز رونالدو بجائزة الكرة الذهبية 2014
7,E39,213,550202293598294016,7,2.159215,فوز رونالدو بجائزة الكرة الذهبية 2014
8,E39,943,550421966239170560,8,2.159215,فوز رونالدو بجائزة الكرة الذهبية 2014
9,E39,951,550423736533671936,9,2.159215,فوز رونالدو بجائزة الكرة الذهبية 2014


In [18]:

#the queries dataframe should have qid and query columns
bm25_res=bm25_retr.transform(queriesDF)
bm25_res[:10]

Unnamed: 0,qid,docid,docno,rank,score,query
0,E39,994,550439545091543040,0,4.090013,فوز رونالدو بجائزة الكرة الذهبية 2014
1,E39,6364,551738982724157440,1,4.090013,فوز رونالدو بجائزة الكرة الذهبية 2014
2,E39,26204,554715041652436992,2,4.090013,فوز رونالدو بجائزة الكرة الذهبية 2014
3,E39,26731,554789416011857920,3,4.090013,فوز رونالدو بجائزة الكرة الذهبية 2014
4,E39,26897,554858240844918784,4,4.090013,فوز رونالدو بجائزة الكرة الذهبية 2014
5,E39,28000,555027643918082049,5,4.090013,فوز رونالدو بجائزة الكرة الذهبية 2014
6,E39,127,550045141746987008,6,3.950914,فوز رونالدو بجائزة الكرة الذهبية 2014
7,E39,213,550202293598294016,7,3.950914,فوز رونالدو بجائزة الكرة الذهبية 2014
8,E39,943,550421966239170560,8,3.950914,فوز رونالدو بجائزة الكرة الذهبية 2014
9,E39,951,550423736533671936,9,3.950914,فوز رونالدو بجائزة الكرة الذهبية 2014


### **Evaluating our results**
To evaluate the results we need qrels (relevance judgements). The qrels should be in [TREC format](https://trec.nist.gov/).

In [19]:
qrels=pd.read_csv("https://raw.githubusercontent.com/telsayed/IR-in-Arabic/master/Summer2021/data/EveTAR/qrels.txt", sep='\t',names=['qid','Q0','docno','label'])
qrels['docno']=qrels['docno'].astype(str)
qrels['qid']=qrels['qid'].astype(str)

# qrels are in TREC format
#qrels = qrels[qrels["docno"].isin(full_data["docno"].tolist())] # to choose qrels for the chosen 50k documents
qrels[:10]

Unnamed: 0,qid,Q0,docno,label
0,E01,Q0,549711593487888387,0
1,E01,Q0,549719610459967488,0
2,E01,Q0,549720880717508608,0
3,E01,Q0,549724286575861761,0
4,E01,Q0,549780004230283264,0
5,E01,Q0,549846940284764161,0
6,E01,Q0,549879792745476096,0
7,E01,Q0,549880138033135617,0
8,E01,Q0,549885694814863360,0
9,E01,Q0,549887667492503554,0


In [20]:
from pyterrier.measures import *


# Here, we are evaluating TF_IDF retrieval model
eval = pt.Utils.evaluate(tfidf_res,qrels[['qid','docno','label']],metrics =[P@1,P@10,NDCG@5, R@20,R@1000])
eval

  eval = pt.Utils.evaluate(tfidf_res,qrels[['qid','docno','label']],metrics =[P@1,P@10,NDCG@5, R@20,R@1000])


{'P@1': 0.02,
 'P@10': 0.012,
 'nDCG@5': 0.014454531452899037,
 'R@20': 0.00087248322147651,
 'R@1000': 0.007986577181208054}

In [21]:
evalـJM_res_Lambda = pt.Utils.evaluate(JM_res_Lambda,qrels[['qid','docno','label']],metrics =[P@1,P@10,NDCG@5, R@20,R@1000])

evalـJM_res_Lambda

  evalـJM_res_Lambda = pt.Utils.evaluate(JM_res_Lambda,qrels[['qid','docno','label']],metrics =[P@1,P@10,NDCG@5, R@20,R@1000])


{'P@1': 0.0,
 'P@10': 0.016,
 'nDCG@5': 0.013216795894527678,
 'R@20': 0.0009395973154362416,
 'R@1000': 0.007986577181208054}

In [22]:
evalـJM_res = pt.Utils.evaluate(JM_res,qrels[['qid','docno','label']],metrics =[P@1,P@10,NDCG@5, R@20,R@1000])

evalـJM_res

  evalـJM_res = pt.Utils.evaluate(JM_res,qrels[['qid','docno','label']],metrics =[P@1,P@10,NDCG@5, R@20,R@1000])


{'P@1': 0.0,
 'P@10': 0.016,
 'nDCG@5': 0.013216795894527678,
 'R@20': 0.0009395973154362416,
 'R@1000': 0.007986577181208054}

In [23]:

# Here, we are evaluating BM25 retrieval model
eval = pt.Utils.evaluate(bm25_res,qrels[['qid','docno','label']],metrics =[P@1,P@10,NDCG@5, R@20,R@100])
eval

  eval = pt.Utils.evaluate(bm25_res,qrels[['qid','docno','label']],metrics =[P@1,P@10,NDCG@5, R@20,R@100])


{'P@1': 0.02,
 'P@10': 0.012,
 'nDCG@5': 0.014454531452899037,
 'R@20': 0.00087248322147651,
 'R@100': 0.005771812080536913}

In [24]:

# show the results to togethor with all models
pt.Experiment(
[JM_retr,JM_retr_highLambda ,bm25_retr, tfidf_retr],
queriesDF,
qrels,
eval_metrics=["map","recall","P"],
names=["JM_retr","JM_retr_highLambda","bm25_retr","tfidf_retr"]
)


Unnamed: 0,name,map,R@5,R@10,R@15,R@20,R@30,R@100,R@200,R@500,R@1000,P@5,P@10,P@15,P@20,P@30,P@100,P@200,P@500,P@1000
0,JM_retr,0.006477,0.000268,0.000537,0.000738,0.00094,0.001409,0.005973,0.007987,0.007987,0.007987,0.016,0.016,0.014667,0.014,0.014,0.0178,0.0119,0.00476,0.00238
1,JM_retr_highLambda,0.006477,0.000268,0.000537,0.000738,0.00094,0.001409,0.005973,0.007987,0.007987,0.007987,0.016,0.016,0.014667,0.014,0.014,0.0178,0.0119,0.00476,0.00238
2,bm25_retr,0.006186,0.000201,0.000403,0.000671,0.000872,0.001342,0.005772,0.007987,0.007987,0.007987,0.012,0.012,0.013333,0.013,0.013333,0.0172,0.0119,0.00476,0.00238
3,tfidf_retr,0.006186,0.000201,0.000403,0.000671,0.000872,0.001342,0.005772,0.007987,0.007987,0.007987,0.012,0.012,0.013333,0.013,0.013333,0.0172,0.0119,0.00476,0.00238


### **References**


* [PyTerrier  retrieval and evaluation notebook](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/retrieval_and_evaluation.ipynb).
*   [PyTerrier documentation.](https://pyterrier.readthedocs.io/_/downloads/en/latest/pdf/)

* Tamer Elsayed, Lectures notes and Labs, Qatar University  
