# Information retrieval

## Boolean Model of IR



# Load Dataset

In [15]:
# Fetch the repository of Persian text-mining corpora; it contains the FarsNews 97 archive.
!git clone https://github.com/Text-Mining/Useful-Corpora-for-Text-Mining-in-Persian-Language.git
# Unpack the multi-part RAR archive starting from part01; per the recorded output this
# extracts farsnews.json, which the loading cell reads from /content/farsnews.json.
!unrar x '/content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar'

Cloning into 'Useful-Corpora-for-Text-Mining-in-Persian-Language'...
remote: Enumerating objects: 18, done.[K
remote: Total 18 (delta 0), reused 0 (delta 0), pack-reused 18[K
Unpacking objects: 100% (18/18), 478.78 MiB | 10.52 MiB/s, done.
Updating files: 100% (8/8), done.

UNRAR 5.61 beta 1 freeware      Copyright (c) 1993-2018 Alexander Roshal


Extracting from /content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar

Extracting  farsnews.json                                                  0%  1%  2%  3%  4%  5%  6%  7%  8%  9% 10% 11% 12% 13% 14% 15% 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31% 32% 33% 34% 35% 36% 37% 38%

Extracting from /content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part02.rar

...         farsnews.json        

In [16]:
import json
# Keep only the body of each article (plus a running id) so the corpus fits in RAM.
news = []
counter = 1
# The dump is parsed one JSON object per line; 'utf-8-sig' drops a leading BOM.
# The context manager closes the file handle as soon as the loop is done.
with open('/content/farsnews.json', 'r', encoding='utf-8-sig') as corpus_file:
  for line in corpus_file:
    # A blank line (e.g. a trailing newline at end of file) is not a record.
    if not line.strip():
      continue
    doc = json.loads(line)
    # Records without a body are skipped and do not consume an id.
    if 'NewsBody' not in doc:
      continue
    news.append({'NewsBody': doc['NewsBody'], 'id': counter})
    counter += 1


# Pre-processing Pipeline

In [9]:
import pandas as pd
# Persian stop-word list published with hazm. It is loaded for optional token
# filtering; that filter is currently disabled, so the list is not used in this assignment.
stopwords = (
  pd.read_fwf(
    'https://raw.githubusercontent.com/sobhe/hazm/master/hazm/data/stopwords.dat',
    header=None,
  )[0]
  .to_list()
)

In [8]:
import re
from string import punctuation as punctuation_str

def remove_symbols_and_numbers(content):
  """Strip the end-of-message footer, punctuation and digits from `content`.

  Each known end-of-message marker is searched for in the last 200 characters
  of the text. When one is found, the text is cut one character before the
  marker (behaviour kept from the original code; presumably that character is
  the separator preceding the footer -- confirm). Afterwards ASCII punctuation,
  the listed Persian/Arabic symbols and the listed digits are removed.

  Args:
    content: raw article text.

  Returns:
    The cleaned text as a new string.
  """
  end_of_msg = ('انتهای پیام', 'انتهای‌پیام', '\r\nانتهای\r\nپیام/ک')
  tail_len = 200
  for w in end_of_msg:
    # Search by absolute position so texts shorter than `tail_len` are cut at the
    # marker too; a slice relative to a fixed 200-character tail would
    # over-shoot on short texts and erase them completely.
    tail_start = max(len(content) - tail_len, 0)
    i = content.find(w, tail_start)
    if i != -1:
      content = content[:max(i - 1, 0)]

  # Remove punctuation & numbers.
  # string.punctuation contains '\', ']' and '-', which are special inside a
  # character class; escaping makes every one of them (backslash included) literal.
  symbols = re.escape(punctuation_str) + '؟!،,?،٪×÷»«><'
  digits_and_marks = '0123456789۰١۱۲۳۴۵۶۷۸۹؛–_‘'
  content = re.sub(f'[{symbols}{digits_and_marks}]', '', content)

  return content

In [10]:
from IPython.display import clear_output 

def pipeline(text, normalizer, tokenizer, stemmer, method_name, id):
  """Run one document through cleaning, normalisation, tokenisation and stemming.

  Args:
    text: raw document text.
    normalizer: callable applied to the symbol-free text.
    tokenizer: callable turning the normalised text into tokens.
    stemmer: callable applied to each kept token.
    method_name: label of the toolkit in use; currently not stored in the result.
    id: identifier copied into the result.

  Returns:
    A dict with 'stems' (list of stemmed tokens) and 'id'. Intermediate stages
    are deliberately not kept, to limit memory use.
  """
  cleaned = remove_symbols_and_numbers(text)
  normalized = normalizer(cleaned)

  # Stop-word filtering is left out on purpose; only tokens shorter than
  # three characters are dropped.
  kept_tokens = [tok for tok in tokenizer(normalized) if len(tok) >= 3]
  stems = list(map(stemmer, kept_tokens))

  return {'stems': stems, 'id': id}

# Install Hazm and Preprocessing

In [11]:
!pip install hazm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.7/316.7 KB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 KB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394484 sha256=8af9dae7c8fb375669c29

In [12]:
from hazm import Normalizer, WordTokenizer, Stemmer
# Shared hazm components, created once and reused for every document.
# NOTE: `hzam_normalizer` is misspelled ("hzam"), but the preprocessing cell refers to
# it by this exact name, so the spelling is kept.
hzam_normalizer = Normalizer()
hazm_tokenizer = WordTokenizer()
hazm_stemmer = Stemmer()

In [17]:
# Pre-process only the first 1000 articles. Slicing (rather than indexing over
# range(0, 1000)) keeps this cell working when the corpus holds fewer documents.
# Each document's id is its position in `news`.
results = [
  pipeline(article['NewsBody'], hzam_normalizer.normalize, hazm_tokenizer.tokenize, hazm_stemmer.stem, 'hazm', i)
  for i, article in enumerate(news[:1000])
]

# Posting List





In [18]:

def posting_list(pre_processed_text):
  """Build an inverted index from pre-processed documents.

  Args:
    pre_processed_text: sequence of dicts, each with a 'stems' list.

  Returns:
    Dict mapping each stem to {'freq': total number of occurrences over all
    documents, 'doc-tf': list of (document position, occurrences in that
    document)}. Documents are identified by their position in the input.
  """
  index = {}
  for doc_pos, doc in enumerate(pre_processed_text):
    per_doc_counts = {}
    for stem in doc['stems']:
      per_doc_counts[stem] = per_doc_counts.get(stem, 0) + 1
      entry = index.get(stem)
      if entry is None:
        index[stem] = {'freq': 1}
      else:
        entry['freq'] += 1

    # Postings are appended once per document, after its counts are final.
    for stem, count in per_doc_counts.items():
      index[stem].setdefault('doc-tf', []).append((doc_pos, count))
  return index

In [19]:
# Build the inverted index over the pre-processed documents.
pl = posting_list(results)

# Print every term with its total frequency and its (document, term-frequency) postings.
# NOTE(review): this emits one line per vocabulary term, so the output is very long.
for token, info in pl.items():
  print(f"{token} -> freq: {info['freq']}, posting list: {info['doc-tf']}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
مجار -> freq: 1, posting list: [(447, 1)]
هلال‌احمر -> freq: 14, posting list: [(447, 3), (488, 5), (995, 6)]
احمر -> freq: 21, posting list: [(447, 7), (488, 3), (949, 10), (995, 1)]
ششطراز -> freq: 1, posting list: [(447, 1)]
برگزارشد -> freq: 1, posting list: [(447, 1)]
امدادرسان -> freq: 4, posting list: [(447, 3), (732, 1)]
رضاپور -> freq: 1, posting list: [(447, 1)]
کندر -> freq: 1, posting list: [(447, 1)]
افزوده_شده_اس -> freq: 1, posting list: [(447, 1)]
دیدگ -> freq: 2, posting list: [(447, 1), (771, 1)]
دادرس -> freq: 2, posting list: [(447, 1), (799, 1)]
فراگرفته‌اند -> freq: 1, posting list: [(447, 1)]
کارو -> freq: 18, posting list: [(447, 2), (479, 14), (822, 1), (936, 1)]
خبرداد -> freq: 2, posting list: [(447, 1), (610, 1)]
بازآموز -> freq: 3, posting list: [(447, 1), (488, 1), (679, 1)]
بردسکندرنشس -> freq: 1, posting list: [(447, 1)]
قاسمپور -> freq: 2, posting list: [(447, 2)]
م‌یدهد -> freq: 1, postin

In [20]:
# Vocabulary size: number of distinct terms in the index.
len(pl)

15789

# Boolean Model

In [21]:
def binary_matrix(posting_list, num_docs=1000):
  """Turn an inverted index into term-document incidence vectors.

  Args:
    posting_list: dict mapping each term to a dict whose 'doc-tf' entry is a
      list of (document id, term frequency) pairs.
    num_docs: length of every vector, i.e. the number of documents in the
      collection. Defaults to 1000, the size used throughout this notebook.

  Returns:
    A list with one vector per term, in the iteration order of `posting_list`.
    Position d of a vector is 1 when the term occurs in document d, else 0.

  Raises:
    IndexError: if a posting refers to a document id >= num_docs.
  """
  vectors = []
  for info in posting_list.values():
    # Start from an all-zero row and switch on the documents in the postings.
    vector = [0] * num_docs
    for doc_id, _tf in info['doc-tf']:
      vector[doc_id] = 1
    vectors.append(vector)
  return vectors

In [22]:
import pandas as pd

# Term-document incidence matrix: one row per term, one column per document id.
docIds = pd.Series(range(1000))
df = pd.DataFrame(binary_matrix(pl), index=list(pl), columns=docIds)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
محمد,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
نور,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
گف,1,0,1,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,0,0
وگو,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
خبرنگار,1,0,0,1,1,0,0,0,1,0,...,1,1,0,1,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
موکد,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
راهرو,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
مدبر,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
سرجیو,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
import functools

def boolean_model_IR(query):
  """Evaluate a conjunctive Boolean query against the term-document matrix `df`.

  The query is a '.'-separated list of terms that are AND-ed together. A term
  written with a leading or trailing '!' is negated (NOT term).

  Args:
    query: query string, e.g. "term1.!term2.term3".

  Returns:
    List of the column labels of `df` (document ids) that satisfy every term.
    The list is empty when nothing matches or the query contains no terms.
  """
  stemmer = Stemmer()  # one instance for the whole query instead of one per term
  vectors = list()

  for raw_item in query.split("."):
    raw_item = raw_item.strip()
    negated = raw_item.startswith('!') or raw_item.endswith('!')
    # Drop the negation marker BEFORE stemming so the stemmer sees the bare word
    # (previously 'word!' was stemmed with the marker still attached).
    bare = raw_item.replace('!', '') if negated else raw_item
    if not bare:
      # Empty item (e.g. "a..b" or a trailing '.'): nothing to look up.
      continue
    term = stemmer.stem(bare)

    if term in df.index:
      vector = df.loc[term]
    else:
      # Out-of-vocabulary term: it occurs in no document, so use an all-zero
      # row instead of letting df.loc raise KeyError.
      vector = pd.Series(0, index=df.columns)

    # Negation flips the 0/1 incidence vector.
    vectors.append(1 - vector if negated else vector)

  if not vectors:
    return []

  # AND all term vectors together.
  result = functools.reduce(lambda a, b: a & b, vectors)

  # Keep the labels (document ids) whose combined value is non-zero.
  return result.index[result.to_numpy().astype(bool)].tolist()

## Queries

In [24]:
# Test queries for the Boolean model, keyed by query id.
# Syntax: terms are separated by '.' and AND-ed together; a term prefixed with '!' is negated.
queries = {
'Q1': "ایران.!کشاورز.تیم",
'Q2': "فوتبال.تیم.قهرمان",
'Q3': "کشتی.!فرهنگی.مدال",
'Q4': "شبکه.!فروشگاه.!نمایش.فناوری.!کشاورز",
'Q5': "دینی.!مدارس.حجاب", 
'Q6': "کاراته.!پسر.رزومه",
'Q7': "آمریکا.!دموکراسی.ونزوئلا.!رژیم",
'Q8' : "پزشکی.!رفاه.!مدیریت.!کارمند.!علوم.سلامت.بیمار",
'Q9' : "آسفالت.خیابان.خودرو.ترافیک",
'Q10' : "لیگ.بازی.!حذفی.!استقلال.امتیاز.!برنده.!مهاجم",
'Q11' : "علم.مالی.!سیاست.محققان",
'Q12' : "آذربایجان.!جنگ.!پیروزی.نظامی.ایران",
'Q13' : "دانشگاه.دولتی.!آزاد.آزمون",
'Q14' : "توریست.طبیعی.گردشگری.تاریخی",
'Q15' : "وزارت.تعاون.کار.رفاه.اجتماعی",
'Q16' : "سپاه.انقلاب.جنگ.دولت"
}

In [25]:
# Show, for every query, the comma-separated ids of the matching documents.
for q_id, q_items in queries.items():
  print(q_id, ':', ','.join(str(doc) for doc in boolean_model_IR(q_items)))

Q1 : 45,47,55,66,74,104,142,226,242,270,289,322,328,340,352,368,395,428,436,441,461,482,556,557,623,642,673,687,690,728,737,761,789,811,822,855,891,892,910,919,954,984,986
Q2 : 156,176,270,352,458,473,711,763,855
Q3 : 557,761,789
Q4 : 1,186,330,531,979
Q5 : 622,967
Q6 : 44
Q7 : 385,512,646,659,839,856
Q8 : 65,282,324,787
Q9 : 
Q10 : 70,482,535,542,551,711,720,774,898
Q11 : 910
Q12 : 
Q13 : 
Q14 : 968
Q15 : 449,569,664,752,940
Q16 : 330,442


In [26]:
# Save the same "<query id> : <matching document ids>" lines to a text file.
with open('/content/output.txt', 'w') as file:
  for q_id, q_items in queries.items():
    print(q_id, ':', ','.join(str(doc) for doc in boolean_model_IR(q_items)), file=file)