In [7]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.26-cp311-none-win_amd64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.23.22 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.22-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.23.26-cp311-none-win_amd64.whl (3.4 MB)
   ---------------------------------------- 0.0/3.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.4 MB 186.2 kB/s eta 0:00:19
   ---------------------------------------- 0.0/3.4 MB 186.2 kB/s eta 0:00:19
   ---------------------------------------- 0.0/3.4 MB 151.3 kB/s eta 0:00:23
   ---------------------------------------- 0.0/3.4 MB 151.3 kB/s eta 0:00:23
    --------------------------------------- 0.1/3.4 MB 181.6 kB/s eta 0:00:19
   - -------------------------------------- 0.1/3.4 MB 227.6 kB/s eta 0:00:15
   - -------------------------------------- 0.1/3.4 MB 

In [8]:
import os
import re
import pickle
import fitz  

In [49]:
def urdu_tokenize(text):
    """Split ``text`` into tokens made of Unicode Arabic-block characters.

    Every maximal run of code points in U+0600-U+06FF becomes one token.
    Anything outside that range (whitespace, Latin letters, ASCII digits and
    punctuation, ...) acts as a separator and is discarded.

    Note that the range also contains Arabic-script punctuation and digits
    (e.g. U+060C, U+06D4), so those are matched like letters.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The tokens in order of appearance; empty if none match.
    """
    return re.findall(r'[\u0600-\u06FF]+', text)


In [50]:
def create_index(corpus_dir, stop_words):
    """Build a positional inverted index over the files in ``corpus_dir``.

    Each regular file is read as UTF-8, tokenized with ``urdu_tokenize`` and
    stripped of stop words. Positions are offsets into the stop-word-filtered
    token list of the document, not into the raw text.

    Args:
        corpus_dir: Directory whose regular files make up the corpus.
        stop_words: Collection of tokens to leave out of the index.

    Returns:
        dict: ``{term: {'document_frequency': int,
                        'postings': {filename: [position, ...]}}}``
        where ``document_frequency`` is the number of distinct documents that
        contain the term (i.e. ``len(postings)``).
    """
    # Membership is tested once per token, so make list/tuple lookups O(1).
    # Other container types are left as they are to keep their semantics.
    if isinstance(stop_words, (list, tuple)):
        stop_words = set(stop_words)

    index = {}

    # os.listdir returns names in arbitrary order; sorting makes the index
    # (and the files written from it) reproducible between runs.
    for filename in sorted(os.listdir(corpus_dir)):
        filepath = os.path.join(corpus_dir, filename)
        # Skip sub-directories (open() would raise on them).
        if not os.path.isfile(filepath):
            continue

        with open(filepath, 'r', encoding='utf-8') as file:
            document = file.read()

        terms = [term for term in urdu_tokenize(document) if term not in stop_words]
        for position, term in enumerate(terms):
            entry = index.setdefault(term, {'document_frequency': 0, 'postings': {}})
            postings = entry['postings']
            if filename not in postings:
                # First occurrence of the term in this document: this is the
                # only moment the document frequency may grow.
                postings[filename] = []
                entry['document_frequency'] += 1
            postings[filename].append(position)

    return index


In [67]:
def write_files(index, index_file, postings_file, index_txt_file='index.txt'):
    """Persist the inverted index in three forms and report each on stdout.

    Args:
        index: Mapping ``{term: {'document_frequency': int,
            'postings': {doc: [positions]}}}``.
        index_file: Path of the binary pickle dump of ``index``.
        postings_file: Path of the UTF-8 text file that gets one
            ``term: doc [positions]`` line per (term, document) pair.
        index_txt_file: Path of the UTF-8 human-readable dump: one
            ``term: document_frequency`` line per term, followed by an
            indented ``doc: [positions]`` line per document. Defaults to
            ``'index.txt'``, the name that used to be hard-coded.

    Returns:
        None. Existing files at the given paths are overwritten.
    """
    with open(index_file, 'wb') as file:
        pickle.dump(index, file)
    print(f"Index written to {index_file}")

    with open(index_txt_file, 'w', encoding='utf-8') as txt_file:
        for term, data in index.items():
            txt_file.write(f"{term}: {data['document_frequency']}\n")
            for doc, positions in data['postings'].items():
                txt_file.write(f"  {doc}: {positions}\n")
    print(f"Index written to {index_txt_file}")

    with open(postings_file, 'w', encoding='utf-8') as file:
        for term, data in index.items():
            for doc, positions in data['postings'].items():
                file.write(f"{term}: {doc} {positions}\n")
    print(f"Postings written to {postings_file}")

In [68]:
def main():
    """Index the Urdu corpus and write the index and postings files.

    Stop words are obtained from ``Closed_Class_Word_List.pdf`` through
    ``extract_stop_words``, the files under ``UrduCorpus`` are indexed with
    ``create_index``, and the result is handed to ``write_files`` with the
    output names ``urdu_index.pkl`` and ``urdu_postings.txt``.
    """
    stop_words = extract_stop_words('Closed_Class_Word_List.pdf')
    index = create_index('UrduCorpus', stop_words)
    write_files(index, 'urdu_index.pkl', 'urdu_postings.txt')
# Run the indexing pipeline when this code executes as the main module
# (which includes running the cell in a notebook kernel), not on import.
if __name__ == "__main__":
    main()


Index written to urdu_index.pkl
Index written to index.txt
Postings written to urdu_postings.txt


You can either take input from the user and search for that term in the index, or search for each term from the test file and view the results.

In [80]:
import pickle

def read_index(index_file):
    """Load and return the object pickled in ``index_file``.

    Args:
        index_file: Path of a pickle file, opened in binary mode.

    Returns:
        Whatever object was pickled into the file.

    NOTE(review): unpickling can execute arbitrary code, so only open files
    that this notebook (or another trusted source) produced.
    """
    with open(index_file, 'rb') as handle:
        return pickle.load(handle)
def search_term(index, term):
    """Print what the inverted index stores for ``term``.

    For a known term the output is the term itself, a ``Term Freq`` line, and
    one ``DocID``/``Positions`` line per document. The ``Term Freq`` value is
    the number of documents in the term's postings, not an occurrence count.
    For an unknown term a single not-found message is printed.

    Args:
        index: Mapping ``{term: {'postings': {doc: [positions]}, ...}}``.
        term: The exact token to look up.

    Returns:
        None. All results go to stdout.
    """
    if term not in index:
        print(f"Term '{term}' not found in the inverted file.")
        return

    print(f"Term: {term}")
    postings = index[term]['postings']
    print(f'Term Freq: {len(postings)}')
    print("Postings:")
    for doc, positions in postings.items():
        print(f"  DocID: {doc}, Positions: {positions}")

def test(flag):
    """Look up terms in the pickled index ``urdu_index.pkl`` and print them.

    Args:
        flag: If truthy, prompt for terms interactively until the user types
            ``exit``. If falsy, look up every whitespace-separated word of
            ``Test.txt`` (read as UTF-8).

    Returns:
        None. Results are printed by ``search_term``.
    """
    index = read_index('urdu_index.pkl')

    if not flag:
        # Batch mode: every word of the test file, in file order.
        with open('Test.txt', 'r', encoding='utf-8') as file:
            words = file.read().split()
        for word in words:
            search_term(index, word)
        return

    # Interactive mode: keep asking until the sentinel word is entered.
    while True:
        term = input("Enter a term (or type 'exit' to quit): ").strip()
        if term == 'exit':
            break
        search_term(index, term)

test(0)  # pass 0 to look up every word in Test.txt; pass 1 to look up terms typed by the user


Term: سعودی
Term Freq: 4
Postings:
  DocID: News11.txt, Positions: [0, 142, 153, 183]
  DocID: News19.txt, Positions: [0, 387, 398, 428]
  DocID: News20.txt, Positions: [376, 387, 417]
  DocID: News5.txt, Positions: [0, 387, 398, 428]
Term: عرب
Term Freq: 16
Postings:
  DocID: News11.txt, Positions: [1, 143, 154, 184]
  DocID: News19.txt, Positions: [1, 388, 399, 429]
  DocID: News20.txt, Positions: [1, 377, 388, 418]
  DocID: News5.txt, Positions: [1, 388, 399, 429]
  DocID: Sports2.txt, Positions: [13, 57, 78, 193, 223, 287, 392]
  DocID: Sports28.txt, Positions: [21]
  DocID: Sports29.txt, Positions: [21]
  DocID: Sports30.txt, Positions: [13, 57, 78, 190, 220, 283, 387]
  DocID: Sports31.txt, Positions: [21]
  DocID: Sports35.txt, Positions: [13, 73, 88, 106, 194]
  DocID: Sports4.txt, Positions: [21]
  DocID: Sports40.txt, Positions: [13, 57, 78, 193, 223, 287, 392]
  DocID: Technology1.txt, Positions: [583]
  DocID: Technology12.txt, Positions: [583]
  DocID: Technology13.txt, Po