In [1]:
!pip install pymupdf transformers sentence-transformers

from google.colab import files
print("Please upload your HR PDF files.")
uploaded = files.upload()
pdf_filenames = list(uploaded.keys())
print(f"Uploaded files: {pdf_filenames}")

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-many

Saving bKash HR Service Rules Handbook 2025.pdf to bKash HR Service Rules Handbook 2025.pdf
Saving Employee Travel and Transfer Guideline.pdf to Employee Travel and Transfer Guideline.pdf
Saving Group Anti-Bribery and Corruption Policy.pdf to Group Anti-Bribery and Corruption Policy.pdf
Saving Workplace Anti Harassment Guideline.pdf to Workplace Anti Harassment Guideline.pdf
Uploaded files: ['bKash HR Service Rules Handbook 2025.pdf', 'Employee Travel and Transfer Guideline.pdf', 'Group Anti-Bribery and Corruption Policy.pdf', 'Workplace Anti Harassment Guideline.pdf']


In [2]:
import fitz
import os

extracted_txt_files = []

for file_name in pdf_filenames:
    try:
        doc = fitz.open(file_name)
        text = ""
        for page in doc:
            text += page.get_text()

        txt_filename = file_name.replace(".pdf", ".txt")
        with open(txt_filename, "w", encoding="utf-8") as f:
            f.write(text)
        extracted_txt_files.append(txt_filename)
        print(f"Extracted text from {file_name} to {txt_filename}")
    except Exception as e:
        print(f"Error extracting text from {file_name}: {e}")

print("Text extraction complete.")

Extracted text from bKash HR Service Rules Handbook 2025.pdf to bKash HR Service Rules Handbook 2025.txt
Extracted text from Employee Travel and Transfer Guideline.pdf to Employee Travel and Transfer Guideline.txt
Extracted text from Group Anti-Bribery and Corruption Policy.pdf to Group Anti-Bribery and Corruption Policy.txt
Extracted text from Workplace Anti Harassment Guideline.pdf to Workplace Anti Harassment Guideline.txt
Text extraction complete.


In [3]:
import re

def clean_text(text):
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
    text = text.replace('\n', ' ')
    return text.strip()

cleaned_txt_files = []

for file_name in extracted_txt_files:
    try:
        with open(file_name, "r", encoding="utf-8") as f:
            raw_text = f.read()

        cleaned_text_content = clean_text(raw_text)
        output_name = file_name.replace(".txt", "_cleaned.txt")

        with open(output_name, "w", encoding="utf-8") as f:
            f.write(cleaned_text_content)
        cleaned_txt_files.append(output_name)
        print(f"Cleaned text from {file_name} to {output_name}")
    except Exception as e:
        print(f"Error cleaning text from {file_name}: {e}")

print("Text cleaning complete.")

Cleaned text from bKash HR Service Rules Handbook 2025.txt to bKash HR Service Rules Handbook 2025_cleaned.txt
Cleaned text from Employee Travel and Transfer Guideline.txt to Employee Travel and Transfer Guideline_cleaned.txt
Cleaned text from Group Anti-Bribery and Corruption Policy.txt to Group Anti-Bribery and Corruption Policy_cleaned.txt
Cleaned text from Workplace Anti Harassment Guideline.txt to Workplace Anti Harassment Guideline_cleaned.txt
Text cleaning complete.


In [4]:
import json

def split_into_chunks_by_tokens(text, tokenizer, max_tokens=300, overlap=50):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for i in range(0, len(tokens), max_tokens - overlap):
        chunk_tokens = tokens[i:i + max_tokens]
        chunks.append(tokenizer.decode(chunk_tokens))
    return chunks

from transformers import AutoTokenizer
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

chunked_data = {}
chunk_files = []

CHUNK_SIZE_TOKENS = 512
CHUNK_OVERLAP_TOKENS = 50

for file_name in cleaned_txt_files:
    try:
        with open(file_name, "r", encoding="utf-8") as f:
            text = f.read()
        chunks = split_into_chunks_by_tokens(text, tokenizer, max_tokens=CHUNK_SIZE_TOKENS, overlap=CHUNK_OVERLAP_TOKENS)
        chunked_data[file_name] = chunks

        out_file = file_name.replace("_cleaned.txt", "_chunks.json")
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(chunks, f, ensure_ascii=False, indent=2)
        chunk_files.append(out_file)
        print(f"Chunked text from {file_name} into {len(chunks)} chunks and saved to {out_file}")
    except Exception as e:
        print(f"Error chunking text from {file_name}: {e}")

print("Text chunking complete.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1572 > 512). Running this sequence through the model will result in indexing errors


Chunked text from bKash HR Service Rules Handbook 2025_cleaned.txt into 4 chunks and saved to bKash HR Service Rules Handbook 2025_chunks.json
Chunked text from Employee Travel and Transfer Guideline_cleaned.txt into 5 chunks and saved to Employee Travel and Transfer Guideline_chunks.json
Chunked text from Group Anti-Bribery and Corruption Policy_cleaned.txt into 35 chunks and saved to Group Anti-Bribery and Corruption Policy_chunks.json
Chunked text from Workplace Anti Harassment Guideline_cleaned.txt into 8 chunks and saved to Workplace Anti Harassment Guideline_chunks.json
Text chunking complete.


In [5]:
import torch
from sentence_transformers import SentenceTransformer

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

all_chunks = []
for chunks in chunked_data.values():
    all_chunks.extend(chunks)

print(f"Generating embeddings for a total of {len(all_chunks)} chunks...")

chunk_embeddings = model.encode(all_chunks, convert_to_tensor=True, show_progress_bar=True)

print(f"Generated embeddings of shape: {chunk_embeddings.shape}")

embeddings_output_file = 'all_chunks_embeddings.pt'
torch.save(chunk_embeddings, embeddings_output_file)

print(f"All chunk embeddings saved to {embeddings_output_file}")
embedding_source_map = []
current_embedding_index = 0
for filename, chunks in chunked_data.items():
    for i in range(len(chunks)):
        embedding_source_map.append({
            'source_file': filename,
            'chunk_index': i,
            'embedding_index': current_embedding_index
        })
        current_embedding_index += 1

source_map_output_file = 'embedding_source_map.json'
with open(source_map_output_file, 'w', encoding='utf-8') as f:
    json.dump(embedding_source_map, f, ensure_ascii=False, indent=2)

print(f"Embedding source map saved to {source_map_output_file}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings for a total of 52 chunks...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Generated embeddings of shape: torch.Size([52, 384])
All chunk embeddings saved to all_chunks_embeddings.pt
Embedding source map saved to embedding_source_map.json


In [6]:
from google.colab import files

print("Downloading important files:")

for file in cleaned_txt_files:
    try:
        files.download(file)
        print(f"Downloaded: {file}")
    except Exception as e:
        print(f"Error downloading {file}: {e}")

for file in chunk_files:
    try:
        files.download(file)
        print(f"Downloaded: {file}")
    except Exception as e:
        print(f"Error downloading {file}: {e}")

try:
    files.download(embeddings_output_file)
    print(f"Downloaded: {embeddings_output_file}")
except Exception as e:
    print(f"Error downloading {embeddings_output_file}: {e}")

try:
    files.download(source_map_output_file)
    print(f"Downloaded: {source_map_output_file}")
except Exception as e:
    print(f"Error downloading {source_map_output_file}: {e}")

print("Download process complete.")

Downloading important files:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: bKash HR Service Rules Handbook 2025_cleaned.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: Employee Travel and Transfer Guideline_cleaned.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: Group Anti-Bribery and Corruption Policy_cleaned.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: Workplace Anti Harassment Guideline_cleaned.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: bKash HR Service Rules Handbook 2025_chunks.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: Employee Travel and Transfer Guideline_chunks.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: Group Anti-Bribery and Corruption Policy_chunks.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: Workplace Anti Harassment Guideline_chunks.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: all_chunks_embeddings.pt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: embedding_source_map.json
Download process complete.
