In [1]:
# Uncomment the following lines if `transformers` or `torch` is not available
!pip install --quiet transformers torch chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━

In [2]:

import json
import pickle
from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

# --- Embedding model ---------------------------------------------------------
# Retrieval-optimised E5 base checkpoint.
MODEL_NAME = "intfloat/e5-base-v2"

# --- Chunking (both values are counted in tokens) ----------------------------
CHUNK_SIZE = 128     # length of one chunk window
CHUNK_OVERLAP = 32   # tokens shared by two consecutive windows

# --- Text prefixes for the E5 model ------------------------------------------
PASSAGE_PREFIX = "passage: "   # prepended to each chunk before it is embedded
QUERY_PREFIX = "query: "       # presumably for query-side text; not used in the cells shown here

# --- File locations ----------------------------------------------------------
RAW_PATH = Path("/kaggle/input/ukyfkyugj/hajjjjjjjj.txt")  # input document; adjust if needed
JSON_OUTPUT = Path("hajj_chunks_e5.json")     # chunk records as JSON
PKL_OUTPUT = Path("hajj_chunks_e5.pkl")       # the same chunk records, pickled
NPY_OUTPUT = Path("hajj_embeddings_e5.npy")   # embedding matrix


In [4]:

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizer and encoder both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# `.to()` and `.eval()` both return the module itself, so the calls can be chained:
# move the weights to `device`, then switch to evaluation mode.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

# Whole source document as one UTF-8 string, without leading/trailing whitespace.
text = RAW_PATH.read_text(encoding='utf-8')
text = text.strip()


In [5]:

# Tokenize the whole document once so that chunk boundaries can be expressed as
# token positions. The tokenizer warns that this sequence is longer than the
# model maximum; that is harmless here because these ids are only sliced and
# decoded below -- every chunk is re-tokenized on its own before reaching the model.
inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
tokens = inputs['input_ids'][0]
total_tokens = len(tokens)
print(f"Total tokens: {total_tokens}")

# Fail fast on a window configuration that cannot tile the document:
#   * overlap >= chunk size gives a zero/negative stride, for which range()
#     either raises a cryptic error or silently yields no chunks at all;
#   * a negative overlap makes the stride larger than the window and skips tokens.
if not 0 <= CHUNK_OVERLAP < CHUNK_SIZE:
    raise ValueError(
        f"CHUNK_OVERLAP ({CHUNK_OVERLAP}) must satisfy "
        f"0 <= CHUNK_OVERLAP < CHUNK_SIZE ({CHUNK_SIZE})"
    )

# Distance between the start positions of two consecutive chunks.
step = CHUNK_SIZE - CHUNK_OVERLAP
chunks = []      # one metadata dict per chunk
embeddings = []  # one L2-normalised vector per chunk, in the same order as `chunks`

for idx, start in enumerate(range(0, total_tokens, step)):
    end = min(start + CHUNK_SIZE, total_tokens)
    chunk_tokens = tokens[start:end]
    # Turn the token window back into text; this decoded string is what gets stored.
    chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
    # Re-tokenize with the passage prefix. truncation/max_length cap the model
    # input at 512 tokens, so any tokens beyond that would not contribute to the embedding.
    input_text = PASSAGE_PREFIX + chunk_text
    encoded_input = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512)
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.no_grad():
        out = model(**encoded_input)
        # Mean pooling: average the token vectors, weighted by the attention mask.
        token_embeds = out.last_hidden_state  # (1, seq_len, hidden_dim)
        mask = encoded_input['attention_mask'].unsqueeze(-1)
        sum_embeds = (token_embeds * mask).sum(dim=1)
        sum_mask = mask.sum(dim=1)
        embed = (sum_embeds / sum_mask).squeeze(0).cpu().numpy()
    # Scale to unit L2 norm; an all-zero vector is left untouched to avoid dividing by zero.
    norm = np.linalg.norm(embed)
    if norm > 0:
        embed = embed / norm
    # Record the chunk and its vector at the same list position.
    chunks.append({
        'chunk_id': idx,
        'start_token': int(start),
        'end_token': int(end),
        'text': chunk_text
    })
    embeddings.append(embed)
    # This window already reached the end of the document; any further start
    # position would only produce a shorter chunk fully contained in this one.
    if end == total_tokens:
        break

print(f"Generated {len(chunks)} chunks.")


Token indices sequence length is longer than the specified maximum sequence length for this model (38365 > 512). Running this sequence through the model will result in indexing errors


Total tokens: 38365
Generated 400 chunks.


In [6]:

# Build and validate the embedding matrix BEFORE writing anything, so that a
# failure cannot leave chunk files on disk without a matching embeddings file.
if not embeddings:
    # np.vstack raises an unhelpful ValueError on an empty list; say what is wrong instead.
    raise ValueError("No embeddings to save: the chunking step produced no chunks")
emb_array = np.vstack(embeddings)
if emb_array.shape[0] != len(chunks):
    raise ValueError(
        f"Chunk/embedding mismatch: {len(chunks)} chunks vs {emb_array.shape[0]} embedding rows"
    )

# Chunk records as JSON (ensure_ascii=False writes non-ASCII characters unescaped).
with JSON_OUTPUT.open('w', encoding='utf-8') as f:
    json.dump(chunks, f, ensure_ascii=False, indent=2)
print(f"Saved {len(chunks)} chunks to {JSON_OUTPUT.resolve()}")

# The same chunk records as a pickle.
with PKL_OUTPUT.open('wb') as f:
    pickle.dump(chunks, f)
print(f"Saved pickle to {PKL_OUTPUT.resolve()}")

# Embedding matrix; row i corresponds to chunks[i].
np.save(NPY_OUTPUT, emb_array)
print(f"Saved embeddings array to {NPY_OUTPUT.resolve()} with shape {emb_array.shape}")


Saved 400 chunks to /kaggle/working/hajj_chunks_e5.json
Saved pickle to /kaggle/working/hajj_chunks_e5.pkl
Saved embeddings array to /kaggle/working/hajj_embeddings_e5.npy with shape (400, 768)
