In [9]:
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re
import json
import math
import os
import itertools
from scipy.sparse import csr_matrix

In [10]:
# PATHS

# Input resources used by the text preprocessing step.
stopwords_path = 'dataset/stopword-id.csv'
acronym_path = 'dataset/acronym.csv'

# All generated artifacts live under out_dir. The individual file paths are
# derived from it so that changing out_dir cannot leave them pointing elsewhere.
out_dir = "docs"
out_vocab_path = f"{out_dir}/vocab.json"
out_idf_path = f"{out_dir}/idf.f32"

# Create the output directory up front: the vocab and idf files are written
# before the shard-writing cell, so it must already exist on a fresh run.
os.makedirs(out_dir, exist_ok=True)

# News dataset to index. Adjust this path to your environment.
csv_path = "../data/final_merge_dataset.csv"

In [11]:
# Load stopwords: one term per row, read from the first column of a header-less CSV.
stopwords_df = pd.read_csv(stopwords_path, header=None)
custom_stopwords = stopwords_df[0].tolist()

# Load acronyms and build the acronym -> expansion replacement dictionary.
df_acronym = pd.read_csv(acronym_path)
acronym_dict = dict(zip(df_acronym["acronym"], df_acronym["expansion"]))

# Single alternation over every acronym, matched on word boundaries.
# Regex alternation takes the first branch that matches, so longer keys are
# listed first; otherwise a key that is a whole-word prefix of another
# (e.g. "a" vs "a b") would always win and the longer key could never match.
acronym_keys = sorted(acronym_dict, key=len, reverse=True)
acronym_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, acronym_keys)) + r')\b')

# Text preprocessing function
def preprocess_text(text):
    """Normalize raw text into a lowercase, single-space-separated string.

    Steps, in order: expand acronyms (case-sensitive, because it runs before
    lowercasing), lowercase, strip ``<img ...>`` tags, @mentions, URLs and
    digits, turn hyphens into spaces, drop punctuation, remove leftover
    ``img``/``src`` fragments, and finally collapse whitespace.

    Args:
        text: Value to clean. Non-strings are converted with ``str()``.
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as empty text.

    Returns:
        The cleaned string with no leading/trailing whitespace and no runs of
        whitespace. May be empty.
    """
    # A missing cell would otherwise be stringified to the literal token
    # "none"/"nan" and be indexed as if it were a real word.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""
    text = str(text)

    # Replace acronyms using the module-level pattern and lookup table.
    text = acronym_pattern.sub(lambda match: acronym_dict[match.group(0)], text)

    # Lowercase
    text = text.lower()

    # Remove HTML image tags
    text = re.sub(r'<img[^>]*>', '', text)

    # Remove mentions, URLs, numbers
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\d+', '', text)

    # Clean punctuation; "b'" is dropped first, hyphens become separators.
    text = text.replace("b'", "").replace("-", " ")
    text = re.sub(r'[^\w\s]', '', text)

    # Remove specific unwanted terms. This must happen before whitespace is
    # normalized: deleting a fragment can leave two spaces side by side or a
    # space at either end of the string.
    text = text.replace("img", "").replace("src", "")

    # Collapse whitespace last so the result is always fully normalized.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [12]:
# Load the news dataset at csv_path into a DataFrame. Later cells read its
# 'Judul', 'Link' and 'Waktu' columns.
df = pd.read_csv(csv_path)

In [13]:
# List the columns available in the freshly loaded dataset.
df.columns

Index(['Judul', 'Waktu', 'Link', 'Content', 'tag1', 'tag2', 'tag3', 'tag4',
       'tag5', 'source'],
      dtype='object')

In [14]:
# PREPROCESS
# Clean every title ('Judul') with preprocess_text and store the result in a new
# 'preprocessed_text' column; that column is what gets vectorized.

df["preprocessed_text"] = df["Judul"].apply(preprocess_text)

In [15]:
# Confirm that the 'preprocessed_text' column was added.
df.columns

Index(['Judul', 'Waktu', 'Link', 'Content', 'tag1', 'tag2', 'tag3', 'tag4',
       'tag5', 'source', 'preprocessed_text'],
      dtype='object')

In [19]:
# VECTORIZE
# Fit TF-IDF on the cleaned titles. custom_stopwords are excluded and
# max_features=1000 caps the vocabulary size, which is also the dimension of
# every exported vector.

vectorizer = TfidfVectorizer(stop_words=custom_stopwords, max_features=1000)
# vec holds one row per title and one column per vocabulary term.
vec = vectorizer.fit_transform(df['preprocessed_text'])

In [20]:
# Fitted term -> column index mapping of the vectorizer.
vocab = vectorizer.vocabulary_ 

In [21]:
# Preview 10 (term, index) pairs instead of dumping the whole vocabulary.
dict(itertools.islice(vocab.items(), 10))

{'viral': np.int64(980),
 'isu': np.int64(319),
 'hubungan': np.int64(284),
 'kerja': np.int64(406),
 'buruh': np.int64(142),
 'pemilik': np.int64(637),
 'terdampak': np.int64(911),
 'gempa': np.int64(249),
 'masehi': np.int64(513),
 'guncang': np.int64(261)}

In [22]:
# Convert np.int64 -> int so the indices are JSON-serializable.
vocab_clean = {term: int(idx) for term, idx in vocab.items()}

# Make sure the target directory exists; this cell may run before anything
# else has created it.
os.makedirs(os.path.dirname(out_vocab_path) or ".", exist_ok=True)

# ensure_ascii=False writes non-ASCII terms verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open(out_vocab_path, "w", encoding="utf-8") as f:
    json.dump(vocab_clean, f, ensure_ascii=False)

In [23]:
# Export the IDF weights of the fitted TfidfVectorizer, down-cast to float32.
idf = np.asarray(vectorizer.idf_, dtype="float32")

# Dump the values as a raw Float32 binary file (no header, values only).
with open(out_idf_path, "wb") as idf_file:
    idf.tofile(idf_file)

print("Saved idf.f32 with shape:", idf.shape)

Saved idf.f32 with shape: (1000,)


In [25]:
# Read the raw float32 file back to verify the export round-trips.
x = np.fromfile(out_idf_path, dtype=np.float32)

print("Loaded:", x.shape)
print(x[:10])   # first 10 for checking
print(vectorizer.idf_[:10])

# Fail loudly instead of relying on eyeballing the two printouts: the file must
# contain exactly the float32-rounded IDF weights, in the same order.
np.testing.assert_array_equal(x, vectorizer.idf_.astype(np.float32))

Loaded: (1000,)
[7.183689  6.601945  6.8929996 7.397837  6.179785  5.15681   6.3767834
 6.8795767 6.366088  7.038182 ]
[7.18368922 6.60194486 6.89299962 7.3978372  6.17978488 5.15681
 6.37678315 6.8795766  6.36608786 7.03818163]


In [26]:
# Shape of the TF-IDF matrix; the shard writer uses it as (total_vectors, dim).
print(vec.shape)

(80472, 1000)


In [28]:
# Shard layout parameters, derived from the TF-IDF matrix.
N = vec.shape[0]     # total number of vectors (matrix rows)
DIM = vec.shape[1]   # components per vector (matrix columns)
BYTES_PER_VEC = 4 * DIM                   # 4 bytes per float32 component
TARGET_SHARD_BYTES = 32 * 1024 * 1024     # aim for about 32 MB of vectors per shard

# Whole vectors that fit into the target size, but never fewer than one.
vecs_per_shard = TARGET_SHARD_BYTES // BYTES_PER_VEC
if vecs_per_shard < 1:
    vecs_per_shard = 1

os.makedirs(out_dir, exist_ok=True)

# Manifest skeleton; one entry per written shard is appended to "shards".
manifest = {
    "total_vectors": int(N),
    "dim": int(DIM),
    "dtype": "float32",
    "header_size": 16,
    "shards": [],
}

def write_header(f, num_vectors, dim, dtype_code=1):
    """Write the fixed 16-byte shard header to ``f``.

    Layout: the 4-byte magic ``b'VECT'`` followed by ``num_vectors``, ``dim``
    and ``dtype_code``, each as a little-endian unsigned 32-bit integer.

    Args:
        f: Binary file-like object opened for writing.
        num_vectors: Number of vectors stored in the shard.
        dim: Number of components per vector.
        dtype_code: Numeric tag for the element type. The shard writer in this
            notebook passes 1 for its float32 payload.
    """
    f.write(b'VECT')  # 4 bytes
    # '<u4' pins the byte order to little-endian. A bare np.uint32(...).tobytes()
    # uses the byte order of the machine that runs the notebook.
    f.write(np.array([num_vectors, dim, dtype_code], dtype='<u4').tobytes())

# Rows converted to a dense array per write. This bounds how much of the sparse
# matrix is densified at any one time; adjust if your memory is tight.
batch_size = 1024

start = 0
shard_idx = 0
while start < N:
    end = min(N, start + vecs_per_shard)
    count = end - start
    shard_name = f"vectors_{shard_idx:03d}.f32"
    shard_path = os.path.join(out_dir, shard_name)
    print(f"Writing shard {shard_idx}: vectors {start}..{end-1} -> {shard_name} ({count} vectors)")
    with open(shard_path, "wb") as f:
        write_header(f, count, DIM, dtype_code=1)
        # Convert only this slice to dense float32, row-major, in sub-batches.
        for bstart in range(start, end, batch_size):
            bend = min(end, bstart + batch_size)
            # '<f4' is float32 with an explicit little-endian byte order, so the
            # payload matches the little-endian header on any machine.
            dense = vec[bstart:bend].toarray().astype('<f4')
            f.write(dense.tobytes(order='C'))

    # Sanity check: header plus count * dim float32 values, nothing more or less.
    size_bytes = os.path.getsize(shard_path)
    expected_bytes = manifest["header_size"] + count * BYTES_PER_VEC
    assert size_bytes == expected_bytes, (
        f"{shard_name}: wrote {size_bytes} bytes, expected {expected_bytes}"
    )

    shard_info = {
        "id": shard_idx,
        "shard": shard_name,
        "url": f"/{shard_name}",  # set as you will serve it
        "start_index": int(start),
        "count": int(count),
        "size_bytes": size_bytes
    }
    manifest["shards"].append(shard_info)
    start = end
    shard_idx += 1

# Persist the manifest describing every shard written above.
manifest_path = os.path.join(out_dir, "manifest.json")
with open(manifest_path, "w", encoding="utf-8") as mf:
    json.dump(manifest, mf, indent=2)

print("Done. Manifest saved to", manifest_path)

Writing shard 0: vectors 0..8387 -> vectors_000.f32 (8388 vectors)
Writing shard 1: vectors 8388..16775 -> vectors_001.f32 (8388 vectors)
Writing shard 2: vectors 16776..25163 -> vectors_002.f32 (8388 vectors)
Writing shard 3: vectors 25164..33551 -> vectors_003.f32 (8388 vectors)
Writing shard 4: vectors 33552..41939 -> vectors_004.f32 (8388 vectors)
Writing shard 5: vectors 41940..50327 -> vectors_005.f32 (8388 vectors)
Writing shard 6: vectors 50328..58715 -> vectors_006.f32 (8388 vectors)
Writing shard 7: vectors 58716..67103 -> vectors_007.f32 (8388 vectors)
Writing shard 8: vectors 67104..75491 -> vectors_008.f32 (8388 vectors)
Writing shard 9: vectors 75492..80471 -> vectors_009.f32 (4980 vectors)
Done. Manifest saved to docs/manifest.json


In [29]:
import json
import os
from pathlib import Path
import pandas as pd

def generate_metadata_and_index(
    df: pd.DataFrame,
    output_dir: str = out_dir,
    meta_filename: str = "metadata.jsonl",
    index_filename: str = "metadata.index"
):
    """
    Create metadata.jsonl and metadata.index for HTTP Range Request lookup.

    metadata.jsonl holds one UTF-8 JSON object per line, in DataFrame row
    order, with the keys "title", "url" and "date" taken from the columns
    'Judul', 'Link' and 'Waktu'. metadata.index holds one decimal number per
    line: line i is the byte offset at which record i starts in
    metadata.jsonl.

    Missing values (None/NaN) are written as JSON ``null``. Python's json
    module would otherwise emit a bare ``NaN``, which is not valid JSON.

    Args:
        df: Source rows; must contain the columns 'Judul', 'Link' and 'Waktu'.
        output_dir: Directory to write both files into (created if missing).
        meta_filename: File name of the JSON Lines metadata file.
        index_filename: File name of the byte-offset index file.
    """
    def _json_safe(value):
        # Map missing scalars to None and unwrap NumPy scalars, which the json
        # module cannot serialize, into plain Python values.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        if isinstance(value, np.generic):
            return value.item()
        return value

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    meta_path = out / meta_filename
    index_path = out / index_filename

    byte_offset = 0
    index_lines = []

    # Binary mode: the offsets must count the exact bytes that land on disk.
    with meta_path.open("wb") as f_meta:
        # Iterate the three columns directly; this avoids building a Series
        # object for every row as iterrows() does.
        for title, url, date in zip(df["Judul"], df["Link"], df["Waktu"]):
            record = {
                "title": _json_safe(title),
                "url": _json_safe(url),
                "date": _json_safe(date),
            }

            json_line = json.dumps(record, ensure_ascii=False) + "\n"
            encoded = json_line.encode("utf-8")

            # The record starts where the previous one ended.
            index_lines.append(str(byte_offset))
            f_meta.write(encoded)
            byte_offset += len(encoded)

    # Write the index as bytes so line endings are "\n" on every platform.
    index_path.write_bytes("\n".join(index_lines).encode("utf-8"))

    print(f"Created: {meta_path}")
    print(f"Created: {index_path}")
    print(f"Total records: {len(df)}")

In [30]:
# Write metadata.jsonl and metadata.index for every row of df into out_dir.
generate_metadata_and_index(df, output_dir=out_dir)

Created: docs/metadata.jsonl
Created: docs/metadata.index
Total records: 80472
