In [1]:
# Mount Google Drive at /content/drive so files stored there can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Versions are pinned to the ones resolved in the recorded run of this cell so that
# re-running the notebook installs the same packages. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install --upgrade --force-reinstall torch==2.7.1 transformers==4.52.4


[0mCollecting torch
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (fro

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import json
from typing import List
import re

# Loading and preprocessing data
def load_and_preprocess_data(file_path: str) -> pd.DataFrame:
    """Read the cases CSV and attach a cleaned `combined_text` column.

    Args:
        file_path: Path of a CSV file containing at least the columns
            `case_summary` and `full_text`.

    Returns:
        The loaded DataFrame with an extra `combined_text` column holding the
        summary and full text joined by a space, lower-cased, with every
        non-word / non-whitespace character replaced by a space and runs of
        whitespace collapsed to a single space.
    """
    def _normalise(raw: str) -> str:
        # Punctuation becomes a space first, then whitespace runs are squeezed.
        without_symbols = re.sub(r'[^\w\s]', ' ', raw.lower())
        return re.sub(r'\s+', ' ', without_symbols).strip()

    cases = pd.read_csv(file_path)

    # Missing values are treated as empty strings so the concatenation never yields NaN.
    summary_part = cases['case_summary'].fillna('')
    body_part = cases['full_text'].fillna('')

    cases['combined_text'] = (summary_part + ' ' + body_part).apply(_normalise)
    return cases

# TF-IDF Vectorization
def get_tfidf_vectors(texts: List[str]) -> tuple:
    """Fit a TF-IDF vectorizer on `texts` and vectorize them.

    Args:
        texts: Documents to fit the vocabulary on and to transform.

    Returns:
        A `(vectorizer, tfidf_matrix)` pair: the fitted `TfidfVectorizer`
        (limited to 5000 features, no stop-word filtering) and the matrix
        produced by `fit_transform`, with one row per input text. The
        vectorizer is returned so that queries can later be transformed with
        the same vocabulary.
    """
    vectorizer = TfidfVectorizer(max_features=5000, stop_words=None)
    tfidf_matrix = vectorizer.fit_transform(texts)
    return vectorizer, tfidf_matrix

# BERT Embeddings using IndoBERT
def get_bert_embeddings(texts: List[str], model_name: str = "indobenchmark/indobert-base-p1") -> np.ndarray:
    """Embed each text by averaging the model's last hidden state over tokens.

    Args:
        texts: Documents to embed; each one is encoded on its own and
            truncated to at most 512 tokens.
        model_name: Name of the pretrained model passed to
            `AutoTokenizer.from_pretrained` and `AutoModel.from_pretrained`.

    Returns:
        A NumPy array built from the per-text embedding vectors, in the same
        order as `texts`.
    """
    bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModel.from_pretrained(model_name)

    def _embed(document: str) -> np.ndarray:
        encoded = bert_tokenizer(document, return_tensors="pt", max_length=512, truncation=True, padding=True)
        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            hidden_states = bert_model(**encoded).last_hidden_state
        # Average over the token axis, then drop the singleton batch axis.
        return hidden_states.mean(dim=1).squeeze().numpy()

    return np.array([_embed(document) for document in texts])

# Retrieval function
def retrieve(query: str, df: pd.DataFrame, tfidf_vectorizer, tfidf_matrix: np.ndarray,
             bert_embeddings: np.ndarray, k: int = 5, use_bert: bool = True) -> List[int]:
    # Preprocess query
    query_clean = re.sub(r'[^\w\s]', ' ', query.lower())
    query_clean = re.sub(r'\s+', ' ', query_clean).strip()

    # Compute query vector
    if use_bert:
        tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
        model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
        inputs = tokenizer(query_clean, return_tensors="pt", max_length=512, truncation=True, padding=True)
        with torch.no_grad():
            query_vector = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()
        query_vector = query_vector.reshape(1, -1)
        similarities = cosine_similarity(query_vector, bert_embeddings)[0]
    else:
        query_vector = tfidf_vectorizer.transform([query_clean]).toarray()
        similarities = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Get top-k case IDs
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    top_k_case_ids = df.iloc[top_k_indices]['case_id'].tolist()

    return top_k_case_ids

# Main execution
def main():
    """Run the retrieval demo end to end.

    Loads and cleans the cases CSV, splits it 80/20, builds TF-IDF vectors and
    BERT embeddings for the training split, saves the evaluation queries as
    JSON, and prints the top-5 case ids returned by the BERT and TF-IDF
    retrieval for every query.
    """
    import os  # local import: only needed to create the output folder below

    # Load data
    file_path = "/content/drive/MyDrive/Tugasbesar/data/processed/cases.csv"
    df = load_and_preprocess_data(file_path)

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # TF-IDF Vectorization
    tfidf_vectorizer, tfidf_matrix = get_tfidf_vectors(train_df['combined_text'].tolist())

    # Train SVM on TF-IDF
    svm = SVC(kernel='linear')
    # For simplicity, using case_id as pseudo-labels (in practice, use actual labels if available)
    # NOTE(review): the fitted SVM is not used by the retrieval below.
    svm.fit(tfidf_matrix, train_df['case_id'])

    # BERT Embeddings
    bert_embeddings = get_bert_embeddings(train_df['combined_text'].tolist())

    # Test retrieval
    test_queries = [
        {"query": "Sengketa perdata mengenai hutang piutang dengan akta perdamaian", "ground_truth": 1},
        {"query": "Wanprestasi pembayaran berdasarkan perjanjian penggunaan mekari flex", "ground_truth": 2},
        {"query": "Gugatan perdata dengan tunggakan hutang besar", "ground_truth": 3},
        {"query": "Pencabutan gugatan perdata setelah mediasi gagal", "ground_truth": 127},
        {"query": "Perkara perdata dengan bukti transfer dan pengakuan hutang", "ground_truth": 3}
    ]

    # Save queries to JSON (create the target folder first so the write cannot
    # fail merely because the folder does not exist yet)
    queries_path = "/content/drive/MyDrive/Tugasbesar/data/eval/queries.json"
    os.makedirs(os.path.dirname(queries_path), exist_ok=True)
    with open(queries_path, 'w', encoding='utf-8') as f:
        json.dump(test_queries, f, ensure_ascii=False, indent=2)

    # Only the training split is searched, so a ground-truth case that landed in
    # the held-out split can never appear in the results.
    indexed_case_ids = set(train_df['case_id'].tolist())

    # Test retrieval function
    for query_info in test_queries:
        query = query_info["query"]
        ground_truth = query_info["ground_truth"]
        result_bert = retrieve(query, train_df, tfidf_vectorizer, tfidf_matrix, bert_embeddings, k=5, use_bert=True)
        result_tfidf = retrieve(query, train_df, tfidf_vectorizer, tfidf_matrix, bert_embeddings, k=5, use_bert=False)
        print(f"Query: {query}")
        print(f"Ground Truth: {ground_truth}")
        if ground_truth not in indexed_case_ids:
            print("Note: ground truth case is not in the searched (training) split, so it cannot be retrieved.")
        print(f"BERT Top-5: {result_bert}")
        print(f"TF-IDF Top-5: {result_tfidf}")
        print()

if __name__ == "__main__":
    main()

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Query: Sengketa perdata mengenai hutang piutang dengan akta perdamaian
Ground Truth: 1
BERT Top-5: [117, 39, 29, 67, 73]
TF-IDF Top-5: [49, 118, 93, 84, 54]

Query: Wanprestasi pembayaran berdasarkan perjanjian penggunaan mekari flex
Ground Truth: 2
BERT Top-5: [113, 117, 42, 14, 29]
TF-IDF Top-5: [2, 93, 42, 39, 113]

Query: Gugatan perdata dengan tunggakan hutang besar
Ground Truth: 3
BERT Top-5: [29, 31, 118, 67, 113]
TF-IDF Top-5: [118, 23, 22, 9, 86]

Query: Pencabutan gugatan perdata setelah mediasi gagal
Ground Truth: 127
BERT Top-5: [117, 29, 13, 39, 118]
TF-IDF Top-5: [25, 52, 127, 87, 47]

Query: Perkara perdata dengan bukti transfer dan pengakuan hutang
Ground Truth: 3
BERT Top-5: [29, 71, 117, 67, 31]
TF-IDF Top-5: [118, 53, 60, 80, 71]

