In [1]:
# Mount Google Drive at /content/drive so the data files read and written
# further down (under /content/drive/MyDrive/...) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import json
from typing import List
import re

# Load and preprocess data
def load_and_preprocess_data(file_path: str) -> pd.DataFrame:
    """Read the cases CSV and add the ``combined_text`` and ``solution`` columns.

    ``combined_text`` is ``case_summary`` and ``full_text`` (missing values
    treated as empty strings) joined by a space, lower-cased, with every
    non-word/non-space character replaced by a space and whitespace runs
    collapsed to a single space.

    ``solution`` is ``legal_basis``, falling back to the raw (uncleaned)
    ``case_summary`` wherever ``legal_basis`` is missing.

    Args:
        file_path: Path of the CSV file; it must provide the columns
            ``case_summary``, ``full_text`` and ``legal_basis``.

    Returns:
        The loaded DataFrame with the two derived columns appended.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    whitespace_run = re.compile(r'\s+')

    def clean_text(text: str) -> str:
        # Lower-case first, blank out punctuation, then squeeze the gaps left behind.
        lowered = text.lower()
        depunctuated = non_word_or_space.sub(' ', lowered)
        return whitespace_run.sub(' ', depunctuated).strip()

    cases = pd.read_csv(file_path)

    summary_text = cases['case_summary'].fillna('')
    body_text = cases['full_text'].fillna('')
    cases['combined_text'] = (summary_text + ' ' + body_text).apply(clean_text)

    # Proxy for the case outcome: legal basis when present, otherwise the summary.
    cases['solution'] = cases['legal_basis'].fillna(cases['case_summary'])
    return cases

# TF-IDF Vectorization
def get_tfidf_vectors(texts: List[str], max_features: int = 5000) -> tuple:
    """Fit a TF-IDF vectorizer on ``texts`` and return it together with the matrix.

    Args:
        texts: Documents to fit on and transform.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``. The
            default of 5000 is the value that used to be hard-coded here.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` tuple: the fitted ``TfidfVectorizer``
        (needed later to transform queries into the same feature space) and
        the result of ``fit_transform(texts)``.
    """
    # stop_words=None: no stop-word list is applied to the input texts.
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words=None)
    tfidf_matrix = vectorizer.fit_transform(texts)
    return vectorizer, tfidf_matrix

# BERT Embeddings using IndoBERT
def get_bert_embeddings(texts: List[str], model_name: str = "indobenchmark/indobert-base-p1") -> np.ndarray:
    """Embed each text with a pretrained transformer and stack the results.

    Every text is encoded on its own (truncated to at most 512 tokens), run
    through the model without gradient tracking, and reduced to one vector by
    averaging ``last_hidden_state`` over dim 1 (presumably the token axis).

    Args:
        texts: Texts to embed, processed one at a time in order.
        model_name: Identifier handed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.

    Returns:
        ``np.array`` built from the per-text vectors, in input order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    def embed_one(text: str) -> np.ndarray:
        encoded = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().numpy()

    return np.array([embed_one(text) for text in texts])

# Retrieval function (from previous artifact)
# Loaded (tokenizer, model) pairs keyed by model name, so that repeated calls
# to retrieve() do not reload the encoder for every single query.
_QUERY_ENCODER_CACHE = {}


def retrieve(query: str, df: pd.DataFrame, tfidf_vectorizer, tfidf_matrix: np.ndarray,
             bert_embeddings: np.ndarray, k: int = 5, use_bert: bool = True,
             model_name: str = "indobenchmark/indobert-base-p1") -> tuple:
    """Return the ``k`` cases most similar to ``query``.

    The query is cleaned the same way as the corpus text (lower-cased,
    punctuation replaced by spaces, whitespace collapsed), vectorized with
    either the transformer encoder or the fitted TF-IDF vectorizer, and
    compared to every corpus row by cosine similarity.

    Args:
        query: Free-text query.
        df: Case table with a ``case_id`` column. Rows are looked up
            positionally (``iloc``), so ``df`` must be in the same row order
            as ``tfidf_matrix`` / ``bert_embeddings``.
        tfidf_vectorizer: Fitted vectorizer; only used when ``use_bert`` is False.
        tfidf_matrix: Corpus TF-IDF matrix; only used when ``use_bert`` is False.
        bert_embeddings: Corpus embeddings; only used when ``use_bert`` is True.
        k: Number of cases to return. Values ``<= 0`` yield an empty result;
            values larger than the corpus return every case.
        use_bert: Choose transformer embeddings (True) or TF-IDF (False).
        model_name: Encoder used for the query when ``use_bert`` is True. It
            should be the same model that produced ``bert_embeddings``.

    Returns:
        ``(top_k_case_ids, top_k_similarities)``: a list of ``case_id`` values
        and the matching similarity scores, both ordered from most to least
        similar.
    """
    # Guard first: with k == 0 the slice [-0:] below would select *every* row,
    # and a negative k would drop rows from the wrong end.
    if k <= 0:
        return [], np.empty(0)

    query_clean = re.sub(r'[^\w\s]', ' ', query.lower())
    query_clean = re.sub(r'\s+', ' ', query_clean).strip()

    if use_bert:
        if model_name not in _QUERY_ENCODER_CACHE:
            _QUERY_ENCODER_CACHE[model_name] = (
                AutoTokenizer.from_pretrained(model_name),
                AutoModel.from_pretrained(model_name),
            )
        tokenizer, model = _QUERY_ENCODER_CACHE[model_name]
        inputs = tokenizer(query_clean, return_tensors="pt", max_length=512, truncation=True, padding=True)
        with torch.no_grad():
            query_vector = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()
        # cosine_similarity expects 2-D input: one row holding the query vector.
        query_vector = query_vector.reshape(1, -1)
        similarities = cosine_similarity(query_vector, bert_embeddings)[0]
    else:
        query_vector = tfidf_vectorizer.transform([query_clean]).toarray()
        similarities = cosine_similarity(query_vector, tfidf_matrix)[0]

    # argsort is ascending: keep the last k positions and reverse them so the
    # best match comes first.
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    top_k_case_ids = df.iloc[top_k_indices]['case_id'].tolist()
    top_k_similarities = similarities[top_k_indices]
    return top_k_case_ids, top_k_similarities

# Predict outcome function
def predict_outcome(query: str, df: pd.DataFrame, tfidf_vectorizer, tfidf_matrix: np.ndarray,
                    bert_embeddings: np.ndarray, k: int = 5) -> tuple:
    """Predict a solution for ``query`` by similarity-weighted voting.

    The ``k`` nearest cases are fetched with ``retrieve``. Each case votes
    for its ``solution`` text with a weight equal to its similarity score, and
    the solution with the largest summed weight is returned.

    Args:
        query: Free-text query.
        df: Case table with ``case_id`` and ``solution`` columns.
        tfidf_vectorizer: Passed through to ``retrieve``.
        tfidf_matrix: Passed through to ``retrieve``.
        bert_embeddings: Passed through to ``retrieve``.
        k: Number of neighbouring cases that vote.

    Returns:
        ``(predicted_solution, top_k_case_ids)``.
    """
    neighbour_ids, neighbour_scores = retrieve(
        query, df, tfidf_vectorizer, tfidf_matrix, bert_embeddings, k
    )

    # First matching row per case id supplies that case's solution text.
    neighbour_solutions = []
    for case_id in neighbour_ids:
        matching_rows = df[df['case_id'] == case_id]
        neighbour_solutions.append(matching_rows['solution'].iloc[0])

    # Accumulate similarity per distinct solution text.
    score_by_solution = {}
    for candidate, score in zip(neighbour_solutions, neighbour_scores):
        if candidate not in score_by_solution:
            score_by_solution[candidate] = score
        else:
            score_by_solution[candidate] += score

    best_solution = max(score_by_solution, key=lambda text: score_by_solution.get(text))
    return best_solution, neighbour_ids

# Main execution
def main():
    """Run the demo end to end: load cases, vectorize, predict, save.

    Loads the processed cases CSV, builds both the TF-IDF and the transformer
    representations of ``combined_text``, runs ``predict_outcome`` for five
    demo queries (printing the prediction next to its ground truth), and
    writes the predictions to a CSV file.
    """
    # Local import: only this entry point needs filesystem helpers.
    import os

    file_path = "/content/drive/MyDrive/Tugasbesar/data/processed/cases.csv"
    # Single source of truth for the output location (used for saving and for
    # the confirmation message).
    output_path = "/content/drive/MyDrive/Tugasbesar/data/results/predictions.csv"

    # Load data
    df = load_and_preprocess_data(file_path)

    # Get TF-IDF and BERT vectors
    corpus_texts = df['combined_text'].tolist()
    tfidf_vectorizer, tfidf_matrix = get_tfidf_vectors(corpus_texts)
    bert_embeddings = get_bert_embeddings(corpus_texts)

    # Demo queries
    demo_queries = [
        {"query_id": 1, "query": "Sengketa perdata mengenai hutang piutang dengan penyelesaian damai", "ground_truth": "Para pihak dihukum untuk mentaati akta perdamaian"},
        {"query_id": 2, "query": "Wanprestasi pembayaran atas perjanjian investasi", "ground_truth": "Tergugat dinyatakan wanprestasi dan wajib membayar hutang"},
        {"query_id": 3, "query": "Gugatan perdata dengan bukti transfer dan pengakuan hutang", "ground_truth": "Tergugat wajib membayar hutang pokok beserta bunga"},
        {"query_id": 4, "query": "Pencabutan gugatan setelah mediasi tidak berhasil", "ground_truth": "Perkara dicabut dari register"},
        {"query_id": 5, "query": "Sengketa perdata dengan bukti setoran bank", "ground_truth": "Tergugat dinyatakan wanprestasi dan wajib membayar hutang"}
    ]

    # Run predictions
    results = []
    for query_info in demo_queries:
        query_id = query_info['query_id']
        query = query_info['query']
        ground_truth = query_info['ground_truth']
        predicted_solution, top_k_case_ids = predict_outcome(query, df, tfidf_vectorizer, tfidf_matrix, bert_embeddings)
        print(f"Query ID: {query_id}")
        print(f"Query: {query}")
        print(f"Predicted Solution: {predicted_solution}")
        print(f"Top-5 Case IDs: {top_k_case_ids}")
        print(f"Ground Truth: {ground_truth}")
        print()
        results.append({
            "query_id": query_id,
            "predicted_solution": predicted_solution,
            "top_5_case_ids": top_k_case_ids
        })

    # Save results to CSV. Create the results directory first so a missing
    # folder does not throw away the (expensive) predictions computed above.
    results_df = pd.DataFrame(results)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    results_df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")

# Run the demo pipeline when this code is executed directly (script or
# notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Query ID: 1
Query: Sengketa perdata mengenai hutang piutang dengan penyelesaian damai
Predicted Solution: tidak berlaku pasal 1266 kitabundang-undang hukum perdata, khususnya yang mengatur keharusan untukmengajukan permohonan pembatalan akta melalui pengadilan negeri, dantergugat seketika dan sekaligus mengembalilan seluruh uang perdamaian yangtelah dibayarkan pihak kedua kepada pihak kesatu
Top-5 Case IDs: [78, 117, 39, 45, 29]
Ground Truth: Para pihak dihukum untuk mentaati akta perdamaian

Query ID: 2
Query: Wanprestasi pembayaran atas perjanjian investasi
Predicted Solution: 118 ayat (4) hetherziene indonesisch reglement (hir) menyatakan:“jika ada suatu tempat tinggal yang dipilih dengan surat akta, makapenggugat kalau mau boleh mengajukan kepada ketua pengadilannegeri yang dalam daerah hukumnya terletak tempat tinggal yang dipilihitu
Top-5 Case IDs: [113, 117, 45, 88, 78]
Ground Truth: Tergugat dinyatakan wanprestasi dan wajib membayar hutang

Query ID: 3
Query: Gugatan perdata de