In [3]:
import chromadb
from sentence_transformers import SentenceTransformer
from groq import Groq
import requests
from bs4 import BeautifulSoup
import re
from typing import List, Dict
import PyPDF2
import os

class RAGSejarahIndonesia:
    def __init__(self):
        # Inisialisasi model embedding
        self.embedding_model = SentenceTransformer('BAAI/bge-small-en-v1.5')

        # Inisialisasi ChromaDB
        self.chroma_client = chromadb.PersistentClient(path="database_sejarah")
        self.collection = self.chroma_client.get_or_create_collection("sejarah_indonesia")

        # Inisialisasi Groq client
        self.llm_client = Groq(api_key="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")
        # Daftar sumber data
        self.sumber_data = {
            "penjajahan_belanda": "https://id.wikipedia.org/wiki/Hindia_Belanda",

        }

    def ekstrak_teks_dari_url(self, url: str) -> str:
        """Mengekstrak teks artikel dari URL Wikipedia"""
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Ambil semua paragraf dari konten artikel
            paragraphs = soup.find_all('p')
            teks = ' '.join([p.get_text() for p in paragraphs])

            # Bersihkan teks
            teks = re.sub(r'\[\d+\]', '', teks)  # Hapus referensi [1], [2], dst
            return teks
        except Exception as e:
            print(f"Gagal ekstrak dari {url}: {str(e)}")
            return ""

    def proses_dokumen(self, teks: str, chunk_size: int = 500) -> List[str]:
        """Memotong teks menjadi bagian-bagian kecil"""
        words = teks.split()
        chunks = []

        for i in range(0, len(words), chunk_size):
            chunk = ' '.join(words[i:i+chunk_size])
            chunks.append(chunk)

        return chunks

    def isi_basis_data(self):
        """Mengisi basis data dengan dokumen sejarah"""
        print("Mengisi basis data dari sumber online...")

        all_documents = []
        all_metadata = []

        for judul, url in self.sumber_data.items():
            teks = self.ekstrak_teks_dari_url(url)
            if teks:
                chunks = self.proses_dokumen(teks)

                for i, chunk in enumerate(chunks):
                    all_documents.append(chunk)
                    all_metadata.append({"sumber": url, "judul": judul, "chunk_id": i})

        # Generate embeddings
        embeddings = self.embedding_model.encode(all_documents).tolist()

        # Buat ID unik untuk setiap chunk
        ids = [f"doc_{i}" for i in range(len(all_documents))]

        # Tambahkan ke koleksi ChromaDB
        self.collection.add(
            embeddings=embeddings,
            documents=all_documents,
            metadatas=all_metadata,
            ids=ids
        )

        print(f"Berhasil menambahkan {len(all_documents)} dokumen ke basis data")

    def tambah_dokumen_manual(self, teks: str, metadata: dict = None):
        """Menambahkan dokumen manual ke basis data"""
        chunks = self.proses_dokumen(teks)

        for i, chunk in enumerate(chunks):
            embedding = self.embedding_model.encode(chunk).tolist()
            doc_id = f"manual_{len(self.collection.get()['ids']) + i + 1}"

            meta = metadata.copy() if metadata else {}
            meta["chunk_id"] = i

            self.collection.add(
                embeddings=[embedding],
                documents=[chunk],
                metadatas=[meta],
                ids=[doc_id]
            )

    def cari_dokumen(self, query: str, top_k: int = 3) -> List[Dict]:
        """Mencari dokumen relevan berdasarkan query"""
        query_embedding = self.embedding_model.encode(query).tolist()

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k
        )

        # Format hasil
        dokumen_relevan = []
        for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
            dokumen_relevan.append({
                "konten": doc,
                "sumber": meta.get("sumber", "tidak diketahui"),
                "judul": meta.get("judul", "tidak diketahui")
            })

        return dokumen_relevan

    def tanya(self, pertanyaan: str) -> str:
        """Menjawab pertanyaan berdasarkan basis pengetahuan"""
        # Cari dokumen relevan
        dokumen_relevan = self.cari_dokumen(pertanyaan)

        if not dokumen_relevan:
            return "Maaf, tidak menemukan informasi yang relevan."

        # Gabungkan konteks
        konteks = "\n\n".join([f"Sumber: {doc['judul']}\n{doc['konten']}" for doc in dokumen_relevan])

        # Format prompt untuk LLM
        prompt = f"""Anda adalah ahli sejarah Indonesia. Jawablah pertanyaan berikut berdasarkan konteks yang diberikan. Jika tidak tahu, katakan tidak tahu.

Konteks:
{konteks}

Pertanyaan: {pertanyaan}
Jawaban:"""

        try:
            # Minta jawaban dari LLM
            response = self.llm_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "Anda adalah asisten yang membantu menjawab pertanyaan tentang sejarah Indonesia."},
                    {"role": "user", "content": prompt}
                ],
                model="llama3-70b-8192",
                temperature=0.3
            )

            return response.choices[0].message.content
        except Exception as e:
            return f"Terjadi kesalahan: {str(e)}"

# Contoh penggunaan
if __name__ == "__main__":
    # Inisialisasi sistem
    sistem = RAGSejarahIndonesia()

    # Isi basis data (hanya dijalankan pertama kali)
    if not os.path.exists("database_sejarah"):
        sistem.isi_basis_data()

    # Contoh interaksi
    while True:
        print("\nSilakan ajukan pertanyaan tentang sejarah kemerdekaan Indonesia")
        print("Contoh: 'Apa peran NICA setelah kemerdekaan?'")
        print("Ketik 'exit' untuk keluar")

        pertanyaan = input("\nPertanyaan Anda: ")

        if pertanyaan.lower() == 'exit':
            break

        jawaban = sistem.tanya(pertanyaan)
        print("\nJawaban:")
        print(jawaban)

        # Tampilkan sumber referensi
        dokumen_relevan = sistem.cari_dokumen(pertanyaan)
        print("\nSumber referensi:")
        for doc in dokumen_relevan:
            print(f"- {doc['judul']} ({doc['sumber']})")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Silakan ajukan pertanyaan tentang sejarah kemerdekaan Indonesia
Contoh: 'Apa peran NICA setelah kemerdekaan?'
Ketik 'exit' untuk keluar

Pertanyaan Anda: apa peranan nica setelah kemerdekaan?

Jawaban:
Maaf, tidak menemukan informasi yang relevan.

Sumber referensi:

Silakan ajukan pertanyaan tentang sejarah kemerdekaan Indonesia
Contoh: 'Apa peran NICA setelah kemerdekaan?'
Ketik 'exit' untuk keluar

Pertanyaan Anda: exit


In [2]:
%pip install chromadb sentence-transformers groq beautifulsoup4 pypdf2

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting groq
  Downloading groq-0.30.0-py3-none-any.whl.metadata (16 kB)
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.35.0-py