In [7]:
from elasticsearch import Elasticsearch
import torch

In [None]:
es = Elasticsearch("http://localhost:9200")

index_name = "  "

mapping = {
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "doc_id": {"type": "keyword"},
            "text": {"type": "text"},
            "embedding": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

# Hapus jika sudah ada
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

es.indices.create(index=index_name, body=mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'doc-index'})

In [9]:
from sentence_transformers import SentenceTransformer

# Load MiniLM model
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    # Langsung encode teks ke dalam bentuk vektor
    embedding = model.encode(text)
    return embedding


In [10]:
import pandas as pd

dataset = pd.read_csv("quora_dataset.csv")

dataset = dataset.head(10000)

docs = [{"doc_id": str(row["doc_id"]), "text": row["text"]} for _, row in dataset.iterrows()]

docs 


[{'doc_id': '1',
  'text': 'What is the step by step guide to invest in share market in india?'},
 {'doc_id': '2',
  'text': 'What is the step by step guide to invest in share market?'},
 {'doc_id': '3',
  'text': 'What is the story of Kohinoor (Koh-i-Noor) Diamond?'},
 {'doc_id': '4',
  'text': 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?'},
 {'doc_id': '5',
  'text': 'How can I increase the speed of my internet connection while using a VPN?'},
 {'doc_id': '6',
  'text': 'How can Internet speed be increased by hacking through DNS?'},
 {'doc_id': '7', 'text': 'Why am I mentally very lonely? How can I solve it?'},
 {'doc_id': '8',
  'text': 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'},
 {'doc_id': '9',
  'text': 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?'},
 {'doc_id': '10', 'text': 'Which fish would survive in salt water?'},
 {'doc_id': '11',
  'text': 'Astrology: I am a Capricorn Su

In [11]:
from tqdm import tqdm

for doc in tqdm(docs, desc="Indexing documents"):
    embedding = get_embedding(doc["text"])

    # Index ke Elasticsearch
    es.index(
        index=index_name,
        id=doc["doc_id"],
        document={
            "doc_id": doc["doc_id"],
            "text": doc["text"],
            "embedding": embedding.tolist()
        }
    )


Indexing documents: 100%|██████████| 10000/10000 [10:40<00:00, 15.60it/s]


In [20]:
query_text = "bitcoin"
query_embedding = get_embedding(query_text)

query = {
    "size": 5,
    "query": {
        "script_score": {
            "query": {
                "match": {
                    "text": query_text
                }
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + _score",
                "params": {
                    "query_vector": query_embedding.tolist()  # Vector query dalam bentuk list
                }
            }
        }
    }
}

response = es.search(index=index_name, body=query)

for hit in response["hits"]["hits"]:
    doc_id = hit["_source"]["doc_id"]
    score = hit["_score"]
    text = hit["_source"]["text"]
    print(f"Doc ID: {doc_id}\nScore: {score}\nText: {text}\n{'-'*50}")



Doc ID: 8044
Score: 9.793897
Text: How can Bitcoin be hacked?
--------------------------------------------------
Doc ID: 8045
Score: 8.89599
Text: Is Bitcoin mining still profitable in 2016?
--------------------------------------------------
Doc ID: 4733
Score: 7.923117
Text: What is an intuitive explanation of how bitcoin mining works?
--------------------------------------------------
Doc ID: 715
Score: 7.5339375
Text: Could all bitcoin mining be done by a single Raspberry Pi?
--------------------------------------------------
Doc ID: 1822
Score: 7.3731337
Text: What is the risk of selling bitcoin and accepting credit card payment?
--------------------------------------------------
