In [18]:
import pandas as pd


# Load the Quora question/answer dataset (JSON Lines, one record per line)
# directly from the Hugging Face Hub.
df = pd.read_json(
    "hf://datasets/toughdata/quora-question-answer-dataset/Quora-QuAD.jsonl",
    lines=True,
)

In [19]:
# Keep only the first 10,000 rows so embedding and indexing stay manageable.
df = df.iloc[:10000]

In [21]:
# `uuid` is imported here because the notebook's only other `import uuid`
# lives in a later cell; without this line a fresh Restart & Run All fails
# on this cell with NameError.
import uuid

# Give every row a random UUID4 string; it is used later as the document id.
df["doc_id"] = [str(uuid.uuid4()) for _ in range(len(df))]
df

Unnamed: 0,question,answer,doc_id
0,Why whenever I get in the shower my girlfriend...,Isn’t it awful? You would swear that there was...,39fadfa4-02e1-453c-87ec-9f75e94e2757
1,"What is a proxy, and how can I use one?",A proxy server is a system or router that prov...,20c9156a-cd78-434d-8ffc-0e1b07b7d63d
2,"What song has the lyrics ""someone left the cak...",MacArthur's Park\n,15f8b621-6595-44fd-a2cb-41bb38ea31fa
3,I am the owner of an adult website called http...,Don't let apps that are liers put adds on your...,b138477a-7372-46e2-8075-b95cfda539bc
4,Does the Bible mention anything about a place ...,St. John in the book of Revelation mentions an...,8949c697-690c-4c83-b3a9-ff612818f213
...,...,...,...
9995,Are there any real differences between covert ...,"In my opinion, it's the type of facade and rep...",dee9281b-5f8e-43d0-ba08-08fe4d2e60fc
9996,How do I get a mentor for usmle?,First of all check out [LINKED_TEXT: medpox] [...,dec9bdbd-4214-4d44-b793-3d57b7194b60
9997,"Are teachers who tell students that ""math is r...",Yes. They are.\n,a1b87ad9-af5b-4709-a677-52c4c37ae17d
9998,Which cryptocurrency has the lowest transactio...,Which Cryptocurrencies Have the Lowest Transac...,17a3dafa-6481-4aa6-add1-6e4657ee4d93


In [24]:
# Flatten the frame into plain dict records for indexing.
# Walking the three columns in lockstep yields the same values as
# DataFrame.iterrows() without building a Series object for every row.
# `doc_id` and `answer` are coerced to str (as before); `question` is kept as-is.
docs = [
    {"doc_id": str(doc_id), "question": question, "answer": str(answer)}
    for doc_id, question, answer in zip(df["doc_id"], df["question"], df["answer"])
]

# Preview a few records instead of rendering the whole list into the output.
docs[:3]


[{'doc_id': '39fadfa4-02e1-453c-87ec-9f75e94e2757',
  'question': 'Why whenever I get in the shower my girlfriend want to join?',
  'answer': 'Isn’t it awful? You would swear that there wasn’t enough hot water to go around!\n'},
 {'doc_id': '20c9156a-cd78-434d-8ffc-0e1b07b7d63d',
  'question': 'What is a proxy, and how can I use one?',
  'answer': 'A proxy server is a system or router that provides a gateway between users and the internet. Therefore, it helps prevent cyber attackers from entering a private network. It is a server, referred to as an “intermediary” because it goes between end-users and the web pages they visit online.\n When a computer connects to the internet, it uses an IP address. This is similar to your home’s street address, telling incoming data where to go and marking outgoing data with a return address for other devices to authenticate. A proxy server is essentially a computer on the internet that has an IP address of its own.\n How a Proxy Works\nBecau

In [51]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import uuid

# Create the client in this cell: `es` is not assigned by any earlier cell,
# so on a fresh kernel the listing below would fail with NameError.
# The URL is the same node the index-creation cell connects to.
es = Elasticsearch("http://143.198.220.249:9200")

# List every index on the cluster with its health and document count.
indices = es.cat.indices(format="json")
for idx in indices:
    print(f"Index: {idx['index']}, Health: {idx['health']}, Docs Count: {idx['docs.count']}")

Index: read_me, Health: yellow, Docs Count: 1


In [52]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import uuid

# Connect to Elasticsearch
es = Elasticsearch("http://143.198.220.249:9200")

index_name = "my-index"

# Index mapping for question/answer documents.
# "dynamic": "strict" rejects any field that is not declared below.
mapping = {
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "doc_id": {"type": "keyword"},
            # Full-text field with an exact-match `keyword` sub-field.
            "question": {
                "type": "text",
                "fields": {
                    "keyword": {"type": "keyword", "ignore_above": 256},
                },
            },
            "answer": {"type": "text"},
            # Vector field holding the question embedding.
            "embedding": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        },
    }
}

# Drop the index if it already exists so the cell always starts from scratch
index_already_exists = es.indices.exists(index=index_name)
if index_already_exists:
    es.indices.delete(index=index_name)

# Create the new index (left as the last expression so the response is displayed)
es.indices.create(index=index_name, body=mapping)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my-index'})

In [53]:
from sentence_transformers import SentenceTransformer

# Load the MiniLM sentence-embedding model that get_embedding() encodes with
model = SentenceTransformer("all-MiniLM-L6-v2")

def get_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return the result.

    The value is whatever ``model.encode(text)`` returns, passed through
    unchanged.
    """
    return model.encode(text)


In [54]:
from tqdm import tqdm

# Bulk helper from the already-used elasticsearch package; imported locally
# so this cell is self-contained.
from elasticsearch.helpers import bulk

# Lazily build one bulk action per document. Each question is still embedded
# individually with get_embedding(); only the transport changes: documents are
# sent in batches instead of one HTTP request per document.
index_actions = (
    {
        "_index": index_name,
        "_id": doc["doc_id"],
        "_source": {
            "doc_id": doc["doc_id"],
            "question": doc["question"],
            "answer": doc["answer"],
            # Embed the question text; the stored vector must be a plain list
            "embedding": get_embedding(doc["question"]).tolist(),
        },
    }
    for doc in tqdm(docs, desc="Indexing documents")
)

# bulk() consumes the generator chunk by chunk and, by default, raises if any
# document fails to index. The result is assigned so the cell shows no extra output.
indexed_count, bulk_errors = bulk(es, index_actions)


Indexing documents: 100%|██████████| 10000/10000 [05:57<00:00, 27.94it/s]


In [55]:
from collections import defaultdict

query_text = "sex"
query_embedding = get_embedding(query_text)

# Score every document (match_all) with the cosine similarity between the
# query vector and the stored `embedding` field, plus the base `_score`.
query = {
    "size": 20,  # Increase size to get more results back
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + _score",
                "params": {"query_vector": query_embedding.tolist()},
            },
        }
    },
}

response = es.search(index=index_name, body=query)

# Dictionary that groups answers by their question text
grouped_results = defaultdict(list)

# Collect every hit under its question
for hit in response["hits"]["hits"]:
    source = hit["_source"]
    question = source.get("question", "Unknown question")
    answer = source.get("answer", "No answer")
    grouped_results[question].append(answer.strip())

# Print the grouped results, one block per question
separator = "-" * 60
for question, answers in grouped_results.items():
    print(f"Question: {question}")
    print("Answers:")
    for ans in answers:
        print(f"- {ans}")
    print(separator)


Question: Other than cheating, what's the biggest betrayal in a relationship?
Answers:
- 1. Pretending to be someone you’re not or lying about who you are as a way to accomodate to your partners wishes, wants and desires and to gain their approval.
 It’s betrayal because in doing this, people are putting on a mask and essentially scam their partner.
 It’s only a matter of time until the truth about them is revealed. That’s when their partner leaves.
 A more powerful way to go about this is:
Learning how to be grounded in your own validation/frame.Accepting that we’re not meant to be friends or partners with everyone.Overcoming the fear of rejection and accepting that when someone you want to be with doesn’t want to be with you, you got to let them go and instead be open to attract partners who love and feel wild sexually about the real and authentic you.Breaking patterns of codependency & neediness and learning how to be interdependent.Building a relationship with someone w