In [None]:
import pandas as pd
# Load the train / validation / test splits from CSV files in the working
# directory, in that order.
df_train, df_valid, df_test = map(
    pd.read_csv, ("train.csv", "valid.csv", "test.csv")
)
def to_qa(row):
    """Project one dataset row onto the question/answer fields used downstream.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``question``, ``distractor1``..``distractor3`` and
            ``correct_answer``; ``support`` is optional.

    Returns:
        dict with keys ``question``, ``distractor1``, ``distractor2``,
        ``distractor3``, ``correct_answer`` and ``support`` (in that order).
        ``support`` is the empty string when the row has no such field.
    """
    distractor_keys = ("distractor1", "distractor2", "distractor3")
    wrong_answers = [row[key] for key in distractor_keys]
    right_answer = row['correct_answer']

    record = {"question": row['question']}
    record.update(zip(distractor_keys, wrong_answers))
    record["correct_answer"] = right_answer
    record["support"] = "" if 'support' not in row else row['support']
    return record
# Convert every row of each split into a QA record (one dict per row) ...
df_train_processed, df_valid_processed, df_test_processed = (
    split.apply(to_qa, axis=1) for split in (df_train, df_valid, df_test)
)
# ... then rebuild each split as a DataFrame holding only the QA fields.
df_train, df_valid, df_test = (
    pd.DataFrame(records.tolist())
    for records in (df_train_processed, df_valid_processed, df_test_processed)
)


In [None]:
import re
def clean_text(text):
    """Normalise text before it is embedded.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, runs of whitespace are collapsed to a single space
    and leading/trailing whitespace is stripped.

    Args:
        text: The value to clean. Normally a ``str``; missing values
            (``None`` or a float NaN, which is what an empty CSV cell becomes)
            are accepted, and any other non-string is converted with ``str()``.

    Returns:
        The cleaned string; ``""`` for missing values.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Add a normalised copy of each question to every split; these cleaned
# strings are what gets embedded later.
for _split in (df_train, df_valid, df_test):
    _split["Clean_Question"] = _split["question"].apply(clean_text)


In [None]:

from huggingface_hub import login
# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable, falling back to an interactive prompt
# whose input is not echoed. The token that was previously hardcoded here must
# be treated as leaked and revoked in the Hugging Face account settings.
import os
from getpass import getpass

login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))



In [None]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model once and encode the cleaned questions of
# each split (train, validation, test) with a progress bar.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
train_embeddings, valid_embeddings, test_embeddings = (
    model.encode(split["Clean_Question"].tolist(), show_progress_bar=True)
    for split in (df_train, df_valid, df_test)
)


Batches:   0%|          | 0/365 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [36]:
import numpy as np
# Persist each split's embedding matrix so it can be reloaded without
# re-encoding.
for _path, _vectors in (
    ("train_embeddings.npy", train_embeddings),
    ("valid_embeddings.npy", valid_embeddings),
    ("test_embeddings.npy", test_embeddings),
):
    np.save(_path, _vectors)


In [37]:
import faiss
# Build a FAISS IndexFlatL2 over the *training* question embeddings and write
# it to disk.
# Embedding width: second axis of the train embedding matrix.
dim = train_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
# Only the training vectors are added, so ids returned by index.search refer
# to rows of train_embeddings (and therefore of df_train) -- assuming FAISS
# assigns sequential ids in insertion order; TODO confirm against FAISS docs.
index.add(train_embeddings)
faiss.write_index(index, "faiss_sciq_index.faiss")


In [None]:
def get_similar_questions(query, model, index, df, k=5):
    """Print the ``k`` indexed questions closest to ``query``.

    Args:
        query: Free-text question; it is normalised with ``clean_text``
            before being embedded.
        model: Object whose ``encode(list_of_str)`` returns one embedding per
            input string.
        index: Object whose ``search(embeddings, k)`` returns a
            ``(distances, ids)`` pair, each holding one row per query.
        df: DataFrame whose row positions correspond to the ids stored in
            ``index``. Must provide the columns ``question``,
            ``correct_answer`` and ``distractor1``..``distractor3``.
        k: Number of neighbours to request from the index.

    Returns:
        None. Each hit is printed; ids that do not map to a row of ``df``
        produce a warning line instead.
    """
    q_clean = clean_text(query)
    q_emb = model.encode([q_clean])
    distances, idxs = index.search(q_emb, k)
    for i, (dist, idx) in enumerate(zip(distances[0], idxs[0]), start=1):
        # FAISS pads the result with id -1 when fewer than k neighbours exist.
        # A bare `idx < len(df)` would accept -1 and `iloc[-1]` would then
        # silently print the last row, so the lower bound is checked as well.
        if 0 <= idx < len(df):
            row = df.iloc[idx]
            print(f"\nResult {i} (dist={dist:.4f}):")
            print("Q:", row["question"])
            print("A:", row["correct_answer"])
            print("Distractors:", row["distractor1"], ",", row["distractor2"], ",", row["distractor3"])
        else:
            print(f"Warning: Index {idx} out of bounds for DataFrame.")


In [None]:
if __name__ == "__main__":
    # Interactive lookup loop: read a question, print its nearest neighbours
    # from the index, and repeat until the user types 'exit' or 'quit'.
    print("=== SciQ Chatbot (type 'exit' to quit) ===")
    while True:
        try:
            user_q = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly.
            break
        if user_q.lower() in ("exit", "quit"):
            break
        if not user_q:
            # Nothing to search for; ask again.
            continue
        # The index was built from train_embeddings, so the ids it returns are
        # row positions in df_train. Looking them up in df_test would print
        # unrelated rows and report most ids as out of bounds.
        get_similar_questions(user_q, model, index, df_train, k=5)
    print("Goodbye!")