In [1]:
# faiss-cpu supplies the `faiss` module imported below. The %pip magic installs
# into the environment of the running kernel, whereas `!pip` runs in a subshell
# that may resolve to a different interpreter.
%pip install faiss-cpu



In [2]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import pandas as pd

In [3]:
# def load_data(json_file):
#     try:
#         with open(json_file, 'r') as file:
#             data = [json.loads(line.strip()) for line in file]  # Line-by-line JSON parsing
#         return pd.DataFrame(data)
#     except json.JSONDecodeError as e:
#         print(f"JSONDecodeError: {e}")
#         return pd.DataFrame()  # Return an empty DataFrame in case of an error

# # Step 2: Preprocess Data
# def preprocess_data(df):
#     df["Content"] = df["Title"] + ". " + df["Abstract"] + ". " + df["Keywords"]
#     return df

# Step 1: Load the model


In [4]:
# Instantiate the SentenceTransformer encoder named 'all-MiniLM-L6-v2'.
# NOTE: `model` is assigned again further down from the return value of
# build_faiss_index(), which by default constructs a SentenceTransformer
# from this same model name.
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Step 2: Data Preparation

In [5]:
# Small in-memory sample corpus: five paper records, each carrying a
# "Paper ID", "Title", "Abstract" and "Keywords" (one comma-separated string).
arxiv_documents = [
    {
        "Paper ID": "arXiv:2101.00001",
        "Title": "Transformers in NLP",
        "Abstract": "This paper reviews the role of transformers in natural language processing.",
        "Keywords": "Transformers, NLP, Deep Learning",
    },
    {
        "Paper ID": "arXiv:2101.00002",
        "Title": "Quantum Computing Basics",
        "Abstract": "An introduction to the principles of quantum computing and its potential applications.",
        "Keywords": "Quantum Computing, Physics, Technology",
    },
    {
        "Paper ID": "arXiv:2101.00003",
        "Title": "Climate Change Impacts",
        "Abstract": "Examining the effects of climate change on global ecosystems and weather patterns.",
        "Keywords": "Climate Change, Environment, Ecosystems",
    },
    {
        "Paper ID": "arXiv:2101.00004",
        "Title": "AI in Healthcare",
        "Abstract": "Exploring the applications of artificial intelligence in healthcare for diagnosis and treatment.",
        "Keywords": "AI, Healthcare, Diagnosis",
    },
    {
        "Paper ID": "arXiv:2101.00005",
        "Title": "Advances in Computer Vision",
        "Abstract": "A survey of recent advancements in computer vision techniques and applications.",
        "Keywords": "Computer Vision, AI, Image Processing",
    },
]

# Convert to DataFrame for Processing


In [6]:
def get_arxiv_sample_data():
    """Return the sample records as a pandas DataFrame.

    The frame is built from the notebook-level ``arxiv_documents`` list, so
    each record becomes one row and each record key becomes a column.
    """
    sample_frame = pd.DataFrame(data=arxiv_documents)
    return sample_frame

# Generate Embeddings and Create FAISS Index


In [7]:
def build_faiss_index(documents, model_name="all-MiniLM-L6-v2", model=None):
    """Embed the documents and load them into a flat FAISS L2 index.

    Each document is embedded from the text ``"<Title>. <Abstract>"``.

    Args:
        documents: Non-empty iterable of mappings that provide "Title" and
            "Abstract" keys.
        model_name: Name handed to ``SentenceTransformer`` when ``model`` is
            not supplied.
        model: Optional, already constructed encoder exposing
            ``encode(texts, convert_to_numpy=True)``. When given, it is reused
            instead of constructing a new ``SentenceTransformer``.

    Returns:
        tuple: ``(index, model)`` - the populated ``faiss.IndexFlatL2`` and the
        encoder that produced the embeddings.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    documents = list(documents)
    if not documents:
        # Without this guard the failure surfaces later as an opaque
        # IndexError when reading the embedding dimension.
        raise ValueError("documents must contain at least one entry to build an index")

    if model is None:
        model = SentenceTransformer(model_name)

    # Combine Title and Abstract for embedding
    contents = [f"{doc['Title']}. {doc['Abstract']}" for doc in documents]
    doc_embeddings = model.encode(contents, convert_to_numpy=True)
    # FAISS consumes C-contiguous float32 matrices; coerce in case the encoder
    # hands back another dtype or memory layout.
    doc_embeddings = np.ascontiguousarray(doc_embeddings, dtype=np.float32)

    # Create FAISS index
    dimension = doc_embeddings.shape[1]  # Embedding dimension
    index = faiss.IndexFlatL2(dimension)  # L2 distance: smaller means closer
    index.add(doc_embeddings)

    return index, model

# Index the sample corpus. This also rebinds the notebook-level `model` to the
# encoder instance returned by build_faiss_index.
index, model = build_faiss_index(documents=arxiv_documents)

# Perform Semantic Search

In [8]:
def semantic_search(query, model, index, documents, top_k=3):
    """Return the documents nearest to ``query`` according to ``index``.

    Args:
        query: Query text to embed.
        model: Encoder exposing ``encode(texts, convert_to_numpy=True)``.
        index: Index exposing ``search(embeddings, k)`` that returns
            ``(distances, indices)``; positions in ``indices`` are used to
            look up entries of ``documents``.
        documents: Sequence of mappings with "Paper ID", "Title" and
            "Abstract" keys, in the same order the index was populated.
        top_k: Maximum number of hits to return; must be at least 1.

    Returns:
        list[dict]: At most ``top_k`` hits in the order reported by the index.
        Each hit has "Paper ID", "Title", "Abstract" and "Distance" (a plain
        Python float, so the result can be JSON-serialized). For the
        ``IndexFlatL2`` built in this notebook FAISS reports squared L2
        distances.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for distance, doc_idx in zip(distances[0], indices[0]):
        # FAISS pads the result with label -1 when the index holds fewer than
        # top_k vectors. Without this check documents[-1] would silently
        # return the last document as a bogus hit.
        if doc_idx < 0:
            continue
        doc = documents[doc_idx]
        results.append(
            {
                "Paper ID": doc["Paper ID"],
                "Title": doc["Title"],
                "Abstract": doc["Abstract"],
                "Distance": float(distance),
            }
        )
    return results

# Example Query

In [9]:
# Example query, searched with the default top_k against the index and
# documents defined earlier in the notebook.
query = "Applications of AI in medicine"
results = semantic_search(
    query=query,
    model=model,
    index=index,
    documents=arxiv_documents,
)

# Display Results

In [10]:
# Echo the query, then print one line per hit: its title and its distance
# formatted to four decimal places.
print("Query:", query)
print("Top Results:")
for hit in results:
    title, distance = hit["Title"], hit["Distance"]
    print(f" - {title} (Distance: {distance:.4f})")

Query: Applications of AI in medicine
Top Results:
 - AI in Healthcare (Distance: 0.4330)
 - Quantum Computing Basics (Distance: 1.6562)
 - Advances in Computer Vision (Distance: 1.7169)
