In [None]:
!pip install nltk faiss-cpu sentence-transformers transformers datasets


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [None]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources, one download call per package, in a fixed order.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Module-level helpers: the English stopword set and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise ``text`` into a space-joined string of lemmatized tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the result
    is lower-cased and split on whitespace, tokens found in the module-level
    ``stop_words`` set are dropped, and each remaining token is passed through
    the module-level ``lemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()

    kept_tokens = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(kept_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from datasets import load_dataset

# "Cornell-University/arxiv" is rejected by the Hub with DatasetNotFoundError,
# so it cannot be loaded through `load_dataset`.
# NOTE(review): "librarian-bots/arxiv-metadata-snapshot" is believed to be a Hub
# mirror of the arXiv metadata with string `title`, `abstract` and `categories`
# fields — confirm the id and schema on the Hub before relying on it.
ARXIV_DATASET_ID = "librarian-bots/arxiv-metadata-snapshot"

dataset = load_dataset(
    ARXIV_DATASET_ID,
    split="train[:200]"
)

# Fail here, with a readable message, if the records lack the fields this
# notebook reads from each paper.
missing_columns = {"title", "abstract", "categories"} - set(dataset[0].keys())
assert not missing_columns, f"dataset is missing expected columns: {missing_columns}"

print(dataset[0].keys())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetNotFoundError: Dataset 'Cornell-University/arxiv' doesn't exist on the Hub or cannot be accessed.

In [None]:
# Keep the papers whose `categories` string begins with "cs" and store the
# preprocessed "title abstract" text for each one.
paper_texts = [
    preprocess_text(paper["title"] + " " + paper["abstract"])
    for paper in dataset
    if paper["categories"].startswith("cs")
]

print("Total CS papers:", len(paper_texts))


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Stop early with an actionable message when there is nothing to index;
# otherwise the `embeddings.shape[1]` lookup below is what fails, with an
# IndexError that does not point at the real cause.
if not paper_texts:
    raise ValueError(
        "paper_texts is empty: no paper matched the category filter, "
        "so there is nothing to embed or index."
    )

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(paper_texts)

# One row per paper; the second axis is the embedding width the index needs.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

print("Papers indexed:", index.ntotal)


In [None]:
def retrieve_paper(query):
    """Return the stored paper text closest to ``query`` in the FAISS index.

    The query is cleaned with ``preprocess_text``, embedded with the
    module-level ``model``, and the single nearest neighbour is looked up in
    the module-level ``index``.

    Args:
        query: The user's question as a string.

    Returns:
        The entry of ``paper_texts`` at the position of the nearest neighbour.
    """
    cleaned_query = preprocess_text(query)
    query_vector = np.array(model.encode([cleaned_query]))

    # Ask for exactly one neighbour; the distances are not used.
    _, neighbour_ids = index.search(query_vector, 1)
    best_position = neighbour_ids[0][0]
    return paper_texts[best_position]


In [None]:
from transformers import pipeline

# Summarization pipeline backed by the named checkpoint.
SUMMARIZER_CHECKPOINT = "facebook/bart-large-cnn"
summarizer = pipeline(task="summarization", model=SUMMARIZER_CHECKPOINT)

def summarize_text(text):
    """Summarise ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: The passage to condense.

    Returns:
        The ``summary_text`` string of the first result the pipeline returns.
    """
    results = summarizer(
        text,
        max_length=120,
        min_length=40,
        do_sample=False,  # no sampling during generation
        truncation=True,  # clip over-long inputs to the model's limit instead of failing
    )
    return results[0]["summary_text"]


In [None]:
def arxiv_expert_chatbot(query):
    """Answer ``query`` by summarising the best-matching stored paper.

    Args:
        query: The user's question as a string.

    Returns:
        The summary produced by ``summarize_text`` for the first 1024
        characters of the text returned by ``retrieve_paper``.
    """
    best_paper = retrieve_paper(query)
    excerpt = best_paper[:1024]  # only the leading 1024 characters are summarised
    return summarize_text(excerpt)


In [None]:
# Demo: send one question through retrieval and summarisation, then show both sides.
query = "Explain neural networks in computer science"
response = arxiv_expert_chatbot(query)

print(f"User: {query}", f"Expert Bot: {response}", sep="\n")
