## Dependencies

In [1]:
import pandas as pd
from collections import Counter
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from sentence_transformers import SentenceTransformer, util
import torch

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to C:\Users\MSI-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\MSI-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\MSI-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\MSI-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Text Processing

In [2]:
def preprocess_text(text):
    if not isinstance(text, str):  # Handle cases where text is NaN or not a string
        return ''
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [3]:
data = pd.read_csv('updated500k_arXivDataset.csv')
data.tail(4)

Unnamed: 0,title,processed_title
499996,s t bipartite graphs,bipartite graph
499997,the odinus mission concept the scientific case...,odinus mission concept scientific case mission...
499998,the simplified topological algorithms for acce...,simplified topological algorithm accelerating ...
499999,introducing quantified cuts in logic with equa...,introducing quantified cut logic equality


In [4]:
# Drop rows with empty cleaned titles
data = data[data['processed_title'] != '']

## Model setup

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# save sentence bert model
model.save('saved_model/sbert_model') 

In [None]:
# Generate embeddings for all cleaned titles
title_embeddings = model.encode(data['processed_title'].tolist(), convert_to_tensor=True)

In [None]:
# Save the title embeddings
torch.save(title_embeddings, 'saved_model/title_embeddings_v2.pt')

## User input

In [5]:
device = torch.device("cpu")
model = SentenceTransformer('saved_model/sbert_model', device=device)
title_embeddings = torch.load('saved_model/title_embeddings_v2.pt', map_location=device)

  title_embeddings = torch.load('saved_model/title_embeddings_v2.pt', map_location=device)


In [6]:
def search_titles(user_query, top_k=5):
    """
    Searches for the most similar titles to the user's query.
    """
    # Check for invalid input
    if not user_query or not isinstance(user_query, str):
        return [("Invalid query. Please provide a valid string.", 0)]

    # Preprocess the query
    processed_query = preprocess_text(user_query)

    # Check if preprocessing results in an empty string
    if not processed_query:
        return [("The query doesn't contain meaningful content after preprocessing.", 0)]

    # Generate embedding for the query
    query_embedding = model.encode(processed_query, convert_to_tensor=True)

    # Compute similarity scores
    similarity_scores = util.cos_sim(query_embedding, title_embeddings)

    # Combine original titles, processed titles, and scores
    combined_results = list(zip(data['title'], data['processed_title'], similarity_scores.squeeze().tolist()))

    # Sort results based on similarity scores
    sorted_results = sorted(combined_results, key=lambda x: x[2], reverse=True)[:top_k]

    # Return only the original title and similarity score
    return [(original_title, score) for original_title, processed_title, score in sorted_results]


In [7]:
# Example queries
test_queries = [
    "Introduction to Artificial Intelligence",
]

# Test the function
for query in test_queries:
    print(f"Query: {query}")
    results = search_titles(query, top_k=5)
    for title, score in results:
        print(f"  Title: {title}, Similarity Score: {score:.4f}")
    print()

Query: Introduction to Artificial Intelligence
  Title: artificial intelligence in humans, Similarity Score: 0.6449
  Title: an introduction to automata, Similarity Score: 0.6250
  Title: second order swarm intelligence, Similarity Score: 0.5871
  Title: a definition of artificial intelligence, Similarity Score: 0.5780
  Title: swarm intelligence, Similarity Score: 0.5738

