In [39]:
import pandas as pd

In [40]:
data = pd.read_csv("../xtract-api/DataSet/arxiv_processed.csv")
data.head()

Unnamed: 0,id,title,abstract,authors,category_code,update_date,clean_title,clean_abstract,category
0,acc-phys/9607001,An Investigation of Stochastic Cooling in the ...,This report provides a description of unbunc...,O. Meincke,acc-phys physics.acc-ph,2008-02-03,an investigation of stochastic cooling in the ...,this report provides a description of unbunche...,"Accelerator Physics, Physics – Accelerator Phy..."
1,acc-phys/9601001,Particle Motion in the Stable Region Near an E...,This paper studies the particle motion when ...,G. Parzen (Brookhaven National Laboratory),acc-phys physics.acc-ph,2008-02-03,particle motion in the stable region near an e...,this paper studies the particle motion when th...,"Accelerator Physics, Physics – Accelerator Phy..."
2,acc-phys/9602001,Muon Colliders,Muon Colliders have unique technical and phy...,"R. B. Palmer(BNL), A. Sessler(LBNL), A. Skrins...",acc-phys physics.acc-ph,2012-08-29,muon colliders,muon colliders have unique technical and physi...,"Accelerator Physics, Physics – Accelerator Phy..."
3,adap-org/9306005,Prediction and Adaptation in an Evolving Chaot...,We describe the results of analytic calculat...,"Alfred H\""ubler and David Pines (Santa Fe Inst...",adap-org chao-dyn nlin.AO nlin.CD,2008-02-03,prediction and adaptation in an evolving chaot...,we describe the results of analytic calculatio...,"Adaptation, Noise, and Self-Organizing Systems..."
4,chao-dyn/9407001,Pattern Dynamics of a Coupled Map Lattice for ...,The pattern dynamics of the one-way coupled ...,Frederick H. Willeboordse (University of Tokyo...,chao-dyn adap-org nlin.AO nlin.CD nlin.PS patt...,2015-06-24,pattern dynamics of a coupled map lattice for ...,the pattern dynamics of the one way coupled lo...,"Adaptation, Noise, and Self-Organizing Systems..."


### Text Preprocessing (combine = Title + abstract)

In [41]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [42]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [43]:
import re
import numpy as np

In [44]:
def preprocess_text(text):
    """Enhanced text preprocessing"""
    if pd.isna(text):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

data['combined_text'] = (data['clean_title'] + ' ' + data['clean_abstract']).apply(preprocess_text)


### Text Vectorization

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF Vectorization
tfidf = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)
tfidf_matrix = tfidf.fit_transform(data['combined_text'])

In [46]:
# Category encoding (if you want to use categories)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
category_encoded = label_encoder.fit_transform(data['category'])

In [47]:
from scipy.sparse import hstack

# Combine TF-IDF with category features
category_matrix = np.array(category_encoded).reshape(-1, 1)
feature_matrix = hstack([tfidf_matrix, category_matrix])

print(f"Feature matrix shape: {feature_matrix.shape}")

Feature matrix shape: (194065, 10001)


### Compute similarity

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

# Create mapping between paper IDs and indices
paper_id_to_idx = {paper_id: idx for idx, paper_id in enumerate(data['id'])}
idx_to_paper_id = {idx: paper_id for paper_id, idx in paper_id_to_idx.items()}

# Convert to CSR format for efficient row indexing
feature_matrix_csr = feature_matrix.tocsr()

### Recommendation Function

In [49]:
def compute_similarity_for_index(paper_id, feature_matrix, top_k=10):
    """Compute similarity for a specific paper using paper_id"""
    # Convert paper_id to the correct type if needed
   
    
    if paper_id not in paper_id_to_idx:
        # Try alternative formats
        try:
            paper_id_float = float(paper_id)
            if paper_id_float in paper_id_to_idx:
                idx = paper_id_to_idx[paper_id_float]
            else:
                return None  # Paper not found
        except (ValueError, TypeError):
            return None  # Paper not found
    else:
        idx = paper_id_to_idx[paper_id]
    
    query_vector = feature_matrix[idx]
    similarities = cosine_similarity(query_vector, feature_matrix)
    return similarities.flatten()

def get_recommendations_on_the_fly(paper_id, top_k=10):
    """Get recommendations by computing similarity on-the-fly"""
    # Convert paper_id to string for consistency
    
    if paper_id not in paper_id_to_idx:
        # Try alternative formats
        try:
            paper_id_float = float(paper_id)
            if paper_id_float not in paper_id_to_idx:
                return f"Paper ID {paper_id} not found. Please check the paper ID."
            else:
                idx = paper_id_to_idx[paper_id_float]
        except (ValueError, TypeError):
            return f"Paper ID {paper_id} not found. Please check the paper ID."
    else:
        idx = paper_id_to_idx[paper_id]
    
    # Compute similarities only for this specific paper
    sim_scores = compute_similarity_for_index(paper_id, feature_matrix_csr)
    
    if sim_scores is None:
        return f"Paper ID {paper_id} not found. Please check the paper ID."
    
    # Create list of (index, score) pairs
    sim_scores = list(enumerate(sim_scores))
    
    # Sort by similarity score (descending)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top_k most similar papers (excluding the paper itself)
    sim_scores = sim_scores[1:top_k+1]
    
    # Extract indices and scores
    paper_indices = [i[0] for i in sim_scores]
    similarity_scores = [i[1] for i in sim_scores]
    
    # Return recommendations
    recommendations = data.iloc[paper_indices][['id', 'title', 'authors', 'category']].copy()
    recommendations['similarity_score'] = similarity_scores
    
    return recommendations

In [50]:
# Test the recommendation systems
print("\nOn-the-fly Recommendations for paper 704.0033:")
on_the_fly_recommendations = get_recommendations_on_the_fly('704.0033', top_k=5)
print(on_the_fly_recommendations)


On-the-fly Recommendations for paper 704.0033:
                      id                                              title  \
103673  cond-mat/0412039          Reply to Bernevig, Giuliano, and Laughlin   
148072        2409.15709                                     $R(5,5)\le 46$   
148957        1703.08768                                    $R(5,5) \le 48$   
189731           810.542  The effect of dipole-dipole interaction for tw...   
191129         1103.1442                  Two-photon dipole-dipole blockade   

                                                  authors  \
103673                  Martin Greiter and Dirk Schuricht   
148072            Vigleik Angeltveit and Brendan D. McKay   
148957            Vigleik Angeltveit and Brendan D. McKay   
189731                  Yang Li, Jiang Zhou, and Hong Guo   
191129  Khulud Almutairi, Ryszard Tanas, and Zbigniew ...   

                                                category  similarity_score  
103673  Condensed Matter – Strong