In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds
import pandas as pd

class LSIModel:
    def __init__(self, n_components=100):
        """
        Initialize the LSI model.
        n_components: Number of dimensions to keep in the SVD
        """
        self.n_components = n_components
        self.vectorizer = TfidfVectorizer(stop_words='english', 
                                        max_features=10000)
        
    def fit(self, documents):
        """
        Fit the LSI model to a collection of documents.
        """
        # Create term-document matrix with TF-IDF weighting
        print("Creating term-document matrix...")
        X = self.vectorizer.fit_transform(documents)
        print(f"Term-document matrix shape: {X.shape}")
        print(f"Sample of term-document matrix (first 3x3):\n{X[:3,:3].toarray()}")
        
        # Perform truncated SVD
        print("Performing SVD...")
        # Note: We transpose X to get the correct orientation
        U, Sigma, Vt = svds(X.T, k=self.n_components)
        
        print(f"U matrix shape: {U.shape}")
        print(f"Sigma diagonal values: {Sigma}")
        print(f"Vt matrix shape: {Vt.shape}")
        
        # Visualize a small portion of the matrices
        print(f"Sample of U matrix (first 3x3):\n{U[:3,:3]}")
        print(f"Sample of Vt matrix (first 3x3):\n{Vt[:3,:3]}")
        
        # Store the LSI components
        self.U = U
        self.Sigma = Sigma
        self.Vt = Vt
        self.feature_names = self.vectorizer.get_feature_names_out()
        
        # Calculate document vectors in reduced space
        self.doc_vectors = (Vt.T * Sigma)
        print(f"Document vectors shape: {self.doc_vectors.shape}")
        print(f"Sample of document vectors (first 3x3):\n{self.doc_vectors[:3,:3]}")
        
        return self
    

    def transform_query(self, query):
        """
        Transform a text query into the LSI space.
        """
        # Convert query to TF-IDF vector
        query_vec = self.vectorizer.transform([query])
        
        # For debugging, let's print the shapes
        print(f"Query vector shape: {query_vec.shape}")
        print(f"Non-zero elements in query vector: {query_vec.nnz}")
        print(f"Query vector as array (truncated):\n{query_vec.toarray()[0][:10]}")
        print(f"U shape: {self.U.shape}")
        print(f"Sigma shape: {self.Sigma.shape}")
        
        # Project into LSI space
        # Note: U needs to be transposed for correct multiplication
        query_lsi = query_vec.toarray() @ self.U @ np.diag(1/self.Sigma)
        
        print(f"Query LSI vector shape: {query_lsi.shape}")
        print(f"Query LSI vector (truncated):\n{query_lsi[0][:5]}")
        
        return query_lsi

    
    def find_similar_books(self, query, n=5):
        """
        Find the n most similar books to a query.
        
        query: Text string representing the search query
        n: Number of similar books to return
        """
        # Transform query to LSI space
        query_lsi = self.transform_query(query)
        
        # Calculate cosine similarities
        similarities = np.dot(query_lsi, self.doc_vectors.T)
        print(f"Similarities shape: {similarities.shape}")
        print(f"All similarities: {similarities.flatten()}")
        
        # Get top n similar documents
        top_indices = np.argsort(similarities.flatten())[-n:][::-1]
        top_similarities = similarities.flatten()[top_indices]
        
        print(f"Top {n} indices: {top_indices}")
        print(f"Top {n} similarities: {top_similarities}")
        
        return top_indices, top_similarities

# Example usage:
if __name__ == "__main__":
    # Sample book data (in practice, this would be your actual book dataset)
    books = [
        "The art of computer programming algorithms and data structures",
        "Learning python programming for beginners",
        "Introduction to machine learning and artificial intelligence",
        "The great gatsby a novel of American literature",
        "Pride and prejudice a classic romance novel"
    ]
    
    print(f"Number of documents: {len(books)}")
    print("Sample documents:")
    for i, book in enumerate(books):
        print(f"  {i}: {book}")
    
    # Create and fit the LSI model
    lsi = LSIModel(n_components=2)
    lsi.fit(books)
    
    # Example query
    query = "idf idf"
    print(f"\nProcessing query: '{query}'")
    indices, similarities = lsi.find_similar_books(query)
    
    # Print results
    print("\nQuery:", query)
    print("\nMost similar books:")
    for idx, sim in zip(indices, similarities):
        print(f"Similarity: {sim:.4f} - Book: {books[idx]}")

Number of documents: 5
Sample documents:
  0: The art of computer programming algorithms and data structures
  1: Learning python programming for beginners
  2: Introduction to machine learning and artificial intelligence
  3: The great gatsby a novel of American literature
  4: Pride and prejudice a classic romance novel
Creating term-document matrix...
Term-document matrix shape: (5, 22)
Sample of term-document matrix (first 3x3):
[[0.42066906 0.         0.42066906]
 [0.         0.         0.        ]
 [0.         0.         0.        ]]
Performing SVD...
U matrix shape: (22, 2)
Sigma diagonal values: [1.06768646 1.10646849]
Vt matrix shape: (2, 5)
Sample of U matrix (first 3x3):
[[ 0.00000000e+00  1.80633601e-01]
 [-3.07094483e-01 -3.88510644e-16]
 [-8.74856979e-17  1.80633601e-01]]
Sample of Vt matrix (first 3x3):
[[-2.22044605e-16 -8.83772253e-16 -9.06507549e-16]
 [ 4.75113115e-01  7.07106781e-01  5.23705574e-01]]
Document vectors shape: (5, 2)
Sample of document vectors (first 3x

# LSI Academic Search Engine

A modular academic paper search engine using Latent Semantic Indexing (LSI) with multiple relevance-boosting enhancements. This system efficiently processes large academic paper datasets (up to 160GB) on standard hardware.

## Features

- **Field-Weighted LSI**: Enhanced document representation with field-specific weights
- **Keyword Extraction**: Automatic keyword identification using BERT-based models
- **Temporal Relevance**: Configurable recency preference for search results
- **Metadata Filtering**: Support for field-specific queries (author, year, journal)
- **Query Expansion**: Automatic enhancement of queries with relevant terms
- **Memory Efficiency**: Designed to handle large datasets on consumer hardware
- **GPU Acceleration**: Optimized for systems with GPU support

## System Architecture

![System Architecture](https://i.imgur.com/PlwWtqI.png)

### 1. Document Processor

The Document Processor handles the extraction and normalization of academic papers from JSON format.

**Key Features:**
- JSON document parsing with robust error handling
- Text normalization and tokenization
- Field extraction (title, abstract, full text)
- Efficient batch processing for large datasets
- Memory-efficient streaming interface

**Code Example:**
```python
processor = DocumentProcessor(data_dir="./data", batch_size=1000)
for batch in processor.batch_document_generator():
    # Process each batch
    print(f"Processing batch of {len(batch)} documents")
```

### 2. Indexing Engine

The Indexing Engine creates the LSI representation of documents using field-weighted TF-IDF matrices and dimensionality reduction.

**Key Features:**
- Field-weighted TF-IDF matrices (title: 3.0×, abstract: 1.5×, body: 1.0×)
- Truncated SVD for dimensionality reduction (150 dimensions)
- Memory-mapped vector storage using HDF5
- Incremental indexing capability
- Efficient sparse matrix operations

**Code Example:**
```python
engine = IndexingEngine(
    index_dir="./index",
    n_components=150,
    field_weights={'title': 3.0, 'abstract': 1.5, 'full_text': 1.0}
)
doc_vectors, term_vectors = engine.fit_transform(documents)
```

### 3. Enhancement Modules

#### 3.1 KeyBERT Keyword Extraction

Extracts key phrases from documents using BERT-based semantic representations.

**Key Features:**
- Uses lightweight BERT models optimized for keyword extraction
- Extracts multi-word phrases (1-3 words)
- Configurable number of keywords per document
- GPU-accelerated with memory optimization
- Provides relevance boosting based on keyword matching

**Code Example:**
```python
extractor = KeywordExtractor(
    index_dir="./index",
    model_name="all-MiniLM-L6-v2",  # Small but effective model
    top_n=5
)
keywords = extractor.extract_keywords(documents)
```

#### 3.2 Temporal Relevance Adjuster

Adjusts document relevance based on publication year with configurable recency preference.

**Key Features:**
- User-controllable recency preference (0.0 to 1.0)
- Normalized age-based boosting
- Pre-computed year indexing
- Configurable maximum age factor
- Year range filtering support

**Code Example:**
```python
adjuster = TemporalRelevanceAdjuster(
    index_dir="./index",
    default_recency_preference=0.3
)
results = adjuster.apply_temporal_boost(results, recency_preference=0.5)
```

### 4. Query Processor

The Query Processor handles search requests, combining LSI similarity with enhancement factors.

**Key Features:**
- Query normalization and field filter extraction
- Projection of queries into LSI space
- Cosine similarity calculation
- Integration of enhancement factors
- Field-specific filtering (year, author, journal)
- Query expansion capability

**Code Example:**
```python
processor = QueryProcessor(
    indexing_engine=indexing_engine,
    keyword_extractor=keyword_extractor,
    temporal_adjuster=temporal_adjuster
)
results = processor.search(
    query="neural networks author:\"Smith\" year:\"2015-2022\"",
    recency_preference=0.3,
    use_query_expansion=True
)
```

## Installation

```bash
# Clone repository
git clone https://github.com/username/lsi-academic-search.git
cd lsi-academic-search

# Create virtual environment
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt
```

### Requirements

```
numpy>=1.20.0
scipy>=1.6.0
scikit-learn>=0.24.0
h5py>=3.1.0
keybert>=0.5.0
sentence-transformers>=2.0.0
torch>=1.8.0
tqdm>=4.60.0
joblib>=1.0.0
```

## Usage

### Creating Sample Data (for testing)

```bash
python main_demo.py --create_samples --num_samples 100
```

### Building the Index

```bash
python main_demo.py --data_dir ./data --index_dir ./index --build_index
```

### Running Searches

```bash
# Basic search
python main_demo.py --query "machine learning neural networks"

# With adjusted recency preference (0.0-1.0)
python main_demo.py --query "information retrieval" --recency 0.7

# With field filters
python main_demo.py --query "machine learning year:\"2018-2023\" author:\"Smith\""
```

## Configuration Options

The search engine can be configured through several parameters:

| Parameter | Description | Default |
|-----------|-------------|---------|
| `lsi_components` | Number of LSI dimensions | 150 |
| `field_weights` | Weights for document fields | `{'title': 3.0, 'abstract': 1.5, 'full_text': 1.0}` |
| `batch_size` | Documents per processing batch | 1000 |
| `keybert_model` | Model for keyword extraction | "all-MiniLM-L6-v2" |
| `recency_preference` | Default temporal boosting strength | 0.3 |
| `max_results` | Maximum results returned | 100 |

## Performance Considerations

- **Memory Usage**: The system is designed to work with 16GB RAM by processing documents in batches
- **Disk Space**: Index storage requires approximately 10-15% of the original dataset size
- **GPU Acceleration**: KeyBERT extraction is significantly faster with GPU support
- **Scalability**: Can handle datasets up to 160GB with appropriate batch sizing

## Future Extensions

- **Citation Network Analysis**: Integrate citation relationships for relevance boosting
- **User Feedback Integration**: Incorporate click data to improve ranking
- **Multilingual Support**: Add cross-language search capabilities
- **Faceted Search Interface**: Develop a web UI with interactive filters
- **Incremental Updates**: Support for adding new papers without full reindexing

## License

MIT

## Acknowledgments

This project incorporates ideas from several academic papers:
- Deerwester et al. (1990) "Indexing by Latent Semantic Analysis"
- Grootendorst (2020) "KeyBERT: Minimal keyword extraction with BERT"


# Academic Papers for LSI Search Engine Project



## Markdown Table Format

| Paper Title | Authors | Overview | Relevance to LSI Search Engine Project |
|-------------|---------|----------|---------------------------------------|
| Indexing by Latent Semantic Analysis | Scott Deerwester, Susan T. Dumais, George W. Furnas, Thomas K. Landauer, Richard Harshman | This seminal paper introduces Latent Semantic Indexing (LSI) as a technique to overcome the limitations of keyword matching in information retrieval. The authors describe how singular value decomposition (SVD) can be used to uncover latent semantic structure in term-document matrices, addressing problems of synonymy and polysemy in text retrieval. | This paper provides the theoretical foundation for the entire LSI academic search engine project. The core indexing engine described in the project directly implements the SVD-based dimensionality reduction (using 150 dimensions) on the term-document matrices, as outlined in this paper. The field-weighted approach in the project extends the basic LSI model. |
| An Introduction to Latent Semantic Analysis | Thomas K. Landauer, Peter W. Foltz, Darrell Laham | This paper explains the theoretical foundations and practical applications of LSA in a more accessible manner. It covers how LSA extracts and represents the contextual-usage meaning of words through statistical computations on large text corpora. | The paper's explanation of how LSA represents both terms and documents in the same semantic space directly informs the query processing module of the search engine. The project's implementation of cosine similarity for matching queries to documents is based on principles described here. |
| Using Latent Semantic Analysis to Improve Access to Textual Information | Susan T. Dumais, George W. Furnas, Thomas K. Landauer, Scott Deerwester | This paper demonstrates practical applications of LSI in information retrieval systems. It shows how LSI can overcome vocabulary mismatch problems between queries and documents, producing more accurate and comprehensive search results. | The system architecture of the LSI academic search engine, particularly the document processor and indexing engine components, draws heavily from the practical implementation guidance in this paper. The project's TF-IDF matrices preprocessing step before applying SVD follows the approach described here. |
| Using Linear Algebra for Intelligent Information Retrieval | Michael W. Berry, Susan T. Dumais, Gavin W. O'Brien | This paper provides a comprehensive mathematical treatment of LSI, focusing on the linear algebra aspects. It details SVD implementation, updating procedures for existing LSI databases, and applications of LSI in various contexts. | The paper's discussion of SVD updating techniques directly informs the incremental indexing capability of the project's indexing engine. The memory-efficient storage using HDF5 addresses some of the computational challenges described in this paper. |
| Self-supervised Contextual Keyword and Keyphrase Retrieval with Self-Labelling | Prafull Sharma, Yingbo Li | This paper presents a novel approach for keyword and keyphrase extraction using BERT-based models and contextual features. It introduces a self-supervised method that doesn't require manual labeling of training data. | This paper directly relates to the KeyBERT keyword extraction enhancement module in the search engine. The project implements BERT-based semantic representations for extracting key phrases from documents, which follows the approach outlined in this paper. |
| Document Length Normalization | Amit Singhal, Gerard Salton, Mandar Mitra, Chris Buckley | This paper addresses the issue of document length bias in retrieval systems. It introduces pivoted cosine normalization to account for the observation that longer documents tend to have a higher probability of relevance in certain collections. | The field-weighted TF-IDF matrices in the project's indexing engine (with title weighted 3.0×, abstract 1.5×, full text 1.0×) implement a form of document length normalization that aligns with the principles discussed in this paper. |
| Information Retrieval and the Semantic Web | D.B. Mirajkar, D.G. Chougule, K.K. Awale, S.B. Sagare | This paper discusses the intersection of information retrieval and semantic web technologies. It explores how traditional IR systems can be adapted to handle semantic web documents and annotations. | While the LSI search engine project doesn't explicitly incorporate semantic web technologies, the paper's discussion of enhancing retrieval with semantic information relates to the project's enhancement modules. |

Both formats can be easily copied and pasted into your preferred editing tool. The XML format can be imported into MS Word, and the markdown table can be copied directly into most text editors or word processors.