In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, ndcg_score
import pandas as pd

class PointwiseRankingModel:
    """Pointwise learning-to-rank baseline.

    Each document is scored independently: features are standardised with a
    ``StandardScaler`` and a ``LinearRegression`` is fitted to regress the
    relevance label. Ranking is obtained by sorting the predicted scores.
    """

    def __init__(self):
        # Regressor that maps scaled features to a relevance score.
        self.model = LinearRegression()
        # Scaler whose statistics are learned in ``prepare_data``.
        self.scaler = StandardScaler()

    def prepare_data(self, features):
        """
        Fit the scaler on ``features`` and return the standardised matrix.

        Note that this (re)fits the scaler on every call, so it is meant for
        training data only; ``predict`` uses ``transform`` instead.

        Parameters:
        - features: Document features matrix (n_samples, n_features)

        Returns:
        - Scaled features matrix
        """
        scaled = self.scaler.fit_transform(features)
        return scaled

    def fit(self, features, relevance_scores):
        """
        Learn scaling statistics and regression weights from training data.

        Parameters:
        - features: Document features matrix (n_samples, n_features)
        - relevance_scores: Target relevance scores (n_samples,)
        """
        self.model.fit(self.prepare_data(features), relevance_scores)

    def predict(self, features):
        """
        Score documents with the fitted scaler and regressor.

        Parameters:
        - features: Document features matrix (n_samples, n_features)

        Returns:
        - Predicted relevance score for each row of ``features``
        """
        return self.model.predict(self.scaler.transform(features))

    def rank_documents(self, features, return_scores=False):
        """
        Order documents from highest to lowest predicted relevance.

        Parameters:
        - features: Document features matrix
        - return_scores: If True, return both rankings and scores

        Returns:
        - Array of row indices, best document first; when
          ``return_scores`` is True, a ``(rankings, scores)`` tuple where
          ``scores`` is in the original row order.
        """
        scores = self.predict(features)
        # Negating the scores turns argsort's ascending order into descending.
        rankings = np.argsort(-scores)
        return (rankings, scores) if return_scores else rankings

def evaluate_ranking(y_true, y_pred, k=None):
    """
    Compute NDCG for a single ranked list.

    Parameters:
    - y_true: Ground-truth relevance of each document
    - y_pred: Predicted score of each document (same order as ``y_true``)
    - k: Optional rank cutoff; when None the ``k`` argument is not passed
      to ``ndcg_score`` at all

    Returns:
    - Whatever ``ndcg_score`` returns for the wrapped inputs
    """
    # Wrap both inputs in an outer list so they become one-row 2-D arrays,
    # i.e. a single query for sklearn's ndcg_score.
    true_2d = np.array([y_true])
    pred_2d = np.array([y_pred])

    # Forward the cutoff only when the caller supplied one.
    cutoff = {} if k is None else {"k": k}
    return ndcg_score(true_2d, pred_2d, **cutoff)

# Example usage
if __name__ == "__main__":
    # ---- Synthetic data ---------------------------------------------------
    # Seed the global NumPy RNG so every draw below is reproducible.
    np.random.seed(42)
    n_samples, n_features = 1000, 5

    # Stand-ins for real ranking signals (e.g., text length, freshness,
    # popularity, etc.), drawn from a standard normal distribution.
    features = np.random.randn(n_samples, n_features)

    # Relevance = linear combination of the features + Gaussian noise,
    # clipped to [0, 4] (so every negative raw score becomes 0).
    true_weights = np.array([0.6, 0.3, 0.2, 0.1, -0.1])
    relevance_scores = np.clip(
        np.dot(features, true_weights) + np.random.normal(0, 0.1, n_samples),
        0,
        4,
    )

    # ---- Train / evaluate -------------------------------------------------
    # Hold out 20% of the documents for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        features, relevance_scores, test_size=0.2, random_state=42
    )

    ranker = PointwiseRankingModel()
    ranker.fit(X_train, y_train)

    y_pred = ranker.predict(X_test)

    # NDCG at two cutoffs and over the whole held-out list (k=None).
    ndcg_5, ndcg_10, ndcg_full = (
        evaluate_ranking(y_test, y_pred, k=cutoff) for cutoff in (5, 10, None)
    )

    print(f"NDCG@5: {ndcg_5:.3f}")
    print(f"NDCG@10: {ndcg_10:.3f}")
    print(f"NDCG (full): {ndcg_full:.3f}")

    # ---- Rank unseen documents --------------------------------------------
    new_documents = np.random.randn(5, n_features)  # 5 new documents
    rankings, scores = ranker.rank_documents(new_documents, return_scores=True)

    print("\nExample rankings for new documents:")
    # `rankings` lists document indices best-first; argsort of that
    # permutation gives each document's 0-based position, hence the +1.
    document_ranks = np.argsort(rankings) + 1
    results = pd.DataFrame({
        'Document_ID': range(len(rankings)),
        'Rank': document_ranks,
        'Predicted_Score': scores
    })
    print(results.sort_values('Rank'))

NDCG@5: 1.000
NDCG@10: 0.999
NDCG (full): 0.997

Example rankings for new documents:
   Document_ID  Rank  Predicted_Score
4            4     1         0.657692
1            1     2         0.438163
2            2     3         0.396746
3            3     4         0.350979
0            0     5        -0.229912


In [3]:
import numpy as np
from collections import Counter, defaultdict
import math
from typing import List, Dict
import re

class BM25:
    """Okapi BM25 scorer over an in-memory list of documents.

    Call ``fit`` once with the corpus, then ``get_score`` / ``get_top_n``
    to score queries against the indexed documents. ``fit`` may be called
    again; each call rebuilds all statistics from scratch.
    """

    def __init__(self, k1=1.5, b=0.75):
        """
        Initialize BM25 with hyperparameters.
        
        Args:
            k1: Term frequency saturation parameter (default: 1.5)
            b: Length normalization parameter (default: 0.75)
        """
        self.k1 = k1
        self.b = b
        self._reset()

    def _reset(self):
        """Clear every corpus-derived statistic (used by __init__ and fit)."""
        self.doc_freqs = defaultdict(int)  # df values for each term
        self.idf = defaultdict(float)      # idf values for each term
        self.doc_lens = []                 # length of each document
        self.avgdl = 0                     # average document length
        self.total_docs = 0                # total number of documents
        self.doc_terms = []                # term -> tf Counter per document

    def preprocess(self, text: str) -> List[str]:
        """
        Preprocess text by converting to lowercase and splitting into terms.
        
        Args:
            text: Input text string
        
        Returns:
            List of preprocessed terms (maximal runs of ``\\w`` characters)
        """
        return re.findall(r'\w+', text.lower())

    def fit(self, documents: List[str]):
        """
        Fit BM25 parameters on a collection of documents.

        Any statistics from a previous ``fit`` are discarded first, so the
        index always reflects exactly the documents passed here. An empty
        collection is accepted and leaves ``avgdl`` at 0.
        
        Args:
            documents: List of document strings
        """
        documents = list(documents)
        # Start from a clean slate: without this a second fit() would append
        # to doc_lens/doc_terms and double-count doc_freqs.
        self._reset()
        self.total_docs = len(documents)

        for document in documents:
            terms = self.preprocess(document)
            self.doc_lens.append(len(terms))

            term_freqs = Counter(terms)
            self.doc_terms.append(term_freqs)

            # The Counter's keys are the document's unique terms, so each
            # term contributes at most 1 to its document frequency.
            for term in term_freqs:
                self.doc_freqs[term] += 1

        # Guard the division so an empty corpus does not raise.
        if self.total_docs:
            self.avgdl = sum(self.doc_lens) / self.total_docs

        for term, df in self.doc_freqs.items():
            self.idf[term] = math.log(
                (self.total_docs - df + 0.5) / (df + 0.5) + 1.0
            )

    def get_score(self, query: str, doc_idx: int) -> float:
        """
        Calculate BM25 score for a query and document.

        Repeated query terms are counted once per occurrence. Terms that do
        not occur in the document contribute nothing.
        
        Args:
            query: Query string
            doc_idx: Index of the document (position in the fitted list)
        
        Returns:
            BM25 score

        Raises:
            IndexError: If ``doc_idx`` is outside the fitted corpus.
        """
        doc_terms = self.doc_terms[doc_idx]
        doc_len = self.doc_lens[doc_idx]

        # An empty document cannot match any term. Returning early also
        # keeps the division by avgdl below safe (doc_len > 0 => avgdl > 0).
        if doc_len == 0:
            return 0.0

        # Length normalisation depends only on the document, not the term.
        length_norm = self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)

        score = 0.0
        for term in self.preprocess(query):
            tf = doc_terms.get(term, 0)
            # tf == 0 contributes exactly 0; skipping it also avoids 0/0
            # when k1 == 0 makes the denominator vanish.
            if tf == 0 or term not in self.idf:
                continue

            numerator = tf * (self.k1 + 1)
            denominator = tf + length_norm
            score += self.idf[term] * (numerator / denominator)

        return score

    def get_top_n(self, query: str, documents: List[str], n: int = 5) -> List[tuple]:
        """
        Get top N documents for a query.
        
        Args:
            query: Query string
            documents: Unused; kept for interface compatibility. Scoring
                always runs over the corpus passed to ``fit``.
            n: Number of top documents to return (values <= 0 yield [])
        
        Returns:
            List of (document_index, score) tuples, highest score first;
            ties keep ascending document index.
        """
        if n <= 0:
            return []

        scores = [
            (idx, self.get_score(query, idx)) for idx in range(self.total_docs)
        ]

        # sorted() is stable, so equal scores stay in index order.
        return sorted(scores, key=lambda x: x[1], reverse=True)[:n]

# Example usage
if __name__ == "__main__":
    # Toy corpus with deliberately overlapping vocabulary.
    documents = [
        "The quick brown fox jumps over the lazy dog",
        "Quick brown foxes jump over lazy dogs",
        "The lazy dog sleeps in the sun",
        "A quick brown dog runs in the park",
        "Foxes and dogs play in the garden"
    ]

    # Build the BM25 statistics for the corpus.
    bm25 = BM25(k1=1.5, b=0.75)
    bm25.fit(documents)

    # Queries to run against the fitted index.
    queries = [
        "quick brown fox",
        "lazy dog sleeps",
        "foxes and dogs"
    ]

    # Show the three best-scoring documents per query.
    print("Top matching documents for queries:\n")
    for query in queries:
        print(f"Query: '{query}'")
        top_docs = bm25.get_top_n(query, documents, n=3)
        for hit in top_docs:
            doc_id, score = hit
            print(f"Document {doc_id}: '{documents[doc_id]}' (Score: {score:.3f})")
        print()

Top matching documents for queries:

Query: 'quick brown fox'
Document 0: 'The quick brown fox jumps over the lazy dog' (Score: 2.276)
Document 1: 'Quick brown foxes jump over lazy dogs' (Score: 1.118)
Document 3: 'A quick brown dog runs in the park' (Score: 1.053)

Query: 'lazy dog sleeps'
Document 2: 'The lazy dog sleeps in the sun' (Score: 2.555)
Document 0: 'The quick brown fox jumps over the lazy dog' (Score: 0.995)
Document 1: 'Quick brown foxes jump over lazy dogs' (Score: 0.559)

Query: 'foxes and dogs'
Document 4: 'Foxes and dogs play in the garden' (Score: 3.253)
Document 1: 'Quick brown foxes jump over lazy dogs' (Score: 1.815)
Document 0: 'The quick brown fox jumps over the lazy dog' (Score: 0.000)



In [None]:
import numpy as np
from collections import Counter
from sklearn.preprocessing import normalize
from typing import List, Dict
import re
from scipy.spatial.distance import cosine


class TFIDFVectorizer:
    """Minimal dense TF-IDF vectorizer.

    ``fit`` learns a vocabulary (terms ordered by descending document
    frequency, ties broken by first appearance in the corpus) and, when
    ``use_idf`` is set, an IDF weight ``log(n_docs / df + 1)`` per term.
    ``transform`` produces a dense document-term matrix of raw term counts,
    optionally IDF-weighted and row-normalised.
    """

    def __init__(self, use_idf=True, norm="l2"):
        """
        Initialize TF-IDF Vectorizer

        Args:
            use_idf: Whether to use IDF weighting
            norm: Normalization method ('l1', 'l2', or None)
        """
        self.vocabulary = {}      # term -> column index
        self.doc_freq = None      # term -> document frequency (set by fit)
        self.use_idf = use_idf
        self.norm = norm
        self.n_docs = 0
        self.idf = None           # per-column IDF weights (set by fit)

    def preprocess(self, text: str) -> List[str]:
        """Lowercase ``text`` and return its ``\\w+`` tokens in order."""
        return re.findall(r"\w+", text.lower())

    def fit(self, documents: List[str]):
        """
        Fit vectorizer to a collection of documents

        Args:
            documents: List of document strings
        """
        term_doc_freq = Counter()
        for doc in documents:
            # De-duplicate while keeping first-seen order. Iterating a set
            # here would make the Counter's insertion order, and hence the
            # column order of equally frequent terms, depend on string hash
            # randomisation and change between interpreter runs.
            unique_terms = list(dict.fromkeys(self.preprocess(doc)))
            term_doc_freq.update(unique_terms)

        # most_common() sorts by count and keeps insertion order on ties,
        # so the vocabulary is now fully deterministic.
        self.vocabulary = {
            term: idx for idx, (term, _) in enumerate(term_doc_freq.most_common())
        }
        self.doc_freq = dict(term_doc_freq)

        # Calculate IDF
        self.n_docs = len(documents)
        if self.use_idf:
            self.idf = np.zeros(len(self.vocabulary))
            for term, idx in self.vocabulary.items():
                self.idf[idx] = np.log(self.n_docs / term_doc_freq[term] + 1)

    def transform(self, documents: List[str]) -> np.ndarray:
        """
        Transform documents to TF-IDF matrix

        Terms that are not in the fitted vocabulary are ignored, so a
        document made only of unseen terms becomes an all-zero row.

        Args:
            documents: List of document strings

        Returns:
            Document-term matrix with TF-IDF weights, shape
            (len(documents), len(vocabulary))
        """
        X = np.zeros((len(documents), len(self.vocabulary)))

        # Fill in raw term counts.
        for doc_idx, doc in enumerate(documents):
            for term, freq in Counter(self.preprocess(doc)).items():
                term_idx = self.vocabulary.get(term)
                if term_idx is not None:
                    X[doc_idx, term_idx] = freq

        # Apply IDF weights (broadcast across rows).
        if self.use_idf:
            X = X * self.idf

        # Row-normalise; skip empty matrices, which have nothing to scale.
        if self.norm and X.size:
            X = normalize(X, norm=self.norm, axis=1)

        return X

    def fit_transform(self, documents: List[str]) -> np.ndarray:
        """Convenience method to fit and transform in one step"""
        self.fit(documents)
        return self.transform(documents)

    def get_feature_names(self) -> List[str]:
        """Return the vocabulary terms ordered by their column index"""
        return sorted(self.vocabulary, key=self.vocabulary.get)


class VectorSpaceModel:
    """Cosine-similarity retrieval over TF-IDF document vectors."""

    def __init__(self, use_idf=True, norm="l2"):
        """
        Initialize Vector Space Model

        Args:
            use_idf: Whether to use IDF weighting
            norm: Normalization method ('l1', 'l2', or None)
        """
        self.vectorizer = TFIDFVectorizer(use_idf=use_idf, norm=norm)
        self.document_vectors = None  # set by fit(): one row per document
        self.documents = None         # set by fit(): the original strings

    def fit(self, documents: List[str]):
        """
        Fit model to documents

        Args:
            documents: List of document strings
        """
        self.documents = documents
        self.document_vectors = self.vectorizer.fit_transform(documents)

    @staticmethod
    def _cosine_similarity(u, v) -> float:
        """Cosine similarity of two 1-D vectors; 0.0 if either is all zeros.

        The cosine of a zero vector is undefined (0/0) and comes back from
        scipy as NaN, which would make sorting by score unreliable. A vector
        with no non-zero entry shares no term with anything, so it is
        treated as similarity 0.0.
        """
        if not np.any(u) or not np.any(v):
            return 0.0
        return 1 - cosine(u, v)

    def search(self, query: str, top_k: int = 5) -> List[tuple]:
        """
        Search for documents similar to query

        A query containing no vocabulary term scores 0.0 against every
        document, so the result is the first ``top_k`` documents in index
        order.

        Args:
            query: Query string
            top_k: Number of top results to return

        Returns:
            List of (document_index, similarity_score) tuples, best first
        """
        # transform() returns a 1-row matrix; take that row as the query vector.
        query_vector = self.vectorizer.transform([query])[0]

        similarities = [
            (doc_idx, self._cosine_similarity(query_vector, self.document_vectors[doc_idx]))
            for doc_idx in range(len(self.documents))
        ]

        # Stable sort: equal scores keep ascending document index.
        return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_k]

    def document_similarity(self, doc_id1: int, doc_id2: int) -> float:
        """
        Calculate similarity between two documents

        Args:
            doc_id1: Index of first document
            doc_id2: Index of second document

        Returns:
            Cosine similarity between documents (0.0 if either document
            vector is all zeros)
        """
        return self._cosine_similarity(
            self.document_vectors[doc_id1], self.document_vectors[doc_id2]
        )


# Example usage
if __name__ == "__main__":
    # Toy corpus with deliberately overlapping vocabulary.
    documents = [
        "The quick brown fox jumps over the lazy dog",
        "Quick brown foxes jump over lazy dogs",
        "The lazy dog sleeps in the sun",
        "A quick brown dog runs in the park",
        "Foxes and dogs play in the garden",
    ]

    # Build L2-normalised TF-IDF vectors for the corpus.
    vsm = VectorSpaceModel(use_idf=True, norm="l2")
    vsm.fit(documents)

    # Example 1: rank the corpus against a free-text query.
    print("Search Results:")
    query = "quick brown fox"
    results = vsm.search(query, top_k=3)
    for hit in results:
        doc_idx, score = hit
        print(f"Document {doc_idx}: '{documents[doc_idx]}' (Score: {score:.3f})")

    # Example 2: similarity of each document to the one that follows it.
    print("\nDocument Similarities:")
    last_pair_start = len(documents) - 1
    i = 0
    while i < last_pair_start:
        sim = vsm.document_similarity(i, i + 1)
        print(f"Similarity between doc {i} and {i+1}: {sim:.3f}")
        i += 1

    # Example 3: weights of the first five vocabulary columns per document.
    print("\nDocument Vectors (first 5 terms):")
    feature_names = vsm.vectorizer.get_feature_names()[:5]
    for i, doc in enumerate(documents):
        vector = vsm.document_vectors[i, :5]
        term_weights = dict(zip(feature_names, vector))
        print(f"Doc {i}: {term_weights}")

Search Results:
Document 0: 'The quick brown fox jumps over the lazy dog' (Score: 0.596)
Document 1: 'Quick brown foxes jump over lazy dogs' (Score: 0.258)
Document 3: 'A quick brown dog runs in the park' (Score: 0.226)

Document Similarities:
Similarity between doc 0 and 1: 0.356
Similarity between doc 1 and 2: 0.085
Similarity between doc 2 and 3: 0.249
Similarity between doc 3 and 4: 0.114

Document Vectors (first 5 terms):
Doc 0: {'the': 0.4263810613259306, 'brown': 0.2578563540460651, 'lazy': 0.2578563540460651, 'dog': 0.2578563540460651, 'quick': 0.2578563540460651}
Doc 1: {'the': 0.0, 'brown': 0.29839156796488747, 'lazy': 0.29839156796488747, 'dog': 0.0, 'quick': 0.29839156796488747}
Doc 2: {'the': 0.46941847923824787, 'brown': 0.0, 'lazy': 0.28388347550384463, 'dog': 0.28388347550384463, 'quick': 0.0}
Doc 3: {'the': 0.21567813286992524, 'brown': 0.26086513700377395, 'lazy': 0.0, 'dog': 0.26086513700377395, 'quick': 0.26086513700377395}
Doc 4: {'the': 0.21377556692631727, 'brown