# Import thư viện cần thiết

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.metrics import pairwise_distances
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.utils import resample
import time
import os

# Implement Gaussian Random Projection

In [2]:
def apply_gaussian_projection(X, target_dim=100, subsample_size=None, term_weighting=None, pair_indices=(0, 100), to_dense_after_projection=True):
    """
    Apply a Gaussian random projection to X and measure the distortion of one pairwise distance.

    The relative error is evaluated on a single pair of rows only, which is much
    cheaper than computing the full pairwise distance matrix.

    Parameters:
    - X: Input data matrix (n_samples x n_features); NumPy array or SciPy sparse matrix.
    - target_dim: Reduced dimension (k).
    - subsample_size: If not None and smaller than n_samples, randomly subsample
      (without replacement, fixed seed 42) down to this many rows first.
    - term_weighting: 'sqrt' or 'log' to apply element-wise sqrt(x) / log(1 + x)
      weighting before projecting; any other value (e.g. None) leaves X unchanged.
    - pair_indices: Tuple (i, j) of row indices whose Euclidean distance is compared
      before and after projection. Indices refer to rows *after* subsampling.
    - to_dense_after_projection: If True, convert the projected data to a dense
      array when the projector returned a sparse matrix.

    Returns:
    - dict with 'projected_data', 'time_taken' (seconds spent in fit_transform)
      and 'relative_error' (|d_proj - d_orig| / d_orig on the pair, 0.0 if d_orig == 0).

    Raises:
    - IndexError: if either entry of pair_indices is out of range for the
      (possibly subsampled) data.
    """
    if subsample_size is not None and subsample_size < X.shape[0]:
        # replace=False: a subsample must not contain duplicated rows (the sklearn
        # default bootstraps with replacement, which can make the pair identical).
        X = resample(X, n_samples=subsample_size, replace=False, random_state=42)

    i, j = pair_indices
    n_samples = X.shape[0]
    if not (-n_samples <= i < n_samples and -n_samples <= j < n_samples):
        raise IndexError(
            f"pair_indices={pair_indices!r} is out of range for data with {n_samples} samples"
        )

    # Apply term weighting if requested (sparse input stays sparse).
    if term_weighting == 'sqrt':
        X = X.power(0.5) if hasattr(X, 'power') else np.sqrt(X)
    elif term_weighting == 'log':
        X = X.log1p() if hasattr(X, 'log1p') else np.log1p(X)

    def _pair_difference(matrix):
        # Return row i minus row j as a dense 1-D vector. Only the two rows are
        # densified, never the whole matrix, so memory stays O(n_features).
        if hasattr(matrix, 'tocsr'):
            rows = matrix.tocsr()  # row indexing is not available on every sparse format
            return (rows[i] - rows[j]).toarray().ravel()
        return np.asarray(matrix[i] - matrix[j]).ravel()

    # Original distance on the pair.
    dist_original = np.linalg.norm(_pair_difference(X))

    # Gaussian projection; only fit_transform is timed.
    grp = GaussianRandomProjection(n_components=target_dim, random_state=42)
    start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals
    X_projected = grp.fit_transform(X)
    time_taken = time.perf_counter() - start_time

    # Convert to dense if the projector handed back a sparse result.
    if to_dense_after_projection and hasattr(X_projected, 'toarray'):
        X_projected = X_projected.toarray()

    # Projected distance on the same pair.
    dist_projected = np.linalg.norm(_pair_difference(X_projected))
    error = np.abs(dist_projected - dist_original) / dist_original if dist_original > 0 else 0.0

    return {
        'projected_data': X_projected,
        'time_taken': time_taken,
        'relative_error': error
    }

# Implement Sparse Random Projection

In [3]:
def apply_sparse_projection(X, target_dim=100, density=None, subsample_size=None, term_weighting=None, pair_indices=(0, 100), to_dense_after_projection=True):
    """
    Apply a sparse random projection to X and measure the distortion of one pairwise distance.

    The relative error is evaluated on a single pair of rows only, which is much
    cheaper than computing the full pairwise distance matrix.

    Parameters:
    - X: Input data matrix (n_samples x n_features); NumPy array or SciPy sparse matrix.
    - target_dim: Reduced dimension (k).
    - density: Density of the random projection matrix; if None, 1/sqrt(n_features)
      is used ("very sparse" random projection).
    - subsample_size: If not None and smaller than n_samples, randomly subsample
      (without replacement, fixed seed 42) down to this many rows first.
    - term_weighting: 'sqrt' or 'log' to apply element-wise sqrt(x) / log(1 + x)
      weighting before projecting; any other value (e.g. None) leaves X unchanged.
    - pair_indices: Tuple (i, j) of row indices whose Euclidean distance is compared
      before and after projection. Indices refer to rows *after* subsampling.
    - to_dense_after_projection: If True, convert the projected data to a dense
      array when the projector returned a sparse matrix. When False the returned
      'projected_data' may stay sparse; the error is still computed correctly.

    Returns:
    - dict with 'projected_data', 'time_taken' (seconds spent in fit_transform),
      'relative_error' (|d_proj - d_orig| / d_orig on the pair, 0.0 if d_orig == 0)
      and 'density_used'.

    Raises:
    - IndexError: if either entry of pair_indices is out of range for the
      (possibly subsampled) data.
    """
    if subsample_size is not None and subsample_size < X.shape[0]:
        # replace=False: a subsample must not contain duplicated rows (the sklearn
        # default bootstraps with replacement, which can make the pair identical).
        X = resample(X, n_samples=subsample_size, replace=False, random_state=42)

    i, j = pair_indices
    n_samples = X.shape[0]
    if not (-n_samples <= i < n_samples and -n_samples <= j < n_samples):
        raise IndexError(
            f"pair_indices={pair_indices!r} is out of range for data with {n_samples} samples"
        )

    if density is None:
        # Computed explicitly (rather than left to the projector) so it can be reported.
        density = 1 / np.sqrt(X.shape[1])

    # Apply term weighting if requested (sparse input stays sparse).
    if term_weighting == 'sqrt':
        X = X.power(0.5) if hasattr(X, 'power') else np.sqrt(X)
    elif term_weighting == 'log':
        X = X.log1p() if hasattr(X, 'log1p') else np.log1p(X)

    def _pair_difference(matrix):
        # Return row i minus row j as a dense 1-D vector. Only the two rows are
        # densified, never the whole matrix, so memory stays O(n_features) and a
        # sparse projection result can still be fed to np.linalg.norm.
        if hasattr(matrix, 'tocsr'):
            rows = matrix.tocsr()  # row indexing is not available on every sparse format
            return (rows[i] - rows[j]).toarray().ravel()
        return np.asarray(matrix[i] - matrix[j]).ravel()

    # Original distance on the pair.
    dist_original = np.linalg.norm(_pair_difference(X))

    # Sparse projection; only fit_transform is timed.
    srp = SparseRandomProjection(n_components=target_dim, density=density, random_state=42)
    start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals
    X_projected = srp.fit_transform(X)
    time_taken = time.perf_counter() - start_time

    # Convert to dense if requested and the projector handed back a sparse result.
    if to_dense_after_projection and hasattr(X_projected, 'toarray'):
        X_projected = X_projected.toarray()

    # Projected distance on the same pair (works for dense and sparse results).
    dist_projected = np.linalg.norm(_pair_difference(X_projected))
    error = np.abs(dist_projected - dist_original) / dist_original if dist_original > 0 else 0.0

    return {
        'projected_data': X_projected,
        'time_taken': time_taken,
        'relative_error': error,
        'density_used': density
    }

# Output

In [4]:
# Create the 'results' directory up front; exist_ok=True makes re-running the cell harmless.
os.makedirs('results', exist_ok=True)

# Load data
data = fetch_20newsgroups_vectorized(subset='train')
X = data.data  # sparse
# NOTE(review): resample() is called without replace=..., so scikit-learn's default
# applies (documented as sampling with replacement) and rows may repeat — confirm intended.
X = resample(X, n_samples=1000, random_state=42)  # small, to keep the run fast

# Smoke-test the fixed functions; both use their default pair_indices=(0, 100).
res_g = apply_gaussian_projection(X, target_dim=300, term_weighting='sqrt')
print(f"Gaussian: Time={res_g['time_taken']:.4f}s, Error={res_g['relative_error']:.4f}")

res_s = apply_sparse_projection(X, target_dim=300, term_weighting='sqrt')
print(f"Sparse: Time={res_s['time_taken']:.4f}s, Error={res_s['relative_error']:.4f}, Density={res_s['density_used']:.4f}")

Gaussian: Time=1.3510s, Error=0.0112
Sparse: Time=0.3091s, Error=0.0627, Density=0.0028
