# Import thư viện cần thiết

In [1]:
import numpy as np
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.metrics import pairwise_distances
from sklearn.datasets import fetch_20newsgroups_vectorized
import time
import pandas as pd
from sklearn.utils import resample  # Để random subsample

# Implement Gaussian Random Projection

In [2]:
def apply_gaussian_projection(X, target_dim=100, subsample_size=None):
    """
    Implement Gaussian Random Projection.
    
    Parameters:
    - X: Input data matrix (n_samples x n_features), có thể sparse
    - target_dim: Reduced dimension (k)
    - subsample_size: Nếu != None, subsample random xuống subsample_size samples để giảm kích thước
    
    Returns:
    - dict with 'projected_data', 'time_taken', 'mean_relative_error'
    """
    if subsample_size is not None and subsample_size < X.shape[0]:
        # Random subsample để giảm data
        X = resample(X, n_samples=subsample_size, random_state=42)
    
    # Nếu X sparse, pairwise_distances hỗ trợ, nhưng để error chính xác, toarray() nếu cần
    if hasattr(X, 'toarray'):
        X_dense = X.toarray()
    else:
        X_dense = X

    # Compute original distances (dùng metric='euclidean' cho dense)
    dist_original = pairwise_distances(X_dense, metric='euclidean')
    mask = dist_original > 0

    # Gaussian Projection (hỗ trợ sparse input)
    grp = GaussianRandomProjection(n_components=target_dim, random_state=42)
    start_time = time.time()
    X_projected = grp.fit_transform(X)
    time_taken = time.time() - start_time

    # Compute projected distances and error
    dist_projected = pairwise_distances(X_projected)
    error = np.mean(np.abs(dist_projected[mask] - dist_original[mask]) / dist_original[mask])

    return {
        'projected_data': X_projected,
        'time_taken': time_taken,
        'mean_relative_error': error
    }

# Implement Sparse Random Projection

In [3]:
def apply_sparse_projection(X, target_dim=100, density=None, subsample_size=None):
    """
    Implement Sparse Random Projection (Very Sparse variant as in paper).
    
    Parameters:
    - X: Input data matrix (n_samples x n_features), có thể sparse
    - target_dim: Reduced dimension (k)
    - density: Density for sparsity (if None, use 1/sqrt(D) for Very Sparse)
    - subsample_size: Nếu != None, subsample random xuống subsample_size samples để giảm kích thước
    
    Returns:
    - dict with 'projected_data', 'time_taken', 'mean_relative_error', 'density_used'
    """
    if subsample_size is not None and subsample_size < X.shape[0]:
        # Random subsample để giảm data
        X = resample(X, n_samples=subsample_size, random_state=42)
    
    if density is None:
        density = 1 / np.sqrt(X.shape[1])  # Very Sparse as per paper (s = sqrt(D))
    
    # Nếu X sparse, pairwise_distances hỗ trợ
    if hasattr(X, 'toarray'):
        X_dense = X.toarray()
    else:
        X_dense = X

    # Compute original distances
    dist_original = pairwise_distances(X_dense, metric='euclidean')
    mask = dist_original > 0

    # Sparse Projection (hỗ trợ sparse input)
    srp = SparseRandomProjection(n_components=target_dim, density=density, random_state=42)
    start_time = time.time()
    X_projected = srp.fit_transform(X)
    time_taken = time.time() - start_time

    # Compute projected distances and error
    dist_projected = pairwise_distances(X_projected)
    error = np.mean(np.abs(dist_projected[mask] - dist_original[mask]) / dist_original[mask])

    return {
        'projected_data': X_projected,
        'time_taken': time_taken,
        'mean_relative_error': error,
        'density_used': density
    }

# Output

In [4]:
#20 Newsgroups (high-dim text, phù hợp paper) - subsample để nhỏ
data = fetch_20newsgroups_vectorized(subset='train')
X = data.data  # Giữ sparse

# Optional: Term weighting for heavy-tailed (paper Sec 5)
# X = X.power(0.5)  # Square root (giữ sparse)

# Apply with subsample (e.g., 1000 samples)
gaussian_results = apply_gaussian_projection(X, target_dim=100, subsample_size=1000)
print(f"Gaussian: Time = {gaussian_results['time_taken']:.4f}s, Error = {gaussian_results['mean_relative_error']:.4f}")

sparse_results = apply_sparse_projection(X, target_dim=100, subsample_size=1000)
print(f"Sparse (density={sparse_results['density_used']:.4f}): Time = {sparse_results['time_taken']:.4f}s, Error = {sparse_results['mean_relative_error']:.4f}")

Gaussian: Time = 0.4904s, Error = 0.0560
Sparse (density=0.0028): Time = 0.0989s, Error = 0.1306
