## Estimate Intrinsic Dimension of Wikipedia Embeddings

In [None]:
# Get Cohere Wikipedia embeddings from Hugging Face

from huggingface_hub import login
from datasets import load_dataset

import time

# Login to Hugging Face
# NOTE: the token below is a placeholder; substitute your own and avoid
# committing a real token together with the notebook.
login(token="YourOwnHuggingFaceToken")

print("Loading the complete Cohere Wikipedia embeddings dataset...")
start_time = time.time()

# Load the whole "train" split in non-streaming mode.
# NOTE(review): an earlier comment said this loads everything into memory at
# once; Hugging Face `datasets` normally memory-maps its on-disk Arrow cache
# instead of holding the data in RAM -- confirm for the installed version.
docs = load_dataset("Cohere/wikipedia-22-12-en-embeddings", split="train")

loading_time = time.time() - start_time
print(f"Dataset loaded in {loading_time:.2f} seconds")
# Because split="train" is requested, `docs` is the split itself; its row
# count is len(docs), not len(docs['train']).

In [None]:
# Convert embedding vectors from dataset to numpy array

import numpy as np
import os

# Export the 'emb' column of `docs` to a series of .npy files, one per chunk,
# plus a small metadata.txt describing the layout.

# Get the total number of rows
num_rows = len(docs)
print(f"Total number of rows: {num_rows}")

# Width of every embedding vector. Kept in one place so the array allocation
# and the metadata file cannot drift apart.
embedding_dim = 768

# Define chunk size for processing and saving (one chunk == one .npy file)
# Each embedding is 768 * 4 bytes (float32) ≈ 3 KB
# Processing 1M embeddings at a time ≈ 3 GB of memory
chunk_size = 1000000
num_chunks = (num_rows + chunk_size - 1) // chunk_size  # ceiling division

# Rows pulled from the dataset per read; loop-invariant, so defined once here
batch_size = 10000

output_dir = "D:/WikipediaEmbeddings_Cohere/wikipedia_embeddings_chunks"
os.makedirs(output_dir, exist_ok=True)

print("Processing and saving embeddings in chunks...")
extract_start = time.time()

for chunk_idx in range(num_chunks):
    start_idx = chunk_idx * chunk_size
    end_idx = min(start_idx + chunk_size, num_rows)  # last chunk may be short
    current_size = end_idx - start_idx

    print(f"Processing chunk {chunk_idx+1}/{num_chunks} (rows {start_idx} to {end_idx-1})...")

    # Process this chunk in smaller batches
    num_batches = (current_size + batch_size - 1) // batch_size

    # Initialize array just for this chunk
    chunk_embeddings = np.zeros((current_size, embedding_dim), dtype=np.float32)

    for batch_idx in range(num_batches):
        batch_start = start_idx + batch_idx * batch_size
        batch_end = min(batch_start + batch_size, end_idx)

        # Take the rows [batch_start, batch_end) as a sub-dataset
        batch = docs.select(range(batch_start, batch_end))

        # Assumes 'emb' is the embedding column and each entry holds
        # `embedding_dim` floats -- TODO confirm against the dataset card
        batch_embeddings = np.array(batch['emb'], dtype=np.float32)

        # Store in the chunk array (indices relative to the chunk start)
        local_start = batch_start - start_idx
        local_end = batch_end - start_idx
        chunk_embeddings[local_start:local_end] = batch_embeddings

        # Report every 10th batch and the final one
        if (batch_idx + 1) % 10 == 0 or (batch_idx + 1) == num_batches:
            print(f"  Processed batch {batch_idx+1}/{num_batches}")

    # Save this chunk to disk
    chunk_filename = os.path.join(output_dir, f"embeddings_chunk_{chunk_idx:03d}.npy")
    np.save(chunk_filename, chunk_embeddings)
    print(f"Saved chunk {chunk_idx+1} to {chunk_filename}")

    # Free memory before the next chunk is allocated
    del chunk_embeddings

print(f"All processing completed in {time.time() - extract_start:.2f} seconds")

# Create a metadata file with information about the chunks
with open(os.path.join(output_dir, "metadata.txt"), "w", encoding="utf-8") as f:
    f.write(f"Total embeddings: {num_rows}\n")
    f.write(f"Embedding dimensions: {embedding_dim}\n")
    f.write(f"Number of chunks: {num_chunks}\n")
    f.write(f"Chunk size: {chunk_size}\n")
    f.write("Data type: float32\n")
    f.write("File format: NumPy .npy\n")

# OPTIONAL: To recover disk space on SSD C:, manually delete downloaded dataset here after execution of this cell: C:\Users\{username}\.cache\huggingface\datasets\

In [None]:
# Bring embeddings into SSD C: from slower USB drive D:

import shutil

# dirs_exist_ok=True (available from Python 3.8) lets the copy proceed even
# when the destination folder is already there.
shutil.copytree(
    "D:/WikipediaEmbeddings_Cohere/wikipedia_embeddings_chunks",
    "C:/Users/rozmu/Documents/JupyterLabStuff/IntrinsicDimWikipedia/wikipedia_embeddings_chunks",
    dirs_exist_ok=True,
)

In [None]:
# Get randomly sampled X% of embedding vectors

import numpy as np
import os
import random
from pathlib import Path

# Share of vectors to sample, in percent units: the sampler divides this by
# 100, so 0.1 means 0.1% of all vectors (one in a thousand).
X = 0.1

def reservoir_sample_vectors(directory, sample_percentage, seed=None):
    """
    Draw a uniform random sample of row vectors from the .npy files in a folder.

    The files are opened memory-mapped and scanned once with reservoir
    sampling (Algorithm R): the first ``sample_count`` rows fill the reservoir
    in order, and every later row replaces a random reservoir slot with
    probability ``sample_count / rows_seen``. Only the reservoir itself
    (``sample_count`` rows) is held in memory.

    Args:
        directory: Folder containing 2-D arrays saved as ``.npy`` files. All
            files must share the same number of columns.
        sample_percentage: Size of the sample as a percentage (0-100) of the
            total number of rows across all files.
        seed: Optional seed. When given, both ``random`` and ``np.random``
            are seeded so repeated calls return the same sample.

    Returns:
        Array of shape ``(n, dim)`` with the sampled rows, where ``n`` is
        ``int(total_rows * sample_percentage / 100)`` (capped at the number
        of rows available) and ``dim`` is the column count of the files.

    Raises:
        ValueError: If ``directory`` contains no ``.npy`` files.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # Sorted so the scan order -- and therefore a seeded sample -- does not
    # depend on the order in which the filesystem lists the files.
    npy_files = sorted(Path(directory).glob("*.npy"))
    if not npy_files:
        raise ValueError(f"No .npy files found in {directory}")

    # First pass: count total vectors (memory-mapped, so no data is read)
    total_vectors = sum(
        np.load(file_path, mmap_mode='r').shape[0] for file_path in npy_files
    )

    sample_count = int(total_vectors * sample_percentage / 100)
    print(f"Reservoir sampling {sample_count:,} vectors from {total_vectors:,} total")

    reservoir = None
    filled = 0          # number of reservoir slots holding real data so far
    vectors_seen = 0

    for file_path in npy_files:
        arr = np.load(file_path, mmap_mode='r')

        if reservoir is None:
            # Allocate once, taking width and dtype from the data itself
            reservoir = np.zeros((sample_count, arr.shape[1]), dtype=arr.dtype)

        for i in range(arr.shape[0]):
            vectors_seen += 1

            if filled < sample_count:
                # Fill phase: the first sample_count rows go in unconditionally
                reservoir[filled] = arr[i]
                filled += 1
            else:
                # Replace with probability sample_count/vectors_seen
                j = random.randint(0, vectors_seen - 1)
                if j < sample_count:
                    reservoir[j] = arr[i]

        print(f"Processed {file_path.name}: {vectors_seen:,} vectors seen")

    # Only the filled prefix holds sampled rows (matters when fewer rows were
    # available than requested).
    return reservoir[:filled]

# Sample X percent of the vectors stored in the "wikipedia_embeddings_chunks"
# folder (a relative path, so it is resolved against the notebook's working
# directory); the fixed seed is passed so the random draws are repeatable.
sampled_data = reservoir_sample_vectors("wikipedia_embeddings_chunks", X, seed=54)

In [None]:
# Estimate intrinsic dimension of sampled vectors by correlation dimension method.

import skdim
import os

# NOTE(review): presumably caps the core count reported to joblib/loky (used
# by scikit-learn under skdim) -- confirm this is still needed.
os.environ['LOKY_MAX_CPU_COUNT'] = '1'

# Run the correlation-dimension estimator over four widening neighbour
# ranges centred on 50% of the sample: 40-60%, 30-70%, 20-80%, 10-90%.
Sample_Count = sampled_data.shape[0]
for i in range(4):
    # Work in integer tenths: 0.5-(i+1)*0.1 accumulates float error
    # (e.g. 0.19999999999999996), which both clutters the printout and can
    # truncate the neighbour count one below the intended value.
    tenths_smallest = 4 - i
    tenths_largest = 6 + i
    proportion_smallest = tenths_smallest / 10
    proportion_largest = tenths_largest / 10
    ID_Estimator = skdim.id.CorrInt(
        k1=Sample_Count * tenths_smallest // 10,  # smaller neighbour count
        k2=Sample_Count * tenths_largest // 10,   # larger neighbour count
    )
    IntrinsicDim = ID_Estimator.fit_transform(sampled_data)
    print(proportion_smallest, proportion_largest, IntrinsicDim)