In [2]:
# Cell 1: Import libraries and setup
import os
import json
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random number generator.
np.random.seed(42)

# Root output folder; every result file of this notebook is written below it.
data_dir = "Data"
os.makedirs(data_dir, exist_ok=True)

# One sub-folder per dimensionality-reduction method.
pca_dir, pacmap_dir, umap_dir = (
    os.path.join(data_dir, method_name)
    for method_name in ("PCA", "PACMAP", "UMAP")
)

for directory in (pca_dir, pacmap_dir, umap_dir):
    os.makedirs(directory, exist_ok=True)

print(f"Created output directories in {data_dir}")

Created output directories in Data


In [3]:
# Cell 2: Define data loading function
def _read_image_array(img_path, resize=None):
    """Read one image file into a NumPy array, optionally resizing it first.

    The image is opened in a ``with`` block so its file handle is released
    once the pixels have been copied, instead of staying open until the
    object happens to be garbage collected.
    """
    with Image.open(img_path) as source_img:
        img = source_img.resize(resize) if resize else source_img
        return np.array(img)


def load_biased_mnist(base_path, bias_level='full', split='train',
                      max_samples=None, resize=None, normalize=True):
    """Load images, digit labels and per-image metadata for one split.

    Files read (all relative to ``base_path``):
      * ``split == 'test'``: images from ``test/``; no metadata file is used.
      * any other split: images from ``<bias_level>/trainval/`` and metadata
        from ``<bias_level>/trainval.json``, a list of dicts that must provide
        an ``'index'`` key (and a ``'digit'`` key for images that are loaded).
      * ``split in ('train', 'val')``: the indices to load come from
        ``<split>_ixs.json`` and each image is read from ``<index>.jpg``.
        For every other split value, all ``.jpg``/``.jpeg``/``.png`` files of
        the image folder are loaded in sorted filename order.

    Labels: for 'train'/'val', the ``'digit'`` of the matching metadata entry
    when one exists. Otherwise the numeric filename stem modulo 10.
    NOTE(review): that fallback is a heuristic; confirm it matches the true
    digit for this dataset.

    Args:
        base_path: Root directory of the dataset.
        bias_level: Sub-directory name used for non-test splits.
        split: 'train', 'val' or 'test' (see above for other values).
        max_samples: If not None, keep only the first ``max_samples`` indices
            (train/val) or sorted filenames (otherwise).
        resize: Optional size passed to ``PIL.Image.Image.resize``.
        normalize: If True and at least one image was loaded, cast the images
            to float32 and divide by 255.

    Returns:
        ``(images, labels, image_metadata)``: the stacked image array, an
        int64 label array and a list holding the metadata dict (or None) of
        each loaded image, all in the same order. If the index file or the
        image folder is missing, a warning is printed and
        ``(np.array([]), np.array([]), [])`` is returned.

    Images that are missing or fail to load are reported and skipped.
    """
    if split == 'test':
        image_folder = os.path.join(base_path, 'test')
        json_path = None
    else:
        image_folder = os.path.join(base_path, bias_level, 'trainval')
        json_path = os.path.join(base_path, bias_level, 'trainval.json')

    uses_index_file = split in ('train', 'val')

    if uses_index_file:
        indices_path = os.path.join(base_path, f"{split}_ixs.json")
        try:
            with open(indices_path, 'r') as f:
                indices = json.load(f)
        except FileNotFoundError:
            print(f"Warning: Could not find indices file {indices_path}")
            return np.array([]), np.array([]), []
        if max_samples is not None:
            indices = indices[:max_samples]
    elif not os.path.isdir(image_folder):
        # Mirror the missing-index-file handling instead of letting
        # os.listdir raise further down.
        print(f"Warning: Could not find image folder {image_folder}")
        return np.array([]), np.array([]), []

    metadata_dict = {}
    if json_path and os.path.exists(json_path):
        try:
            with open(json_path, 'r') as f:
                metadata_dict = {item['index']: item for item in json.load(f)}
        except FileNotFoundError:
            # The file can still vanish between the exists() check and open().
            print(f"Warning: Could not find metadata file {json_path}")

    images = []
    labels = []
    image_metadata = []

    if uses_index_file:
        print(f"Loading {len(indices)} {split} images from {image_folder}...")
        for idx in tqdm(indices, desc=f"Loading {split} data"):
            img_path = os.path.join(image_folder, f"{idx}.jpg")
            if not os.path.exists(img_path):
                print(f"Warning: Image {img_path} not found.")
                continue

            try:
                img_array = _read_image_array(img_path, resize)
                if idx in metadata_dict:
                    item = metadata_dict[idx]
                    label = item['digit']
                else:
                    item = None
                    label = int(os.path.basename(img_path).split('.')[0]) % 10
            except Exception as e:
                # Best effort: report the broken sample and keep going.
                print(f"Error loading image {img_path}: {e}")
                continue

            # Append to all three lists together so they always stay aligned.
            images.append(img_array)
            labels.append(label)
            image_metadata.append(item)
    else:
        image_files = sorted(
            f for f in os.listdir(image_folder)
            if f.endswith(('.jpg', '.jpeg', '.png'))
        )
        if max_samples is not None:
            image_files = image_files[:max_samples]

        print(f"Loading {len(image_files)} test images from {image_folder}...")
        for filename in tqdm(image_files, desc="Loading test data"):
            img_path = os.path.join(image_folder, filename)
            try:
                img_array = _read_image_array(img_path, resize)
                label = int(filename.split('.')[0]) % 10
            except Exception as e:
                # Best effort: report the broken sample and keep going.
                print(f"Error loading image {img_path}: {e}")
                continue

            images.append(img_array)
            labels.append(label)
            image_metadata.append(None)

    images = np.array(images)
    labels = np.array(labels, dtype=np.int64)

    if normalize and images.size > 0:
        images = images.astype(np.float32) / 255.0

    print(f"Successfully loaded {len(images)} images with shape {images.shape}")

    if len(labels) > 0:
        label_counts = np.bincount(labels, minlength=10)
        print("Label distribution:")
        for i, count in enumerate(label_counts):
            print(f"  Digit {i}: {count} images ({count/len(labels)*100:.1f}%)")

    return images, labels, image_metadata

In [4]:
# Cell 3: Define summary functions
def create_dataset_summary(datasets, bias_levels):
    """Write a CSV overview of every loaded dataset to ``data_dir``.

    For each entry of ``bias_levels`` that is a key of ``datasets`` the table
    receives one overview row (sample count, per-image shape, number of
    distinct labels, image dtype) followed by one row per digit bin with its
    count and percentage. Bias levels absent from ``datasets`` are skipped.

    Args:
        datasets: Mapping from bias level to a dict with at least the keys
            'images' and 'labels' (NumPy arrays).
        bias_levels: Bias levels to report, in output order.

    The table is saved as ``dataset_summary.csv`` inside the module-level
    ``data_dir``; nothing is returned.
    """
    rows = []

    for level in bias_levels:
        if level not in datasets:
            continue

        entry = datasets[level]
        imgs = entry['images']
        lbls = entry['labels']

        # Overview row for this bias level.
        rows.append({
            'bias_level': level,
            'num_samples': len(imgs),
            'image_shape': str(imgs.shape[1:]),
            'num_classes': len(np.unique(lbls)),
            'data_type': str(imgs.dtype)
        })

        # One row per digit bin (at least ten bins).
        total = len(lbls)
        per_digit = np.bincount(lbls, minlength=10)
        rows.extend(
            {
                'bias_level': level,
                'digit': digit,
                'count': n,
                'percentage': n / total * 100 if total > 0 else 0
            }
            for digit, n in enumerate(per_digit)
        )

    out_path = os.path.join(data_dir, "dataset_summary.csv")
    pd.DataFrame(rows).to_csv(out_path, index=False)
    print(f"Dataset summary saved to {out_path}")

def _combine_method_results(source_dir, file_prefix, output_path, method_label):
    """Concatenate the per-bias-level CSV files of one method into one CSV.

    Only files named ``<file_prefix><bias_level>.csv`` inside ``source_dir``
    are used. They are processed in sorted filename order, because
    ``os.listdir`` returns entries in arbitrary order and the combined file
    should not depend on it. Each frame gets a 'bias_level' column taken from
    its filename. Nothing is written when the directory is missing or holds
    no matching file.
    """
    if not os.path.isdir(source_dir):
        return

    suffix = '.csv'
    file_names = sorted(
        name for name in os.listdir(source_dir)
        if name.startswith(file_prefix) and name.endswith(suffix)
    )
    if not file_names:
        return

    frames = []
    for name in file_names:
        frame = pd.read_csv(os.path.join(source_dir, name))
        # Strip exactly the leading prefix and the trailing extension.
        frame['bias_level'] = name[len(file_prefix):-len(suffix)]
        frames.append(frame)

    pd.concat(frames, ignore_index=True).to_csv(output_path, index=False)
    print(f"Combined {method_label} data saved to {output_path}")


def create_combined_results_files():
    """Merge the per-bias-level result CSVs of each method into one file.

    Reads ``pca_data_*.csv`` from ``pca_dir``, ``pacmap_*.csv`` from
    ``pacmap_dir`` and ``umap_*.csv`` from ``umap_dir`` and writes
    ``combined_pca.csv``, ``combined_pacmap.csv`` and ``combined_umap.csv``
    into ``data_dir``. A method without any matching file is skipped.
    """
    jobs = (
        (pca_dir, 'pca_data_', "combined_pca.csv", "PCA"),
        (pacmap_dir, 'pacmap_', "combined_pacmap.csv", "PACMAP"),
        (umap_dir, 'umap_', "combined_umap.csv", "UMAP"),
    )
    for source_dir, file_prefix, output_name, method_label in jobs:
        _combine_method_results(
            source_dir,
            file_prefix,
            os.path.join(data_dir, output_name),
            method_label,
        )

In [5]:
# Cell 4: PCA Implementation
def perform_pca_and_save(images, labels, bias_level, n_components=None):
    """Standardize the flattened images, run PCA and write the results as CSV.

    Args:
        images: Array whose first axis indexes samples; every sample is
            flattened into one feature vector.
        labels: Per-sample labels, stored next to the projected data.
        bias_level: Tag used in log messages and output filenames.
        n_components: Number of principal components. When None or larger than
            ``min(n_samples, n_features)`` it is replaced by that maximum.

    Files written to ``pca_dir``:
        * ``pca_data_<bias_level>.csv``: up to the first 50 components plus
          the label.
        * ``pca_variance_<bias_level>.csv``: explained and cumulative
          variance ratio per component.
        * ``pca_thresholds_<bias_level>.csv``: components needed to reach 80%,
          90% and 95% cumulative variance. If a threshold is never reached,
          ``n_components`` is recorded instead.
        * ``pca_loadings_<bias_level>.csv``: loadings of up to the first 10
          components.

    Returns:
        ``(pca, images_pca, explained_variance_ratio)``: the fitted PCA
        object, the projected data and the per-component variance ratios.
    """
    print(f"Performing PCA for {bias_level}...")

    sample_count = images.shape[0]
    flat_images = images.reshape(sample_count, -1)
    print(f"Flattened image shape: {flat_images.shape}")

    # PCA cannot yield more components than min(n_samples, n_features).
    component_cap = min(sample_count, flat_images.shape[1])
    if n_components is None or n_components > component_cap:
        n_components = component_cap
        print(f"Setting n_components to maximum possible: {n_components}")

    standardized = StandardScaler().fit_transform(flat_images)

    pca = PCA(n_components=n_components)
    images_pca = pca.fit_transform(standardized)
    print(f"PCA-transformed data shape: {images_pca.shape}")

    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance_ratio)
    print(f"Total variance explained by {n_components} components: {cumulative_variance[-1]:.4f}")

    variance_thresholds = {}
    for threshold in (0.8, 0.9, 0.95):
        reached = cumulative_variance >= threshold
        if reached.any():
            # argmax returns the first True position (0-based).
            needed = np.argmax(reached) + 1
        else:
            needed = n_components
        variance_thresholds[threshold] = needed
        print(f"Components needed for {threshold*100:.0f}% variance: {needed}")

    # Projected data: keep at most the first 50 components.
    kept = min(50, images_pca.shape[1])
    pca_data = pd.DataFrame(
        images_pca[:, :kept],
        columns=[f'PC{k+1}' for k in range(kept)],
    )
    pca_data['label'] = labels
    pca_data.to_csv(os.path.join(pca_dir, f"pca_data_{bias_level}.csv"), index=False)

    # Per-component variance table.
    variance_path = os.path.join(pca_dir, f"pca_variance_{bias_level}.csv")
    pd.DataFrame({
        'component': range(1, len(explained_variance_ratio) + 1),
        'explained_variance_ratio': explained_variance_ratio,
        'cumulative_variance': cumulative_variance
    }).to_csv(variance_path, index=False)

    # Threshold table.
    threshold_rows = []
    for level, count in variance_thresholds.items():
        threshold_rows.append(
            {'bias_level': bias_level, 'threshold': level, 'components_needed': count}
        )
    thresholds_path = os.path.join(pca_dir, f"pca_thresholds_{bias_level}.csv")
    pd.DataFrame(threshold_rows).to_csv(thresholds_path, index=False)

    # Loadings of at most the first 10 components (row index = component name).
    loading_rows = min(10, pca.components_.shape[0])
    loadings_df = pd.DataFrame(
        pca.components_[:loading_rows, :],
        index=[f'PC{k+1}' for k in range(loading_rows)],
    )
    loadings_df.to_csv(os.path.join(pca_dir, f"pca_loadings_{bias_level}.csv"))

    print(f"PCA results for {bias_level} saved to {pca_dir}")

    return pca, images_pca, explained_variance_ratio

In [6]:
# Cell 5: PACMAP Implementation
def perform_pacmap_and_save(images, labels, bias_level, random_state=42):
    """Embed the standardized, flattened images in 3-D with PaCMAP and save it.

    Args:
        images: Array whose first axis indexes samples; every sample is
            flattened into one feature vector.
        labels: Per-sample labels, stored next to the embedding.
        bias_level: Tag used in log messages and the output filename.
        random_state: Seed handed to ``pacmap.PaCMAP`` so repeated runs are
            seeded the same way as the UMAP embedding in this notebook. Pass
            None to run PaCMAP without a ``random_state``, as before this
            parameter existed.

    Returns:
        The embedding array returned by ``fit_transform``, or None when the
        optional ``pacmap`` package cannot be imported.

    The embedding is written to ``pacmap_<bias_level>.csv`` inside
    ``pacmap_dir`` with the columns PACMAP1, PACMAP2, PACMAP3 and label.
    """
    # Guard only the optional import, so an ImportError raised later by the
    # computation is not misreported as "PACMAP not installed".
    try:
        import pacmap
    except ImportError:
        print("PACMAP not installed. To install, run: pip install pacmap")
        return None

    print(f"Performing PACMAP for {bias_level}...")

    n_samples = images.shape[0]
    images_flattened = images.reshape(n_samples, -1)

    scaler = StandardScaler()
    images_scaled = scaler.fit_transform(images_flattened)

    embedder = pacmap.PaCMAP(
        n_components=3,
        n_neighbors=10,
        MN_ratio=0.5,
        FP_ratio=2.0,
        random_state=random_state,
    )
    embedding = embedder.fit_transform(images_scaled)

    # Save results
    embedding_df = pd.DataFrame(embedding, columns=['PACMAP1', 'PACMAP2', 'PACMAP3'])
    embedding_df['label'] = labels

    embedding_df.to_csv(os.path.join(pacmap_dir, f"pacmap_{bias_level}.csv"), index=False)
    print(f"PACMAP results for {bias_level} saved to {pacmap_dir}")

    return embedding

In [7]:
# Cell 6: UMAP Implementation
def perform_umap_and_save(images, labels, bias_level, random_state=42):
    """Embed the standardized, flattened images in 3-D with UMAP and save it.

    Args:
        images: Array whose first axis indexes samples; every sample is
            flattened into one feature vector.
        labels: Per-sample labels, stored next to the embedding.
        bias_level: Tag used in log messages and the output filename.
        random_state: Seed handed to ``umap.UMAP``; the default keeps the
            value that used to be hard-coded.

    Returns:
        The embedding array returned by ``fit_transform``, or None when the
        optional ``umap`` package (installed as ``umap-learn``) cannot be
        imported.

    The embedding is written to ``umap_<bias_level>.csv`` inside ``umap_dir``
    with the columns UMAP1, UMAP2, UMAP3 and label.
    """
    # Guard only the optional import, so an ImportError raised later by the
    # computation is not misreported as "UMAP not installed".
    try:
        import umap
    except ImportError:
        print("UMAP not installed. To install, run: pip install umap-learn")
        return None

    print(f"Performing UMAP for {bias_level}...")

    n_samples = images.shape[0]
    images_flattened = images.reshape(n_samples, -1)

    scaler = StandardScaler()
    images_scaled = scaler.fit_transform(images_flattened)

    reducer = umap.UMAP(
        n_components=3,
        n_neighbors=15,
        min_dist=0.1,
        random_state=random_state,
    )
    embedding = reducer.fit_transform(images_scaled)

    # Save results
    embedding_df = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2', 'UMAP3'])
    embedding_df['label'] = labels

    embedding_df.to_csv(os.path.join(umap_dir, f"umap_{bias_level}.csv"), index=False)
    print(f"UMAP results for {bias_level} saved to {umap_dir}")

    return embedding

In [9]:
# Cell 7: Main execution function
def main(base_path="biased_mnist", bias_levels=None, max_samples=1000,
         resize=(28, 28)):
    """Run the whole pipeline: load each bias level, reduce it, summarize.

    For every bias level the 'train' split is loaded with
    ``load_biased_mnist``; levels that yield no images are skipped. PCA,
    PACMAP and UMAP are then run on the loaded images. Finally the dataset
    summary and the combined per-method result files are written.

    Args:
        base_path: Dataset root passed to ``load_biased_mnist``.
        bias_levels: Bias levels to process; None selects
            'full_0.1', 'full_0.5' and 'full_0.99'.
        max_samples: Maximum number of images loaded per bias level.
        resize: Size every image is resized to while loading.

    Calling ``main()`` without arguments reproduces the previously hard-coded
    configuration.
    """
    if bias_levels is None:
        bias_levels = ['full_0.1', 'full_0.5', 'full_0.99']

    datasets = {}

    for bias_level in bias_levels:
        print(f"\nProcessing bias level: {bias_level}")

        # Load data
        images, labels, metadata = load_biased_mnist(
            base_path,
            bias_level=bias_level,
            split='train',
            max_samples=max_samples,
            resize=resize
        )

        if len(images) == 0:
            print(f"No images found for bias level {bias_level}, skipping.")
            continue

        datasets[bias_level] = {
            'images': images,
            'labels': labels,
            'metadata': metadata
        }

        # Perform dimensionality reduction techniques
        perform_pca_and_save(images, labels, bias_level)
        perform_pacmap_and_save(images, labels, bias_level)
        perform_umap_and_save(images, labels, bias_level)

    # Create summary files
    create_dataset_summary(datasets, bias_levels)
    create_combined_results_files()

    print("\nAll processing complete. Data saved to the 'Data' directory.")

In [10]:
# Cell 8: Execution
# Run the main function if executed directly
# Start the pipeline when this code is the top-level program; otherwise only
# print a hint on how to start it by hand.
if __name__ != "__main__":
    print("Run main() to process the dataset")
else:
    main()


Processing bias level: full_0.1
Loading 1000 train images from biased_mnist/full_0.1/trainval...


Loading train data: 100%|█████████████████| 1000/1000 [00:00<00:00, 1244.06it/s]


Successfully loaded 1000 images with shape (1000, 28, 28, 3)
Label distribution:
  Digit 0: 92 images (9.2%)
  Digit 1: 109 images (10.9%)
  Digit 2: 98 images (9.8%)
  Digit 3: 96 images (9.6%)
  Digit 4: 118 images (11.8%)
  Digit 5: 78 images (7.8%)
  Digit 6: 108 images (10.8%)
  Digit 7: 95 images (9.5%)
  Digit 8: 90 images (9.0%)
  Digit 9: 116 images (11.6%)
Performing PCA for full_0.1...
Flattened image shape: (1000, 2352)
Setting n_components to maximum possible: 1000
PCA-transformed data shape: (1000, 1000)
Total variance explained by 1000 components: 1.0000
Components needed for 80% variance: 157
Components needed for 90% variance: 293
Components needed for 95% variance: 438
PCA results for full_0.1 saved to Data/PCA


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
Note: `n_components != 2` have not been thoroughly tested.


Performing PACMAP for full_0.1...
PACMAP results for full_0.1 saved to Data/PACMAP
Performing UMAP for full_0.1...


  warn(


UMAP results for full_0.1 saved to Data/UMAP

Processing bias level: full_0.5
Loading 1000 train images from biased_mnist/full_0.5/trainval...


Loading train data: 100%|█████████████████| 1000/1000 [00:00<00:00, 1232.50it/s]


Successfully loaded 1000 images with shape (1000, 28, 28, 3)
Label distribution:
  Digit 0: 92 images (9.2%)
  Digit 1: 109 images (10.9%)
  Digit 2: 98 images (9.8%)
  Digit 3: 96 images (9.6%)
  Digit 4: 118 images (11.8%)
  Digit 5: 78 images (7.8%)
  Digit 6: 108 images (10.8%)
  Digit 7: 95 images (9.5%)
  Digit 8: 90 images (9.0%)
  Digit 9: 116 images (11.6%)
Performing PCA for full_0.5...
Flattened image shape: (1000, 2352)
Setting n_components to maximum possible: 1000


Note: `n_components != 2` have not been thoroughly tested.


PCA-transformed data shape: (1000, 1000)
Total variance explained by 1000 components: 1.0000
Components needed for 80% variance: 145
Components needed for 90% variance: 275
Components needed for 95% variance: 418
PCA results for full_0.5 saved to Data/PCA
Performing PACMAP for full_0.5...
PACMAP results for full_0.5 saved to Data/PACMAP
Performing UMAP for full_0.5...


  warn(


UMAP results for full_0.5 saved to Data/UMAP

Processing bias level: full_0.99
Loading 1000 train images from biased_mnist/full_0.99/trainval...


Loading train data: 100%|█████████████████| 1000/1000 [00:00<00:00, 1333.37it/s]


Successfully loaded 1000 images with shape (1000, 28, 28, 3)
Label distribution:
  Digit 0: 92 images (9.2%)
  Digit 1: 109 images (10.9%)
  Digit 2: 98 images (9.8%)
  Digit 3: 96 images (9.6%)
  Digit 4: 118 images (11.8%)
  Digit 5: 78 images (7.8%)
  Digit 6: 108 images (10.8%)
  Digit 7: 95 images (9.5%)
  Digit 8: 90 images (9.0%)
  Digit 9: 116 images (11.6%)
Performing PCA for full_0.99...
Flattened image shape: (1000, 2352)
Setting n_components to maximum possible: 1000


Note: `n_components != 2` have not been thoroughly tested.


PCA-transformed data shape: (1000, 1000)
Total variance explained by 1000 components: 1.0000
Components needed for 80% variance: 91
Components needed for 90% variance: 196
Components needed for 95% variance: 324
PCA results for full_0.99 saved to Data/PCA
Performing PACMAP for full_0.99...
PACMAP results for full_0.99 saved to Data/PACMAP
Performing UMAP for full_0.99...


  warn(


UMAP results for full_0.99 saved to Data/UMAP
Dataset summary saved to Data/dataset_summary.csv
Combined PCA data saved to Data/combined_pca.csv
Combined PACMAP data saved to Data/combined_pacmap.csv
Combined UMAP data saved to Data/combined_umap.csv

All processing complete. Data saved to the 'Data' directory.
