# UMAP SSL Baselines

In [10]:
import torch
import pandas as pd
from sklearn import datasets as sk_datasets
from sklearn.decomposition import PCA
from tqdm import tqdm
from umap import UMAP

from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
from torchvision import datasets, transforms, models
from torchvision.models import resnet

import os
import csv
from tabulate import tabulate
import time
import rarfile
import pickle

import numpy as np
from matplotlib import pyplot as plt
from IPython.core.debugger import set_trace

from CifarResnet import CifarResnet

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
# import sys
# !{sys.executable} -m pip install rarfile

# Load Data

In [7]:
# Cifar
cifar_data_loc = "../data/cifar10"
cifar_transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                      transforms.RandomRotation(30),
                                      transforms.ToTensor(),
                                      transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
cifar_test_transform = transforms.ToTensor()

if not os.path.exists(cifar_data_loc):
    os.makedirs(cifar_data_loc)
    
cifar10_train = datasets.CIFAR10(cifar_data_loc, train=True, transform=cifar_transform, download=True)
cifar10_test = datasets.CIFAR10(cifar_data_loc, train=False, transform=cifar_test_transform, download=True)

class CustomDataset(Dataset):
    def __init__(self, items=None, labels=None):
        """Initializes the dataset
        Args:
            items (list of Tensor): a list of dataset items in Tensor format
            labels (list of Tensor): a list of labels in Tensor format
        """
        if items is not None:
            self.items = items
            self.labels = labels
        else:
            self.items = []
            self.labels = []
            
    def __getitem__(self, i):
        return self.items[i], self.labels[i]
    
    def __len__(self):
        return len(self.items)
    
    def append(self, item, label):
        self.items.append(item)
        self.labels.append(label)

Files already downloaded and verified
Files already downloaded and verified


# Define Embeddings

In [None]:
def get_identity_embedding():
    """As a baseline, create an 'embedding' that just returns the item itself
    Returns:
        embedding (nn.Module): an identity embedding
        embed_time (float): the time in seconds to train the embedding (0)
    """
    class IdentityEmbedding(nn.Module):
        def __init__(self):
            super(IdentityEmbedding, self).__init__()
            
        def forward(self, x):
            return x
        
    embedding = IdentityEmbedding()
    embed_time = 0
    
    return embedding, embed_time

def get_umap_embedding(data_unlabeled):
    """Trains a data embedding using UMAP
    Args:
        data_unlabeled (Dataset): the data used to train the embedding
    Returns:
        embedding (nn.Module): a trained embedding using the unlabeled samples
            from the dataset
        embed_time (float): the time in seconds to train the embedding
    """
    num_items = len(data_unlabeled)
    item_dim = len(data_unlabeled[0][0].view(-1))
    
    data_shape = data_unlabeled[0][0].size()
    data_matrix = np.zeros((num_items, item_dim))
    
    for i, (item, _) in enumerate(data_unlabeled):
        data_matrix[i, :] = item.view(-1).numpy()
        
    print("Loaded UMAP data")
    
    class UMAPEmbedding():
        def __init__(self):
            self.umap = UMAP()
            self.umap.fit_transform(data_matrix)
            
        def __call__(self, x):
            x = x.view(1, -1).numpy()
            return self.umap.transform(x).reshape(data_shape)
        
    start = time.time()
    embedding = UMAPEmbedding()
    embed_time = time.time() - start
    print("Embedded UMAP data")
    
    return embedding, embed_time


def get_umap_cheby_embedding(data_unlabeled):
    """Trains a data embedding using UMAP
    Args:
        data_unlabeled (Dataset): the data used to train the embedding
    Returns:
        embedding (nn.Module): a trained embedding using the unlabeled samples
            from the dataset
        embed_time (float): the time in seconds to train the embedding
    """
    class IdentityEmbedding(nn.Module):
        def __init__(self):
            super(IdentityEmbedding, self).__init__()
        def forward(self, x):
            return x
        
    embedding = IdentityEmbedding()
    embed_time = 0
    
    return embedding, embed_time

def get_vae_embedding(data_unlabeled):
    """Trains a data embedding using UMAP
    Args:
        data_unlabeled (Dataset): the data used to train the embedding
    Returns:
        embedding (nn.Module): a trained embedding using the unlabeled samples
            from the dataset
        embed_time (float): the time in seconds to train the embedding
    """
    class IdentityEmbedding(nn.Module):
        def __init__(self):
            super(IdentityEmbedding, self).__init__()
        def forward(self, x):
            return x
        
    embedding = IdentityEmbedding()
    embed_time = 0
    
    return embedding, embed_time

    
def get_pca_embedding(data_unlabeled):
    """Trains a data embedding using PCA
    Args:
        data_unlabeled (Dataset): the data used to train the embedding
    Returns:
        embedding (nn.Module): a trained embedding using the unlabeled samples
            from the dataset
        embed_time (float): the time in seconds to train the embedding
    """
    num_items = len(data_unlabeled)
    item_dim = len(data_unlabeled[0][0].view(-1))
    print(num_items, item_dim)
    self.break_the_code()
    data_shape = data_unlabeled[0][0].size()
    data_matrix = np.zeros(num_items, item_dim)
    data_matrix = []
    
    for i, (item, _) in enumerate(data_unlabeled):
        data_matrix[i, :] = item.view(-1).numpy()
        
    data_matrix = np.array(data_matrix)
    
    class PCAEmbedding():
        def __init__(self):
            self.pca = PCA(whiten=True)
            self.pca.fit_transform(data_matrix)
            
        def __call__(self, x):
            x = x.view(1, -1).numpy()
            return self.pca.transform(x).reshape(data_shape)
        
    start = time.time()
    embedding = PCAEmbedding()
    embed_time = time.time() - start
    
    return embedding, embed_time

In [12]:
def split_training_data(train, labeled=.1):
    """Splits the train data into labeled/unlabeled"""
    
    n_points = len(train)
    labeled_split = int(n_points * labeled)
    unlabeled_split = n_points - labeled_split
    labeled_data, unlabeled_data = random_split(train, [labeled_split, unlabeled_split])
    
    return labeled_data, unlabeled_data


def get_embeddings(embedding_names, unlabeled_data):
    """Trains embeddings and records how long each embedding took
    Args:
        embedding_names (list of str): a list of names of embedding techniques
        unlabeled_data (Dataset): a Dataset object holding the unlabeled data
    Returns:
        embeddings (list of nn.Module): a list of the embeddings
        embed_times (list of float): a list of corresponding times for training 
            the embeddings
    """
    embeddings = []
    embed_times = []
    
    for name in embedding_names:
        if name == 'umap':
            embedding, embed_time = get_umap_embedding(unlabeled_data)
            embeddings.append(embedding); embed_times.append(embed_time)
        elif name == 'tsne':
            embedding, embed_time = get_tsne_embedding(unlabeled_data)
            embeddings.append(embedding); embed_times.append(embed_time)
        elif name == 'pca':
            embedding, embed_time = get_pca_embedding(unlabeled_data)
            embeddings.append(embedding); embed_times.append(embed_time)
        elif name == 'umap-cheby':
            embedding, embed_time = get_umap_cheby_embedding(unlabeled_data)
            embeddings.append(embedding); embed_times.append(embed_time)
        elif name == 'vae':
            embedding, embed_time = get_vae_embedding(unlabeled_data)
            embeddings.append(embedding); embed_times.append(embed_time)
        elif name == 'none':
            # Get an identity embedding as a baseline
            embedding, embed_time = get_identity_embedding()
            embeddings.append(embedding); embed_times.append(embed_time)
        else:
            raise NameError(f"{name} is not a valid embedding")
            
    return embeddings, embed_times

def embed_data(embeddings, data):
    """Embeds the data with learned embeddings
    Args:
        embeddings (list of nn.Module): a list of learned embeddings
        labeled_data (Dataset): the dataset we will train on
    Returns:
        data_embedded (list of Dataset): a list of Dataset objects with the
            embedding and the original label
    """
    data_embedded = []
    
    dataloader = DataLoader(data, 
                                shuffle=False,
                                batch_size=1,
                                pin_memory=True)
    
    for embedding in embeddings:
        
        i = 0
        
        new_dataset = CustomDataset()
        
        for item, label in dataloader:
            
            item_embedded = embedding(item).detach()
            new_dataset.append(item_embedded, label)
            
            i += 1
            if i == 10:
                break
            
        data_embedded.append(new_dataset)
        
    return data_embedded

def get_algorithms(algorithm_names, output_dim):
    """Gets a list of specified algorithms
    Args:
        algorithms (list of str): a list of names of algorithms
        unlabeled_data (Dataset): a Dataset object holding the unlabeled data
    Returns:
        algorithms (list of algorithms): a list of the algorithms
    """
    
    algorithms = []
    
    for algorithm in algorithm_names:
        if algorithm == 'pi-model':
            algorithms.append(PiModel(output_dim))
        elif algorithm == 'self-training':
            algorithms.append(SelfTraining(output_dim))
        elif algorithm == 'cluster-label':
            algorithms.append(ClusterLabel(output_dim))
        elif algorithm == 'label-propagation':
            algorithms.append(LabelPropagation(output_dim))
        if algorithm == 'supervised':
            algorithms.append(SupervisedAlgorithm(output_dim, pretrained=False))
        elif algorithm == 'supervised-pretrained':
            algorithms.append(SupervisedAlgorithm(output_dim, pretrained=True))
        else:
            assert False, "Algorithm not found in list, {}".format(algorithm)
    return algorithms

def pickle_results(results_dir, avg_acc, avg_embed_time, algorithms, embeddings):
    """Save the accuracy and embedding time results as pickle files, as well as
    the algorithms and embeddings tested.
    Args:
        results_dir (str): the directory to save the results in
        avg_acc ((m, n) ndarray): the accuracies for each of the m algorithms
            with the n embeddings
        avg_embed_time ((n,) ndarray): the time to learn the embedding for each
            of the n embeddings
        algorithms (list of str): the algorithms tested
        embeddings (list of str): the embeddings tested
    """
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    
    pickle.dump( avg_acc, open( os.path.join(results_dir, "avg_acc_grid.p"), "wb" ) )
    pickle.dump( avg_embed_time, open( os.path.join(results_dir, "avg_embed_times.p"), "wb" ) )
    pickle.dump( algorithms, open( os.path.join(results_dir, "algorithms.p"), "wb" ) )
    pickle.dump( embeddings, open( os.path.join(results_dir, "embeddings.p"), "wb" ) )
    
def display_results(results_dir):
    """Load and display the accuracy and embedding time results
    Args:
        results_dir (str): the directory holding the results
    """
    avg_acc = pickle.load( open( os.path.join(results_dir, "avg_acc_grid.p"), "rb" ) )
    avg_embed_time = pickle.load( open( os.path.join(results_dir, "avg_embed_times.p"), "rb" ) )
    algorithms = pickle.load( open( os.path.join(results_dir, "algorithms.p"), "rb" ) )
    embeddings = pickle.load( open( os.path.join(results_dir, "embeddings.p"), "rb" ) )
    
    print("Accuracies")
    print(tabulate(avg_acc, headers=embeddings, showindex=algorithms))
    print("Embedding Times")
    print(tabulate(avg_embed_time, headers=embeddings))

In [14]:
def test_embedding():
    train_data = cifar10_train
    test_data = cifar10_test
    output_dim = 10
    # Proportion of labeled data
    labeled_pct = 0.1
    num_trials = 10
    epochs = 1
    results_dir = './results/basic_test'
    
    algorithm_names = ['supervised']
    
    embedding_names = ['umap']
    
    accuracies = []
    embed_times = []

        
    # Note that we select a new labeled/unlabeled split with each trial
    labeled_data, unlabeled_data = split_training_data(train_data, labeled_pct)

    embeddings, embed_times = get_embeddings(embedding_names, unlabeled_data)
    print("embedding time: ", embed_times[0])
    embedding = embeddings[0]
    print(embedding(train_data[0][0]).shape)
    # We possibly don't want nn.modules - we don't want to backprop on them
    # fix picking k. For some rea
test_embedding()

Here
Here
embedding time:  89.50514793395996


ValueError: cannot reshape array of size 2 into shape (3,32,32)

# Experiment Setup

In [13]:
def run_experiments():
    train_data = cifar10_train
    test_data = cifar10_test
    output_dim = 10
    # Proportion of labeled data
    labeled_pct = 0.1
    num_trials = 10
    epochs = 1
    results_dir = './results/basic_test'
    
#     algorithm_names = ['pi-model', 'self-training', 'cluster-label', 
#                   'label-propagation', 'supervised', 'supervised-pretrained']
#     embedding_names = ['umap', 'tsne', 'pca', 'umap-cheby', 'vae', 'none']
    algorithm_names = ['supervised']
    embedding_names = ['umap']
    
    accuracies = []
    embed_times = []

    
    for trial in range(num_trials):
        
        # Note that we select a new labeled/unlabeled split with each trial
        labeled_data, unlabeled_data = split_(train_data, labeled_pct)
        
        embeddings, trial_embed_times = get_embeddings(embedding_names, unlabeled_data)
        embed_times.append(trial_embed_times)
        print(f"Trial: {trial}, Embeddings Obtained")
        
        embedded_train_labeled = embed_data(embeddings, labeled_data)
        embedded_train_unlabeled = embed_data(embeddings, unlabeled_data)
        embedded_test_data = embed_data(embeddings, test_data)
        print(f"Trial: {trial}, Data Embedded")
        
        algorithms = get_algorithms(algorithm_names, output_dim)
        print(f"Trial: {trial}, Algorithms Obtained")
        
        # trial_accuracies is a list of list. trial_accuracies[i][j] denotes the 
        # accuracy of the ith algorithm on the jth embedding
        trial_accuracies = np.zeros((len(algorithms), len(embeddings)))
        
        for i, algorithm in enumerate(algorithms):
                        
            for j, embedding in enumerate(embeddings):
                
                loop = tqdm(total=epochs, position=0)
                loop.set_description(f"Trial: {trial} | Algorithm: {algorithm_names[i]} | Embedding: {embedding_names[j]}.")
                algorithm.reset()
                accuracy = algorithm.train(embedded_train_labeled[j], 
                                           embedded_train_unlabeled[j],
                                           embedded_test_data[j],
                                           epochs,
                                           loop)
                trial_accuracies[i][j] = accuracy
                
        accuracies.append(trial_accuracies)
        
    avg_acc = np.average(np.dstack(accuracies), axis=2)
    avg_embed_time = np.average(np.array(embed_times), axis=0).reshape(1, -1)
    
    pickle_results(results_dir, avg_acc, avg_embed_time, algorithm_names, embedding_names)
    display_results(results_dir)
    
run_experiments()

Here
Here
Trial: 0, Embeddings Obtained


ValueError: cannot reshape array of size 2 into shape (3,32,32)