In [1]:
from IPython.display import Markdown, display
import pandas as pd
import numpy as np
import math
import sklearn.preprocessing as preprocessing
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA 
from sklearn.decomposition import FactorAnalysis
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from sklearn.manifold import Isomap
import time
# import umap

def printmd(string):
    display(Markdown(string))

## Function for Standardizing the Data

In [2]:
def preprocessing(data, columns_to_remove):
    '''
    columns_to_remove --->
    13_dim_data - 'label'
    32_dim_data - ['Time', 'Cell_length', 'DNA1', 'DNA2', 'Viability', 'file_number', 'event_number', 'individual', 'label']
    
    '''
    data_labeled = data[data.label.notnull()]     # Labeled data (Remove rows without Label)
    data_data = data_labeled.drop(columns_to_remove, axis = 1) # labeled data without labels
    data_labels = data_labeled['label']
    
    scaler = StandardScaler()
    
    std_data_data_array = scaler.fit_transform(data_data)
    std_data_data = pd.DataFrame(data= std_data_data_array, columns = data_data.columns).reset_index(drop=True) # standardized data
    std_data_labels = pd.DataFrame(data_labels).reset_index(drop=True)  # labels
    std_data = pd.concat([std_data_data, std_data_labels], axis=1)    # standardized data with labels
    
    return std_data_data, std_data_labels, std_data

## Functions for different Dimensionality Reduction Techniques

In [3]:
def pca(data, n_components):
    pca = PCA(n_components = n_components)
    pca.fit(data)
    data_pca = pca.transform(data)
    return pca, data_pca
        
def ica(data, n_components, max_iter=10000000):
    ica = FastICA(n_components = n_components)
    data_ica = ica.fit_transform(data)
    return data_ica
        
def fa(data, n_components):
    fa = FactorAnalysis(n_components = n_components, max_iter=5000)
    data_fa = fa.fit_transform(data)
    return fa, data_fa
        
def svd(data, n_components):
    svd = TruncatedSVD(n_components = n_components)
    data_svd = svd.fit_transform(data)
    return svd, data_svd
        
def mds(data, n_components):
    mds = MDS(n_components = n_components)
    data_mds = mds.fit_transform(data)
    return data_mds

def tsne(data, n_components):
    tsne = TSNE(n_components = n_components)
    data_tsne = tsne.fit_transform(data)
    return data_tsne

def u_map(data, n_components):
    u_map = umap.UMAP(n_components = n_components)
    data_umap = u_map.fit_transform(data)
    return data_umap

def isomap(data, n_components):
    isomap = Isomap(n_components = n_components)
    data_isomap = isomap.fit_transform(data)
    return data_isomap

## Function for calculation of nearest neighbors.
### Returns indices of nearest neighbors.

In [4]:
def nearestneighbors(data, n):
    '''
    n - number of nearest neighbors
    '''
    nbrs = NearestNeighbors(n_neighbors= n+1).fit(data)
    distances, indices = nbrs.kneighbors(data)
    return indices

## Function for calculation of fraction of neighbors that belong to same subtype as the cell. 
### Returns dataframe with 'fraction' column added in original data. 

In [5]:
def fraction (data, index_array, labels, n):
    '''
    index_array --> indices of the neighbors of each point.
    returns dataframe with fraction of neighbors that belong to same subtype as the cell.
    '''
    
    fraction =[]
    for i in range(len(index_array)):
        true_label = labels.iloc[index_array[i][0]].label  # True Label is the first element of index_array
        neighbors = labels.iloc[index_array[i][1:n+1]]  # array of indices of neighbors
        cellwise_fraction_counts = neighbors.label.value_counts(normalize=True)  # fracion of neighbors that belong to same subtype as the cell
        if (true_label in cellwise_fraction_counts.index.tolist()): 
            cellwise_fraction = cellwise_fraction_counts.loc[true_label]
        else:
            cellwise_fraction = 0
        fraction.append(cellwise_fraction)
        fraction_df = pd.DataFrame(data = fraction, columns = ['fraction'])
        data_with_fraction = pd.concat([data, labels, fraction_df], axis=1)
    return data_with_fraction

## Function for calculations of nearest neighbors and fractions for reduced dimensional data.
### Returns dataframe with 'fraction' column added in reduced dimensional data.

In [6]:
def reduced_dim_fraction(data_reduced_dim, labels_reduced_dim, n):
    '''
    input dimensionality reduction data --> eg. data_dim_red = pca(dim_13_data, 2) ----> method(data, n_components)
    returns dataframe (data + fraction)
    
    n - number of nearest neighbors
    
    labels_reduced_dim --> is same as labels of sample data
    '''
    
    data = pd.DataFrame(data_reduced_dim)
    
    # nbrs_reduced_dim = NearestNeighbors(n_neighbors=20).fit(data)
    # distances_reduced_dim, indices_reduced_dim = nbrs_reduced_dim.kneighbors(data)
    
    indices_reduced_dim = nearestneighbors(data, n+1)
    reduced_dim_with_fraction = fraction(data, indices_reduced_dim, labels_reduced_dim, n)
    # reduced_dim_with_fraction = pd.concat([data, labels_reduced_dim, fractions_reduced_dim], axis=1)
    return reduced_dim_with_fraction

## Function for NPE calculation

In [7]:
def NPE(data_fraction, dim_red_fraction, no_of_labels):
    '''
    Empirical Density Function- No. of observations lesser than or equal to the given value divided by total No. of observations.
    
    data_fraction = std_dim_13_fraction
    dim_red_fraction = data_pca_fraction /data_fa_fraction /data_ica_fraction /data_svd_fraction....
    
    '''   
    delta =[]
    for i in range(1, no_of_labels+1):
        labelwise_fractions_original = np.array(data_fraction[data_fraction['label'] == i]['fraction'])
        labelwise_fractions_dim_red = np.array(dim_red_fraction[dim_red_fraction['label'] == i]['fraction'])

        P =[]             # Empirical Density Distribution in original data
        for j in labelwise_fractions_original:
            EDF_original = labelwise_fractions_original[labelwise_fractions_original < j].size / labelwise_fractions_original.size
            P.append(EDF_original)     #EDF - Empirical Density Function
        
        Q = []            # Empirical Density Distribution in dimension-reduced space
        for k in labelwise_fractions_dim_red:
            EDF_dim_red = labelwise_fractions_dim_red[labelwise_fractions_dim_red < k].size / labelwise_fractions_dim_red.size
            Q.append(EDF_dim_red)
            
        if P:
            delta.append(max(np.subtract(P, Q)))   # delta = sup|P(a)-Q(a)|

    return sum(delta)/no_of_labels

## Function for Residual Variance Calculations of Factor Analysis (FA)

In [8]:
def variance_retained_fa(fa_sample):
    variance_explained_without_noise = []
    variance_explained_with_noise = []
    for fa in fa_sample:
        c2 = np.sum(fa.components_ ** 2, axis=1)
        total_variance_ = np.sum(c2)
        pvars_ = 100 * c2 / total_variance_
        pvars_noise_ = 100 * c2 / (total_variance_ + np.sum(fa.noise_variance_))
        variance_explained_without_noise.append(pvars_)
        variance_explained_with_noise.append(pvars_noise_)
    return variance_explained_without_noise, variance_explained_with_noise

# 13-Dimensional Levine Dataset

In [9]:
dim_13 = pd.read_csv('Levine_13dim.txt', sep = "\t", header=0)
dim_13.head()

Unnamed: 0,CD45,CD45RA,CD19,CD11b,CD4,CD8,CD34,CD20,CD33,CD123,CD38,CD90,CD3,label
0,3.138041,1.618552,0.525611,-0.10468,0.811363,1.740672,0.531365,-0.153811,4.29114,2.615224,4.64657,1.849447,1.280036,1.0
1,3.486871,2.247755,0.309583,0.833397,1.550594,1.82903,-0.033018,0.290684,4.672004,1.532877,4.253449,0.422374,0.066806,1.0
2,2.14552,0.796811,-0.116073,0.186249,0.529898,0.834876,0.734445,1.590881,4.473149,0.443484,4.511387,0.124615,-2.066902,1.0
3,4.321869,0.167016,-0.639139,-0.020982,1.814245,-0.094315,1.416341,-0.028213,4.295604,2.546383,5.0041,1.326005,-1.015953,1.0
4,2.633997,0.714859,-0.173889,-0.04241,0.450911,1.587902,0.731608,0.139234,4.1153,1.579856,5.065479,0.632904,-0.703904,1.0


In [10]:
std_dim_13_data, std_dim_13_labels, std_dim_13 = preprocessing(dim_13, 'label')

In [None]:
sample_1 = std_dim_13.sample(n = 10000,replace="False").reset_index(drop=True)
sample_2 = std_dim_13.sample(n = 10000,replace="False").reset_index(drop=True)
sample_3 = std_dim_13.sample(n = 10000,replace="False").reset_index(drop=True)

In [12]:
sample_1_data = sample_1.drop('label', axis = 1)    # Data without Labels
sample_1_labels = pd.DataFrame(sample_1['label'])   # labels

sample_2_data = sample_2.drop('label', axis = 1)    # Data without Labels
sample_2_labels = pd.DataFrame(sample_2['label'])   # labels

sample_3_data = sample_3.drop('label', axis = 1)    # Data without Labels
sample_3_labels = pd.DataFrame(sample_3['label'])   # labels

In [13]:
indices_sample_1 = nearestneighbors(sample_1_data, 20)
indices_sample_2 = nearestneighbors(sample_2_data, 20)
indices_sample_3 = nearestneighbors(sample_3_data, 20)

In [14]:
sample_1_with_fraction = fraction (sample_1_data, indices_sample_1, sample_1_labels, 20)
sample_2_with_fraction = fraction (sample_2_data, indices_sample_2, sample_2_labels, 20)
sample_3_with_fraction = fraction (sample_3_data, indices_sample_3, sample_3_labels, 20)

# Principal Component Analysis (PCA)

In [17]:
start_time_pca = time.time()
pca_sample_1,sample_1_data_pca = pca(sample_1_data, 2)
pca_sample_2,sample_2_data_pca = pca(sample_2_data, 2)
pca_sample_3,sample_3_data_pca = pca(sample_3_data, 2)
print("Computation Time for PCA =", (time.time() - start_time_pca)/3)

Computation Time for PCA = 0.01719244321187337


In [18]:
print('Variance Retained (PCA)= %.2f%%' % ((sum(pca_sample_1.explained_variance_ratio_)+ 
                                    sum(pca_sample_2.explained_variance_ratio_)+ 
                                    sum(pca_sample_3.explained_variance_ratio_))/3 * 100))

Variance Retained (PCA)= 40.61%


In [19]:
sample_1_pca_with_fraction = reduced_dim_fraction(sample_1_data_pca, sample_1_labels, 20)
sample_2_pca_with_fraction = reduced_dim_fraction(sample_2_data_pca, sample_2_labels, 20)
sample_3_pca_with_fraction = reduced_dim_fraction(sample_3_data_pca, sample_3_labels, 20)

In [20]:
NPE_sample_1_pca  = NPE(sample_1_with_fraction, sample_1_pca_with_fraction, 24)
NPE_sample_2_pca  = NPE(sample_2_with_fraction, sample_2_pca_with_fraction, 24)
NPE_sample_3_pca  = NPE(sample_3_with_fraction, sample_3_pca_with_fraction, 24)
print("NPE for PCA =", (NPE_sample_1_pca + NPE_sample_2_pca + NPE_sample_3_pca)/3)

NPE for PCA = 0.626079374768


# Factor Analysis (FA)

In [23]:
start_time_fa = time.time()
fa_sample_1,sample_1_data_fa = fa(sample_1_data, 2)
fa_sample_2,sample_2_data_fa = fa(sample_2_data, 2)
fa_sample_3,sample_3_data_fa = fa(sample_3_data, 2)
print("Computation Time for FA =", (time.time() - start_time_fa)/3)

Computation Time for FA = 3.618231693903605


In [24]:
fa_sample = [fa_sample_1, fa_sample_2, fa_sample_3]
variance_explained_without_noise, variance_explained_with_noise = variance_retained_fa(fa_sample)
print("Variance Explained without Noise =", sum(variance_explained_without_noise)/3)
print("Variance Explained with Noise =", sum(variance_explained_with_noise)/3)

Variance Explained without Noise = [ 48.73550353  51.26449647]
Variance Explained with Noise = [ 16.2531118   17.11463947]


In [25]:
sample_1_fa_with_fraction = reduced_dim_fraction(sample_1_data_fa, sample_1_labels, 20)
sample_2_fa_with_fraction = reduced_dim_fraction(sample_2_data_fa, sample_2_labels, 20)
sample_3_fa_with_fraction = reduced_dim_fraction(sample_3_data_fa, sample_3_labels, 20)

In [26]:
NPE_sample_1_fa  = NPE(sample_1_with_fraction, sample_1_fa_with_fraction, 24)
NPE_sample_2_fa  = NPE(sample_2_with_fraction, sample_2_fa_with_fraction, 24)
NPE_sample_3_fa  = NPE(sample_3_with_fraction, sample_3_fa_with_fraction, 24)
print("NPE_fa =", (NPE_sample_1_fa + NPE_sample_2_fa + NPE_sample_3_fa)/3)

NPE_fa = 0.633400257251


# Independent Component Analysis (ICA)

In [28]:
start_time_ica = time.time()
sample_1_ica = ica(sample_1_data, 2)
sample_2_ica = ica(sample_2_data, 2)
sample_3_ica = ica(sample_3_data, 2)
print("Computation Time for ICA =", (time.time() - start_time_ica)/3)

Computation Time for ICA = 0.059857447942097984


In [29]:
sample_1_ica_with_fraction = reduced_dim_fraction(sample_1_ica, sample_1_labels, 20)
sample_2_ica_with_fraction = reduced_dim_fraction(sample_2_ica, sample_2_labels, 20)
sample_3_ica_with_fraction = reduced_dim_fraction(sample_3_ica, sample_3_labels, 20)

In [30]:
NPE_sample_1_ica  = NPE(sample_1_with_fraction, sample_1_ica_with_fraction, 24)
NPE_sample_2_ica  = NPE(sample_2_with_fraction, sample_2_ica_with_fraction, 24)
NPE_sample_3_ica  = NPE(sample_3_with_fraction, sample_3_ica_with_fraction, 24)
print("NPE for ICA =", (NPE_sample_1_ica + NPE_sample_2_ica + NPE_sample_3_ica)/3)

NPE for ICA = 0.624805565683


# Singular Value Decomposition (SVD)

In [31]:
start_time_svd = time.time()
svd_sample_1,sample_1_data_svd = svd(sample_1_data, 2)
svd_sample_2,sample_2_data_svd = svd(sample_2_data, 2)
svd_sample_3,sample_3_data_svd = svd(sample_3_data, 2)
print("Computation Time for SVD =", (time.time() - start_time_svd)/3)

Computation Time for SVD = 0.019582748413085938


In [32]:
print('Variance Retained (SVD)= %.2f%%' % ((sum(svd_sample_1.explained_variance_ratio_)+ 
                                    sum(svd_sample_2.explained_variance_ratio_)+ 
                                    sum(svd_sample_3.explained_variance_ratio_))/3 * 100))

Variance Retained (SVD)= 40.61%


In [33]:
sample_1_svd_with_fraction = reduced_dim_fraction(sample_1_data_svd, sample_1_labels, 20)
sample_2_svd_with_fraction = reduced_dim_fraction(sample_2_data_svd, sample_2_labels, 20)
sample_3_svd_with_fraction = reduced_dim_fraction(sample_3_data_svd, sample_3_labels, 20)

In [34]:
NPE_sample_1_svd  = NPE(sample_1_with_fraction, sample_1_svd_with_fraction, 24)
NPE_sample_2_svd  = NPE(sample_2_with_fraction, sample_2_svd_with_fraction, 24)
NPE_sample_3_svd  = NPE(sample_3_with_fraction, sample_3_svd_with_fraction, 24)
print("NPE for SVD =", (NPE_sample_1_svd + NPE_sample_2_svd + NPE_sample_3_svd)/3)

NPE for SVD = 0.626079374768


# t-distributed Stochastic Neighbor Embedding (t-SNE)

In [35]:
start_time_tsne = time.time()
sample_1_tsne = tsne(sample_1_data, 2)
sample_2_tsne = tsne(sample_2_data, 2)
sample_3_tsne = tsne(sample_3_data, 2)
print("Computation Time for t-SNE =", (time.time() - start_time_tsne)/3)

Computation Time for t-SNE = 235.31963666280112


In [36]:
sample_1_tsne_with_fraction = reduced_dim_fraction(sample_1_tsne, sample_1_labels, 20)
sample_2_tsne_with_fraction = reduced_dim_fraction(sample_2_tsne, sample_2_labels, 20)
sample_3_tsne_with_fraction = reduced_dim_fraction(sample_3_tsne, sample_3_labels, 20)

In [37]:
NPE_sample_1_tsne  = NPE(sample_1_with_fraction, sample_1_tsne_with_fraction, 24)
NPE_sample_2_tsne  = NPE(sample_2_with_fraction, sample_2_tsne_with_fraction, 24)
NPE_sample_3_tsne  = NPE(sample_3_with_fraction, sample_3_tsne_with_fraction, 24)
print("NPE for t-SNE =", (NPE_sample_1_tsne + NPE_sample_2_tsne + NPE_sample_3_tsne)/3)

NPE for t-SNE = 0.422828474498


# Uniform Manifold Approximation and Projection (UMAP)

In [None]:
import sys
!{sys.executable} -m pip install umap-learn

In [None]:
start_time_umap = time.time()
sample_1_umap = u_map(sample_1_data, 2)
sample_2_umap = u_map(sample_2_data, 2)
sample_3_umap = u_map(sample_3_data, 2)
print("Computation Time for UMAP =", (time.time() - start_time_umap)/3)

In [44]:
sample_1_umap_with_fraction = reduced_dim_fraction(sample_1_umap, sample_1_labels, 20)
sample_2_umap_with_fraction = reduced_dim_fraction(sample_2_umap, sample_2_labels, 20)
sample_3_umap_with_fraction = reduced_dim_fraction(sample_3_umap, sample_3_labels, 20)

In [45]:
NPE_sample_1_umap  = NPE(sample_1_with_fraction, sample_1_umap_with_fraction, 24)
NPE_sample_2_umap  = NPE(sample_2_with_fraction, sample_2_umap_with_fraction, 24)
NPE_sample_3_umap  = NPE(sample_3_with_fraction, sample_3_umap_with_fraction, 24)
print("NPE for UMAP =", (NPE_sample_1_umap + NPE_sample_2_umap + NPE_sample_3_umap)/3)

NPE for UMAP = 0.4767775696408303


# Isomap

In [38]:
start_time_isomap = time.time()
sample_1_isomap = isomap(sample_1_data, 2)
sample_2_isomap = isomap(sample_2_data, 2)
sample_3_isomap = isomap(sample_3_data, 2)
print("Computation Time for Isomap =", (time.time() - start_time_isomap)/3)

Computation Time for Isomap = 59.891643603642784


In [39]:
sample_1_isomap_with_fraction = reduced_dim_fraction(sample_1_isomap, sample_1_labels, 20)
sample_2_isomap_with_fraction = reduced_dim_fraction(sample_2_isomap, sample_2_labels, 20)
sample_3_isomap_with_fraction = reduced_dim_fraction(sample_3_isomap, sample_3_labels, 20)

In [40]:
NPE_sample_1_isomap  = NPE(sample_1_with_fraction, sample_1_isomap_with_fraction, 24)
NPE_sample_2_isomap  = NPE(sample_2_with_fraction, sample_2_isomap_with_fraction, 24)
NPE_sample_3_isomap  = NPE(sample_3_with_fraction, sample_3_isomap_with_fraction, 24)
print("NPE for Isomap =", (NPE_sample_1_isomap + NPE_sample_2_isomap + NPE_sample_3_isomap)/3)

NPE for Isomap = 0.596220640042


# MultiDimensional Scaling (MDS)

In [41]:
start_time_mds = time.time()
sample_1_mds = mds(sample_1_data, 2)
sample_2_mds = mds(sample_2_data, 2)
sample_3_mds = mds(sample_3_data, 2)
print("Computation Time for MSD =", (time.time() - start_time_mds)/3)

Computation Time for MSD = 2984.7267775535583


In [42]:
sample_1_mds_with_fraction = reduced_dim_fraction(sample_1_mds, sample_1_labels, 20)
sample_2_mds_with_fraction = reduced_dim_fraction(sample_2_mds, sample_2_labels, 20)
sample_3_mds_with_fraction = reduced_dim_fraction(sample_3_mds, sample_3_labels, 20)

In [43]:
NPE_sample_1_mds  = NPE(sample_1_with_fraction, sample_1_mds_with_fraction, 24)
NPE_sample_2_mds  = NPE(sample_2_with_fraction, sample_2_mds_with_fraction, 24)
NPE_sample_3_mds  = NPE(sample_3_with_fraction, sample_3_mds_with_fraction, 24)
print("NPE for MDS =", (NPE_sample_1_mds + NPE_sample_2_mds + NPE_sample_3_mds)/3)

NPE for MDS = 0.600846387766


# 32-Dimensional Levine Dataset

In [9]:
dim_32 = pd.read_csv('Levine_32dim.txt', sep = "\t", header=0)
dim_32.head()

Unnamed: 0,Time,Cell_length,DNA1,DNA2,CD45RA,CD133,CD19,CD22,CD11b,CD4,...,CD117,CD49d,HLA-DR,CD64,CD41,Viability,file_number,event_number,label,individual
0,1024,22,4.391057,4.617262,0.162691,-0.029585,-0.006696,0.066388,-0.009184,0.363602,...,0.05305,0.853505,1.66448,-0.005376,-0.001961,0.648429,3.627711,307.0,1.0,1.0
1,1024,35,4.340481,4.816692,0.701348,-0.03828,-0.016654,0.074409,0.808031,-0.035424,...,0.08966,0.197818,0.491592,0.144814,0.868014,0.561384,3.627711,545.0,1.0,1.0
2,1024,32,3.838727,4.386369,0.603568,-0.032216,0.073855,-0.042977,-0.001881,-0.008781,...,0.046222,2.58667,1.308337,-0.010961,-0.010413,0.643337,3.627711,1024.0,1.0,1.0
3,1024,29,4.255805,4.830048,0.433747,-0.027611,-0.017661,-0.044072,0.733698,-0.019066,...,0.06647,1.338669,0.140523,-0.013449,-0.026039,-0.026523,3.627711,1024.0,1.0,1.0
4,1024,25,3.976909,4.506433,-0.008809,-0.030297,0.080423,0.495791,1.107627,0.552746,...,-0.006223,0.180924,0.197332,0.076167,-0.040488,0.283287,3.627711,1024.0,1.0,1.0


In [10]:
std_dim_32_data, std_dim_32_labels, std_dim_32 = preprocessing(dim_32, ['Time', 'Cell_length', 'DNA1', 'DNA2', 'Viability', 'file_number', 'event_number', 'individual', 'label'])

In [11]:
sample_4 = std_dim_32.sample(n = 10000,replace="False").reset_index(drop=True)
sample_5 = std_dim_32.sample(n = 10000,replace="False").reset_index(drop=True)
sample_6 = std_dim_32.sample(n = 10000,replace="False").reset_index(drop=True)

In [12]:
sample_4_data = sample_4.drop('label', axis = 1)    # Data without Labels
sample_4_labels = pd.DataFrame(sample_4['label'])   # labels

sample_5_data = sample_5.drop('label', axis = 1)    # Data without Labels
sample_5_labels = pd.DataFrame(sample_5['label'])   # labels

sample_6_data = sample_6.drop('label', axis = 1)    # Data without Labels
sample_6_labels = pd.DataFrame(sample_6['label'])   # labels

In [13]:
indices_sample_4 = nearestneighbors(sample_4_data, 20)
indices_sample_5 = nearestneighbors(sample_5_data, 20)
indices_sample_6 = nearestneighbors(sample_6_data, 20)

In [14]:
sample_4_with_fraction = fraction (sample_4_data, indices_sample_4, sample_4_labels, 20)
sample_5_with_fraction = fraction (sample_5_data, indices_sample_5, sample_5_labels, 20)
sample_6_with_fraction = fraction (sample_6_data, indices_sample_6, sample_6_labels, 20)

# Principal Component Analysis (PCA)

In [16]:
start_time_pca = time.time()
pca_sample_4,sample_4_data_pca = pca(sample_4_data, 2)
pca_sample_5,sample_5_data_pca = pca(sample_5_data, 2)
pca_sample_6,sample_6_data_pca = pca(sample_6_data, 2)
print("Computation Time for PCA =", (time.time() - start_time_pca)/3)

Computation Time for PCA = 0.025269826253255207


In [17]:
print('Variance Retained (PCA)= %.2f%%' % ((sum(pca_sample_4.explained_variance_ratio_)+ 
                                    sum(pca_sample_5.explained_variance_ratio_)+ 
                                    sum(pca_sample_6.explained_variance_ratio_))/3 * 100))

Variance Retained (PCA)= 31.03%


In [18]:
sample_4_pca_with_fraction = reduced_dim_fraction(sample_4_data_pca, sample_4_labels, 20)
sample_5_pca_with_fraction = reduced_dim_fraction(sample_5_data_pca, sample_5_labels, 20)
sample_6_pca_with_fraction = reduced_dim_fraction(sample_6_data_pca, sample_6_labels, 20)

In [19]:
NPE_sample_4_pca  = NPE(sample_4_with_fraction, sample_4_pca_with_fraction, 14)
NPE_sample_5_pca  = NPE(sample_5_with_fraction, sample_5_pca_with_fraction, 14)
NPE_sample_6_pca  = NPE(sample_6_with_fraction, sample_6_pca_with_fraction, 14)
print("NPE for PCA =", (NPE_sample_4_pca + NPE_sample_5_pca + NPE_sample_6_pca)/3)

NPE for PCA = 0.535617880499


# Factor Analysis (FA)

In [20]:
start_time_fa = time.time()
fa_sample_4,sample_4_data_fa = fa(sample_4_data, 2)
fa_sample_5,sample_5_data_fa = fa(sample_5_data, 2)
fa_sample_6,sample_6_data_fa = fa(sample_6_data, 2)
print("Computation Time for FA =", (time.time() - start_time_fa)/3)

Computation Time for FA = 0.2081762154897054


In [21]:
fa_sample = [fa_sample_4, fa_sample_5, fa_sample_6]
variance_explained_without_noise, variance_explained_with_noise = variance_retained_fa(fa_sample)
print("Variance Explained without Noise =", sum(variance_explained_without_noise)/3)
print("Variance Explained with Noise =", sum(variance_explained_with_noise)/3)

Variance Explained without Noise = [ 57.25960526  42.74039474]
Variance Explained with Noise = [ 15.85829715  11.83728431]


In [22]:
sample_4_fa_with_fraction = reduced_dim_fraction(sample_4_data_fa, sample_4_labels, 20)
sample_5_fa_with_fraction = reduced_dim_fraction(sample_5_data_fa, sample_5_labels, 20)
sample_6_fa_with_fraction = reduced_dim_fraction(sample_6_data_fa, sample_6_labels, 20)

In [23]:
NPE_sample_4_fa  = NPE(sample_4_with_fraction, sample_4_fa_with_fraction, 14)
NPE_sample_5_fa  = NPE(sample_5_with_fraction, sample_5_fa_with_fraction, 14)
NPE_sample_6_fa  = NPE(sample_6_with_fraction, sample_6_fa_with_fraction, 14)
print("NPE_fa =", (NPE_sample_4_fa + NPE_sample_5_fa + NPE_sample_6_fa)/3)

NPE_fa = 0.525437124357


# Independent Component Analysis (ICA)

In [24]:
start_time_ica = time.time()
sample_4_ica = ica(sample_4_data, 2)
sample_5_ica = ica(sample_5_data, 2)
sample_6_ica = ica(sample_6_data, 2)
print("Computation Time for ICA =", (time.time() - start_time_ica)/3)

Computation Time for ICA = 0.05396040280659994


In [25]:
sample_4_ica_with_fraction = reduced_dim_fraction(sample_4_ica, sample_4_labels, 20)
sample_5_ica_with_fraction = reduced_dim_fraction(sample_5_ica, sample_5_labels, 20)
sample_6_ica_with_fraction = reduced_dim_fraction(sample_6_ica, sample_6_labels, 20)

In [26]:
NPE_sample_4_ica  = NPE(sample_4_with_fraction, sample_4_ica_with_fraction, 14)
NPE_sample_5_ica  = NPE(sample_5_with_fraction, sample_5_ica_with_fraction, 14)
NPE_sample_6_ica  = NPE(sample_6_with_fraction, sample_6_ica_with_fraction, 14)
print("NPE for ICA =", (NPE_sample_4_ica + NPE_sample_5_ica + NPE_sample_6_ica)/3)

NPE for ICA = 0.534539310622


# Singular Value Decomposition (SVD)

In [27]:
start_time_svd = time.time()
svd_sample_4,sample_4_data_svd = svd(sample_4_data, 2)
svd_sample_5,sample_5_data_svd = svd(sample_5_data, 2)
svd_sample_6,sample_6_data_svd = svd(sample_6_data, 2)
print("Computation Time for SVD =", (time.time() - start_time_svd)/3)

Computation Time for SVD = 0.021091461181640625


In [28]:
print('Variance Retained (SVD)= %.2f%%' % ((sum(svd_sample_4.explained_variance_ratio_)+ 
                                    sum(svd_sample_5.explained_variance_ratio_)+ 
                                    sum(svd_sample_6.explained_variance_ratio_))/3 * 100))

Variance Retained (SVD)= 31.03%


In [29]:
sample_4_svd_with_fraction = reduced_dim_fraction(sample_4_data_svd, sample_4_labels, 20)
sample_5_svd_with_fraction = reduced_dim_fraction(sample_5_data_svd, sample_5_labels, 20)
sample_6_svd_with_fraction = reduced_dim_fraction(sample_6_data_svd, sample_6_labels, 20)

In [30]:
NPE_sample_4_svd  = NPE(sample_4_with_fraction, sample_4_svd_with_fraction, 14)
NPE_sample_5_svd  = NPE(sample_5_with_fraction, sample_5_svd_with_fraction, 14)
NPE_sample_6_svd  = NPE(sample_6_with_fraction, sample_6_svd_with_fraction, 14)
print("NPE for SVD =", (NPE_sample_4_svd + NPE_sample_5_svd + NPE_sample_6_svd)/3)

NPE for SVD = 0.535543242494


# t-distributed Stochastic Neighbor Embedding (t-SNE)

In [31]:
start_time_tsne = time.time()
sample_4_tsne = tsne(sample_4_data, 2)
sample_5_tsne = tsne(sample_5_data, 2)
sample_6_tsne = tsne(sample_6_data, 2)
print("Computation Time for t-SNE =", (time.time() - start_time_tsne)/3)

Computation Time for t-SNE = 243.33090289433798


In [32]:
sample_4_tsne_with_fraction = reduced_dim_fraction(sample_4_tsne, sample_4_labels, 20)
sample_5_tsne_with_fraction = reduced_dim_fraction(sample_5_tsne, sample_5_labels, 20)
sample_6_tsne_with_fraction = reduced_dim_fraction(sample_6_tsne, sample_6_labels, 20)

In [33]:
NPE_sample_4_tsne  = NPE(sample_4_with_fraction, sample_4_tsne_with_fraction, 14)
NPE_sample_5_tsne  = NPE(sample_5_with_fraction, sample_5_tsne_with_fraction, 14)
NPE_sample_6_tsne  = NPE(sample_6_with_fraction, sample_6_tsne_with_fraction, 14)
print("NPE for t-SNE =", (NPE_sample_4_tsne + NPE_sample_5_tsne + NPE_sample_6_tsne)/3)

NPE for t-SNE = 0.393244774183


# Uniform Manifold Approximation and Projection (UMAP)

In [70]:
start_time_umap = time.time()
sample_4_umap = u_map(sample_4_data, 2)
sample_5_umap = u_map(sample_5_data, 2)
sample_6_umap = u_map(sample_6_data, 2)
print("Computation Time for UMAP =", (time.time() - start_time_umap)/3)

Computation Time for UMAP = 18.06250063578288


In [71]:
sample_4_umap_with_fraction = reduced_dim_fraction(sample_4_umap, sample_4_labels, 20)
sample_5_umap_with_fraction = reduced_dim_fraction(sample_5_umap, sample_5_labels, 20)
sample_6_umap_with_fraction = reduced_dim_fraction(sample_6_umap, sample_6_labels, 20)

In [72]:
NPE_sample_4_umap  = NPE(sample_4_with_fraction, sample_4_umap_with_fraction, 14)
NPE_sample_5_umap  = NPE(sample_5_with_fraction, sample_5_umap_with_fraction, 14)
NPE_sample_6_umap  = NPE(sample_6_with_fraction, sample_6_umap_with_fraction, 14)
print("NPE for UMAP =", (NPE_sample_4_umap + NPE_sample_5_umap + NPE_sample_6_umap)/3)

NPE for UMAP = 0.5494555777733211


# Isomap

In [34]:
start_time_isomap = time.time()
sample_4_isomap = isomap(sample_4_data, 2)
sample_5_isomap = isomap(sample_5_data, 2)
sample_6_isomap = isomap(sample_6_data, 2)
print("Computation Time for Isomap =", (time.time() - start_time_isomap)/3)

Computation Time for Isomap = 65.50609143575032


In [35]:
sample_4_isomap_with_fraction = reduced_dim_fraction(sample_4_isomap, sample_4_labels, 20)
sample_5_isomap_with_fraction = reduced_dim_fraction(sample_5_isomap, sample_5_labels, 20)
sample_6_isomap_with_fraction = reduced_dim_fraction(sample_6_isomap, sample_6_labels, 20)

In [36]:
NPE_sample_4_isomap  = NPE(sample_4_with_fraction, sample_4_isomap_with_fraction, 14)
NPE_sample_5_isomap  = NPE(sample_5_with_fraction, sample_5_isomap_with_fraction, 14)
NPE_sample_6_isomap  = NPE(sample_6_with_fraction, sample_6_isomap_with_fraction, 14)
print("NPE for Isomap =", (NPE_sample_4_isomap + NPE_sample_5_isomap + NPE_sample_6_isomap)/3)

NPE for Isomap = 0.46377900475


# MultiDimensional Scaling (MDS)

In [37]:
start_time_mds = time.time()
sample_4_mds = mds(sample_4_data, 2)
sample_5_mds = mds(sample_5_data, 2)
sample_6_mds = mds(sample_6_data, 2)
print("Computation Time for MDS =", (time.time() - start_time_mds)/3)

Computation Time for MDS = 2992.5078539053598


In [38]:
sample_4_mds_with_fraction = reduced_dim_fraction(sample_4_mds, sample_4_labels, 20)
sample_5_mds_with_fraction = reduced_dim_fraction(sample_5_mds, sample_5_labels, 20)
sample_6_mds_with_fraction = reduced_dim_fraction(sample_6_mds, sample_6_labels, 20)

In [39]:
NPE_sample_4_mds  = NPE(sample_4_with_fraction, sample_4_mds_with_fraction, 14)
NPE_sample_5_mds  = NPE(sample_5_with_fraction, sample_5_mds_with_fraction, 14)
NPE_sample_6_mds  = NPE(sample_6_with_fraction, sample_6_mds_with_fraction, 14)
print("NPE for MDS =", (NPE_sample_4_mds + NPE_sample_5_mds + NPE_sample_6_mds)/3)

NPE for MDS = 0.529860442356
