In [39]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, ColumnDataSource, save, output_file
from bokeh.models import LabelSet, Label
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
output_notebook()

In [2]:
# Three labelled observations described by two dimensions, written one row
# per observation: A and B lie close together, C lies far from both.
row_labels = ['A', 'B', 'C']
data = pd.DataFrame(
    [[2, 5], [1, 6], [-6, -6]],
    index=row_labels,
    columns=['D1', 'D2'],
)
print(data)

   D1  D2
A   2   5
B   1   6
C  -6  -6


In [3]:
# Carry each observation's name as an extra column so the text labels can be
# driven from a single column data source.
plt_data = data.assign(Observation=row_labels)
src = ColumnDataSource(plt_data)

p = figure(
    title='Data Without Additional Dimensions',
    x_axis_label='Dimension 1',
    y_axis_label='Dimension 2',
    x_range=(-7, 3),
    y_range=(-7, 7),
)
p.scatter(data['D1'], data['D2'], size=15, color='#cc0000')

# Nudge the text away from the marker so the two do not overlap.
labels = LabelSet(
    x='D1',
    y='D2',
    text='Observation',
    x_offset=10,
    y_offset=-5,
    source=src,
)
p.add_layout(labels)
show(p)

In [4]:
def dist(x, y):
    """
    Distance between two vectors.

    Args:
        x: First vector (array-like supporting subtraction).
        y: Second vector, same shape as `x`.
    Returns:
        Norm of `x - y` as computed by `np.linalg.norm` with its
        default settings (Euclidean length for 1-D input).
    """
    return np.linalg.norm(x - y)

In [41]:
def dist_ratio(points, n_sim, n_dim, loc, stdev, seed=None):
    """
    Simulate distance ratio between near and far points.

    For every simulation, `n_dim` normally distributed coordinates are
    appended to each of the three points. The ratio
    distance(A, B) / distance(A, C) is then evaluated cumulatively:
    row k uses the first k+1 dimensions (the columns of `points` first,
    in their given order, followed by the simulated ones).

    Args:
        points: (dataframe) Pandas dataframe containing
            three points with `index=['A','B','C']`. Any number of
            numeric columns is accepted.
        n_sim: (int) Number of simulations to run.
        n_dim: (int) Number of dimensions to add.
        loc: (float) Location parameter of random
            normal distribution which is used to
            generate dimensions.
        stdev: (float) Standard deviation parameter of
            random normal distribution which is used to
            generate dimensions.
        seed: (optional) Anything accepted by
            `np.random.default_rng` (int, SeedSequence, Generator).
            When None (default) the draws come from NumPy's global
            random state, so existing callers and `np.random.seed`
            keep working unchanged.
    Returns:
        Pandas data frame of distance ratio simulations with one
        column per simulation (`sim1`, `sim2`, ...) and one row per
        cumulative dimension count.
    Raises:
        KeyError: if `points` lacks one of the rows 'A', 'B', 'C'.
    """
    # Both the np.random module and a Generator expose .normal(loc, scale, size).
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Fixed coordinates of A, B, C as a (3, n_base) float array.
    base = np.asarray(points.loc[['A', 'B', 'C']], dtype=float)
    n_total = base.shape[1] + n_dim

    # Preallocate instead of growing a DataFrame column by column.
    ratios = np.empty((n_total, n_sim))

    for sim in range(n_sim):
        # Draw A, then B, then C - the same order as three separate calls.
        extra = np.vstack([sampler.normal(loc, stdev, n_dim) for _ in range(3)])
        a, b, c = np.hstack([base, extra])

        # Running Euclidean distance: sqrt of the cumulative sum of squared
        # coordinate differences gives the distance over the first k dims.
        d_ab = np.sqrt(np.cumsum((a - b) ** 2))
        d_ac = np.sqrt(np.cumsum((a - c) ** 2))
        ratios[:, sim] = d_ab / d_ac

    sim_names = ['sim' + str(k) for k in range(1, n_sim + 1)]
    return pd.DataFrame(ratios, columns=sim_names)

In [42]:
# dist_ratio draws from NumPy's global random state; seed it so that a fresh
# Restart & Run All reproduces the same simulations and figures.
np.random.seed(42)
dist_ratio_sim = dist_ratio(data, n_sim=500, n_dim=58, loc=0, stdev=10)

In [7]:
# Sanity check: one row per cumulative dimension, one column per simulation.
dist_ratio_sim.shape

(60, 500)

In [43]:
# Histogram of the distance ratio once every dimension is included
# (the last row of the simulation frame holds the full-dimensional ratio).
hist, edges = np.histogram(dist_ratio_sim.iloc[-1,:], density=False, bins=50)
# The x axis shows the distance ratio itself, so label it as such; labels are
# passed to figure() like the other plots in this notebook.
p = figure(title = 'Distribution of Distance Ratio',
           x_axis_label = 'Distance Ratio',
           y_axis_label = 'Frequency',
           x_range = (0.5,1.5))
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color = "#cc0000", line_color="#ffffff")
show(p)

In [24]:
def dist_ratio_plt(obj, annotation=None, n_lines=200):
    """
    Plot distance ratio by dimensions

    Draws a random subset of the simulations as faint black lines and a
    red reference line at a ratio of 1.

    Args:
        obj: Pandas data frame generated by dist_ratio()
            (one column per simulation, one row per dimension count).
        annotation: Bokeh Label annotation to apply to plot.
        n_lines: (int) Maximum number of simulations to draw. When
            `obj` has more columns, `n_lines` of them are sampled
            without replacement; otherwise every column is drawn.
    Returns:
        Bokeh plot object.
    """
    p = figure(title = 'Distance Ratio as Dimensionality is Increased',
               x_axis_label = 'Dimensions',
               y_axis_label = 'Distance Ratio',
               y_range = (0,2))

    n_rows, n_cols = obj.shape

    # Sampling without replacement raises ValueError when more items are
    # requested than exist, so cap the request at the number of columns.
    n_draw = max(0, min(n_lines, n_cols))
    picked = np.random.choice(n_cols, n_draw, replace=False) if n_draw else []

    # Dimension counts 1..n_rows on the x axis.
    x_index = np.arange(1, n_rows + 1)

    for pos in picked:
        # Positional lookup, so the plot does not depend on column names.
        p.line(x_index,
               obj.iloc[:, pos],
               color = '#000000',
               line_alpha = 0.1)

    # Reference line at ratio 1, with one explicit y value per x value.
    p.line(x_index, np.ones(n_rows), color='#cc0000', line_width = 3)

    if annotation is not None:
        p.add_layout(annotation)

    return p
# Plot the simulated distance ratios held in dist_ratio_sim and display the figure.
p = dist_ratio_plt(dist_ratio_sim)
show(p)

In [11]:
# Repeat the simulation at four noise levels for the added dimensions;
# only the standard deviation differs between the runs.
dist_ratio_sim_0, dist_ratio_sim_1, dist_ratio_sim_2, dist_ratio_sim_3 = [
    dist_ratio(data, n_sim=500, n_dim=58, loc=0, stdev=sd)
    for sd in (0.5, 1, 3, 5)
]

In [37]:
# One panel per noise level, each captioned with the distribution it used.
p0, p1, p2, p3 = [
    dist_ratio_plt(sim, annotation=Label(x=1, y=1.8, text=caption))
    for sim, caption in (
        (dist_ratio_sim_0, 'mean = 0\n sd = 0.5'),
        (dist_ratio_sim_1, 'mean = 0\n sd = 1'),
        (dist_ratio_sim_2, 'mean = 0\n sd = 3'),
        (dist_ratio_sim_3, 'mean = 0\n sd = 5'),
    )
]

# Lay the four panels out side by side in a single row.
# NOTE(review): newer Bokeh releases appear to use width/height instead of
# plot_width/plot_height - confirm against the installed version.
p_grid = gridplot([p0, p1, p2, p3], ncols=4, plot_width=500, plot_height=500)
show(p_grid)

In [40]:
# Write the grid of panels to an HTML file under figs/.
# NOTE(review): the recorded output below names figs/p_grid.html rather than
# this path, so it looks stale - re-run the cell to confirm.
output_file('figs/hds_3.html')
save(p_grid)

'/Users/iali/Desktop/Personal/website/blog-notebooks/high-dimensional-space/figs/p_grid.html'