In [1]:
import numpy as np
import pandas as pd
from bokeh.io import output_notebook
from bokeh.models import LabelSet
from bokeh.plotting import figure, show, save, output_file, ColumnDataSource
from bokeh.resources import CDN
output_notebook()

In [2]:
# Three labelled observations in two dimensions, one row per observation.
row_labels = list('ABC')
data = pd.DataFrame(
    [[2, 5],
     [1, 6],
     [-6, -6]],
    index=row_labels,
    columns=['D1', 'D2'],
)
print(data)

   D1  D2
A   2   5
B   1   6
C  -6  -6


In [3]:
# Carry each observation's label as an extra column so that LabelSet can read
# the label text from the same source as the coordinates.
plt_data = data.assign(Observation=row_labels)
src = ColumnDataSource(plt_data)

p = figure(
    title='Data Without Additional Dimensions',
    x_axis_label='Dimension 1',
    y_axis_label='Dimension 2',
    x_range=(-7, 3),
    y_range=(-7, 7),
)
p.scatter(data['D1'], data['D2'], size=15, color='#cc0000')

# Offset the text slightly so it does not sit on top of its marker.
labels = LabelSet(
    x='D1',
    y='D2',
    text='Observation',
    x_offset=10,
    y_offset=-5,
    source=src,
)
p.add_layout(labels)
show(p)

In [4]:
# Write the figure straight to its HTML file. Passing the filename to save()
# avoids output_file(), which leaves file output switched on for the rest of
# the session so that the next show() would also rewrite this file with
# whatever figure is being shown. resources/title mirror output_file's defaults.
save(p, filename='figs/data.html', resources=CDN, title='Bokeh Plot')

'/Users/iali/Desktop/Personal/website/blog-notebooks/high-dimensional-space/figs/data.html'

In [5]:
def dist(x, y):
    """Return the norm of ``x - y`` as computed by ``np.linalg.norm``.

    With the default arguments used here this is the Euclidean distance
    when ``x`` and ``y`` are one-dimensional.
    """
    difference = x - y
    return np.linalg.norm(difference)

In [6]:
def dist_ratio(n_sim, n_dim, loc, stdev, base=None, seed=None):
    """Simulate the A-B / A-C distance ratio as noise dimensions are added.

    For every simulation, ``n_dim`` columns of normally distributed noise are
    appended to the base observations A, B and C. The Euclidean distances
    A-B and A-C are computed over the first 1, 2, ..., k columns and their
    ratio is recorded for each prefix length.

    Parameters
    ----------
    n_sim : int
        Number of independent simulations (one result column each).
    n_dim : int
        Number of noise dimensions appended to the base observations.
    loc, stdev : float
        Mean and standard deviation of the normal noise.
    base : pandas.DataFrame, optional
        Starting observations; must contain rows labelled 'A', 'B' and 'C'.
        Defaults to the notebook-level ``data`` frame.
    seed : int, optional
        When given, noise is drawn from ``np.random.RandomState(seed)``.
        When omitted, the global ``np.random`` state is used, so a prior
        ``np.random.seed(...)`` call still controls the draws.

    Returns
    -------
    pandas.DataFrame
        Shape ``(base.shape[1] + n_dim, n_sim)`` with columns ``sim1``,
        ``sim2``, ...; row ``i`` holds the ratio computed from the first
        ``i + 1`` dimensions.

    Raises
    ------
    ValueError
        If ``n_sim`` or ``n_dim`` is negative.
    """
    if n_sim < 0 or n_dim < 0:
        raise ValueError('n_sim and n_dim must be non-negative')
    if base is None:
        base = data  # fall back to the notebook-level observations

    # Rows in a fixed A, B, C order, as plain floats.
    base_points = np.asarray(base.loc[['A', 'B', 'C']], dtype=float)

    rng = np.random if seed is None else np.random.RandomState(seed)
    # Axis order (simulation, observation, dimension) keeps the draw order of
    # generating A, then B, then C for each simulation in turn.
    noise = rng.normal(loc, stdev, size=(n_sim, 3, n_dim))

    points = np.concatenate(
        [np.broadcast_to(base_points, (n_sim,) + base_points.shape), noise],
        axis=2,
    )

    # A running sum of squared differences gives the squared distance over
    # every prefix of dimensions in one pass, instead of recomputing a full
    # norm for each prefix length.
    sq_ab = np.cumsum((points[:, 0, :] - points[:, 1, :]) ** 2, axis=1)
    sq_ac = np.cumsum((points[:, 0, :] - points[:, 2, :]) ** 2, axis=1)
    ratios = np.sqrt(sq_ab) / np.sqrt(sq_ac)

    sim_names = ['sim' + str(k) for k in range(1, n_sim + 1)]
    # ratios is (simulation, prefix length); callers expect one column per simulation.
    return pd.DataFrame(ratios.T, columns=sim_names)

In [7]:
# Seed NumPy's global generator first: dist_ratio draws its noise from
# np.random, so without a seed the results change on every kernel restart.
np.random.seed(42)
dist_ratio_sim = dist_ratio(n_sim=500, n_dim=58, loc=0, stdev=10)

In [8]:
# (rows, columns) of the simulation results.
dist_ratio_sim.shape

(60, 500)

In [9]:
# Histogram of the last row of dist_ratio_sim: one distance ratio per simulation.
hist, edges = np.histogram(dist_ratio_sim.iloc[-1, :], density=False, bins=50)
p = figure(title = 'Distribution of Distance Ratio', x_range = (0.5,1.5))
# One bar per bin: consecutive edges give each bar's left and right side.
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color = "#cc0000", line_color="#ffffff")
# The x axis shows the distance ratio, matching the title.
p.xaxis.axis_label = 'Distance Ratio'
p.yaxis.axis_label = 'Frequency'
show(p)

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [10]:
# Write the figure straight to its HTML file. Passing the filename to save()
# avoids output_file(), which leaves file output switched on for the rest of
# the session so that the next show() would also rewrite this file with
# whatever figure is being shown. resources/title mirror output_file's defaults.
save(p, filename='figs/distribution_distance_ratio.html', resources=CDN, title='Bokeh Plot')

'/Users/iali/Desktop/Personal/website/blog-notebooks/high-dimensional-space/figs/distribution_distance_ratio.html'

In [19]:
# One faint line per sampled simulation: the distance ratio after each
# additional dimension is included.
p = figure(title = 'Distance Ratio as Dimensionality is Increased',
           x_axis_label = 'Dimensions',
           y_axis_label = 'Distance Ratio')

# Sample at most 200 simulations. Sampling without replacement raises
# ValueError when asked for more items than exist, so cap the sample size at
# the number of simulations available.
sim_sub = np.random.choice(np.arange(1,dist_ratio_sim.shape[1]+1),
                           min(200, dist_ratio_sim.shape[1]),
                           replace=False)
# Dimension counts 1..n, one per row of dist_ratio_sim.
x_index = np.arange(1, dist_ratio_sim.shape[0] + 1)
for i in sim_sub:
    p.line(x_index,
           dist_ratio_sim['sim' + str(i)],
           color = '#000000',
           line_alpha = 0.1)
# Reference line at ratio == 1, given as an explicit array with one y value
# per x value rather than a bare scalar.
p.line(x_index, np.ones(len(x_index)), color='#cc0000', line_width = 3)
show(p)

In [13]:
# Write the figure straight to its HTML file. Passing the filename to save()
# avoids output_file(), which leaves file output switched on for the rest of
# the session so that any later show() would also rewrite this file with
# whatever figure is being shown. resources/title mirror output_file's defaults.
save(p, filename='figs/distance_ratio_increasing_dimensions.html', resources=CDN, title='Bokeh Plot')

'/Users/iali/Desktop/Personal/website/blog-notebooks/high-dimensional-space/figs/distance_ratio_increasing_dimensions.html'