In [1]:

import numpy as np
import plotly
import plotly.graph_objs as go
import torch

import torch.nn as nn

from plotly.graph_objs import *

In [2]:
from scipy.special import iv

In [3]:
# Load the tensor stored at this path (presumably the DistilBERT embedding matrix, per the
# filename — confirm). The path is relative to the notebook's working directory.
# NOTE(review): torch.load may unpickle arbitrary objects; only load files from a trusted source.
embmat = torch.load('../data/distilbert_embedding_matrix.pt')

In [4]:
embmat.norm(dim=-1).std()

tensor(0.2599)

In [5]:
def C(kappa, p):
  """Evaluate kappa**(p/2 - 1) / ((2*pi)**(p/2) * I_{p/2 - 1}(kappa)).

  ``I`` is the modified Bessel function of the first kind (``scipy.special.iv``).
  The other ``vmf*`` functions in this notebook use the result as their
  normalisation constant.

  Args:
    kappa: concentration value(s).
    p: dimensionality; sets both exponents and the Bessel order.

  Returns:
    The quotient above, computed elementwise in ``kappa``.
  """
  half_p = 0.5*p
  order = half_p - 1
  bessel_term = (2 * np.pi) ** half_p * iv(order, kappa)
  return kappa ** order / bessel_term


In [6]:
def vmf(x, mu, kappa, p=100):
    """Return ``C(kappa, p) * exp(kappa * (x @ mu))``.

    Adapted from https://github.com/lmc2179/von_mises_fisher

    Args:
        x: point(s) whose last axis is contracted with ``mu``.
        mu: direction vector.
        kappa: concentration; scales the inner product and feeds the
            normalisation constant.
        p: dimensionality passed to ``C``. Defaults to 100, the value that was
            previously hard-coded, so existing calls are unchanged.
            NOTE(review): the default does not track ``mu.shape[-1]`` the way
            the ``vmf_reg*`` variants do — pass ``p=mu.shape[-1]`` if the
            constant should match the data dimension.

    Returns:
        The normalisation constant times the exponentiated, kappa-scaled
        inner product.
    """
    normalization_const = C(kappa, p)
    likelihood = np.exp(kappa * (x@mu))
    return normalization_const * likelihood


In [7]:
def vmf2(x, mu, kappa, p=100):
    """Return ``C(kappa, p) * exp(x @ mu)``.

    Adapted from https://github.com/lmc2179/von_mises_fisher

    Unlike ``vmf``, ``kappa`` does not multiply the inner product here; it only
    enters through the normalisation constant.

    Args:
        x: point(s) whose last axis is contracted with ``mu``.
        mu: direction vector.
        kappa: concentration passed to ``C``.
        p: dimensionality passed to ``C``. Defaults to 100, the value that was
            previously hard-coded, so existing calls are unchanged.
            NOTE(review): the default does not track ``mu.shape[-1]`` the way
            the ``vmf_reg*`` variants do.

    Returns:
        The normalisation constant times the exponentiated inner product.
    """
    normalization_const = C(kappa, p)
    likelihood = np.exp((x@mu))
    return normalization_const * likelihood


In [8]:
def vmf_reg2(x, mu, kappa):
    """Return ``C(kappa, dim) * exp(0.1 * (x @ mu))`` with ``dim = mu.shape[-1]``.

    Adapted from https://github.com/lmc2179/von_mises_fisher

    The inner product is damped by a fixed factor of 0.1 before exponentiation.
    """
    dim = mu.shape[-1]
    damped_alignment = .1*(x@mu)
    return C(kappa, dim) * np.exp(damped_alignment)


In [9]:
def vmf_reg12(x, mu, kappa, r1, r2):
  """Return ``C(kappa, dim) * exp(r2 * (x @ mu)) * exp(-r1 * kappa)``.

  Adapted from https://github.com/lmc2179/von_mises_fisher

  Args:
    x: point(s) whose last axis is contracted with ``mu``.
    mu: direction vector; ``mu.shape[-1]`` is the dimension given to ``C``.
    kappa: concentration.
    r1: weight of the ``exp(-r1 * kappa)`` factor.
    r2: scale applied to the inner product before exponentiation.
  """
  dim = mu.shape[-1]
  density = C(kappa, dim) * np.exp(r2*(x@mu))
  penalty = np.exp(-1*r1 * kappa)
  return density * penalty


In [10]:
def vmf_simp(x, mu, kappa):
    """Return ``kappa * exp(x @ mu)``.

    Adapted from https://github.com/lmc2179/von_mises_fisher

    ``kappa`` itself stands in for the normalisation constant; ``C`` is not used.
    """
    alignment = x@mu
    return kappa * np.exp(alignment)


In [11]:
def vmf_simp_loss(out, targets):
  """Mean over the batch of ``-log(||out||) - <out, target>``.

  Args:
    out: predictions; the norm is taken over the last axis.
    targets: target vectors, viewed as column vectors so that each row of
      ``out`` is multiplied with its own target.

  Returns:
    The mean of the per-row values, as a tensor.
  """
  dim = targets.shape[-1]
  # Batched matmul of row vectors with column vectors: one dot product per pair.
  dots = (out.unsqueeze(1) @ targets.view(-1, dim, 1)).squeeze()
  log_norms = torch.log(out.norm(dim=-1))
  return (-log_norms - dots).mean()

In [12]:
# Toy fixtures: a 4x3 embedding table, two 3-d outputs and two integer labels.
emb = torch.tensor([
    [1.0, 1.0, 3.0],
    [1.0, 1.0, 1.0],
    [-1.0, 0.0, 1.0],
    [1.0, 0.0, 1.0],
])
b = torch.tensor([[1.0, 1.0, 1.0], [2.0, 1.0, 1.0]])
labels = torch.tensor([1, 3])

In [13]:
def vmf_nearest_neighbor(out, labels, embeddings):
  """Placeholder: prints the shapes of the three arguments and returns None."""
  shapes = (out.shape, labels.shape, embeddings.shape)
  print(*shapes)

In [14]:
vmf_nearest_neighbor(b, labels, emb)

torch.Size([2, 3]) torch.Size([2]) torch.Size([4, 3])


In [15]:
def vmf_reg1(x, mu, kappa):
    """Return ``C(kappa, dim) * exp(x @ mu) * exp(-0.02 * kappa)`` with ``dim = mu.shape[-1]``.

    Adapted from https://github.com/lmc2179/von_mises_fisher

    The last factor shrinks the value as ``kappa`` grows, with a fixed weight of 0.02.
    """
    dim = mu.shape[-1]
    unregularized = C(kappa, dim) * np.exp((x@mu))
    return unregularized * np.exp(-.02 * kappa)


In [16]:
def vmf_normed(out, target):
  """Negated ``vmf`` value, using the L2 norm of ``out`` (last axis) as kappa."""
  kappa = np.linalg.norm(out, axis=-1)
  return -1 * vmf(out, target, kappa)

In [17]:
def vmf_normed_2(out, target):
  """Negated ``vmf2`` value, using the L2 norm of ``out`` (last axis) as kappa."""
  kappa = np.linalg.norm(out, axis=-1)
  return -1 * vmf2(out, target, kappa)

In [18]:
def vmf_normed_simp(out, target):
  """Negated ``vmf_simp`` value, using the L2 norm of ``out`` (last axis) as kappa."""
  kappa = np.linalg.norm(out, axis=-1)
  return -1 * vmf_simp(out, target, kappa)

In [19]:
def vmf_normed_reg1(out, target):
  """Negated ``vmf_reg1`` value, using the L2 norm of ``out`` (last axis) as kappa."""
  kappa = np.linalg.norm(out, axis=-1)
  return -1 * vmf_reg1(out, target, kappa)

In [20]:
def vmf_normed_reg2(out, target):
  """Negated ``vmf_reg2`` value, using the L2 norm of ``out`` (last axis) as kappa."""
  kappa = np.linalg.norm(out, axis=-1)
  return -1 * vmf_reg2(out, target, kappa)

In [21]:
def make_vmf_normed_reg12(r1, r2):
  """Build an ``(out, target)`` function that returns the negated ``vmf_reg12`` value.

  The returned closure fixes ``r1`` and ``r2`` and uses the L2 norm of ``out``
  (last axis) as kappa.
  """
  def vmf_normed_reg12(out, target):
    kappa = np.linalg.norm(out, axis=-1)
    return -1 * vmf_reg12(out, target, kappa, r1, r2)
  return vmf_normed_reg12

In [22]:
# A 2x2 grid of identical 3-d points and a unit direction, to check how `@` contracts
# the last axis.
x = np.array([
    [[3.0, 0.0, 0.0], [3.0, 0.0, 0.0]],
    [[3.0, 0.0, 0.0], [3.0, 0.0, 0.0]],
])
mu = np.array([1.0, 0.0, 0.0])
x @ mu

array([[3., 3.],
       [3., 3.]])

In [23]:
vmf(x, mu, 2)

array([[1.66979802e+40, 1.66979802e+40],
       [1.66979802e+40, 1.66979802e+40]])

In [24]:
# wanna know- is this gonna get too big? yes

def NLLvMF(output, target_embedding, kappa=None):
  """Return ``-log(C(kappa, dim)) - output @ target_embedding``.

  This cell used to define ``NLLvMF`` twice; the three-argument definition
  silently replaced the two-argument one. Both call styles are now served by
  this single definition.

  Args:
    output: predicted vector(s); ``output.size(-1)`` is the dimension given to ``C``.
    target_embedding: target vector contracted with ``output``.
    kappa: concentration tensor passed to ``C``. When omitted, the norm of
      ``output`` over its last axis is used, as in the former two-argument form.

  Returns:
    The negative log of the normalisation constant minus the inner product.
  """
  if kappa is None:
    kappa = output.norm(dim=-1)
  return -torch.log(C(kappa, output.size(-1))) - output@target_embedding
def NIP(output, target_embedding, kappa):
  """Negative inner product of ``output`` and ``target_embedding``.

  ``kappa`` is accepted but not used; the signature parallels the
  three-argument ``NLLvMF``.
  """
  negated = -output
  return negated @ target_embedding

In [25]:
# Two parallel float32 4-vectors: `a` is twice `b`.
a = torch.full((4,), 2.0, dtype=torch.float32)
b = torch.ones(4, dtype=torch.float32)

In [26]:
NLLvMF(a, b, torch.tensor([1.0]))

tensor([-4.8949])

In [27]:
import numpy as np


In [28]:
torch.nn.Embedding(100,2048).weight.data.norm(dim=-1)

tensor([46.2945, 45.4991, 44.1308, 44.1101, 45.7204, 44.1895, 45.0634, 45.0778,
        45.3733, 45.4447, 44.5835, 45.4047, 45.9627, 44.6528, 45.3276, 45.2534,
        44.7590, 45.1097, 44.6798, 44.8931, 45.0445, 45.2317, 44.2297, 45.4608,
        45.4323, 45.6521, 45.2357, 45.1590, 46.0943, 45.7700, 46.1047, 45.3161,
        44.5883, 44.2191, 45.9919, 45.9943, 45.2396, 46.2324, 45.1179, 44.4526,
        44.8375, 45.2442, 44.9563, 46.7675, 44.2113, 46.0685, 45.2843, 44.5964,
        43.7856, 44.8012, 44.8939, 43.9868, 46.0875, 44.9208, 45.3424, 44.6074,
        45.2221, 44.5984, 43.8992, 46.5167, 44.1791, 46.4937, 44.5682, 44.6362,
        43.5007, 44.8528, 44.6755, 46.4536, 43.9964, 45.8595, 46.2167, 44.5324,
        45.0965, 45.1348, 44.5497, 45.6031, 45.2653, 44.4634, 44.9971, 45.1542,
        45.0001, 44.7498, 45.9787, 44.7974, 44.7860, 45.8248, 44.6784, 46.1042,
        44.4407, 44.2012, 44.4961, 45.7103, 45.5151, 46.0486, 44.0336, 45.4745,
        44.0815, 43.7701, 45.7484, 45.59

In [29]:
# Polar sampling grid: 500 angles over [-2*pi, 2*pi] and 500 signed radii over [-5.8, 5.8].
theta_x = np.linspace(start=-2*np.pi, stop=2*np.pi, num=500)
r = np.linspace(start=-5.8, stop=5.8, num=500)

In [30]:
def convert_spherical_to_cartesian_2d(rho, theta):
    """Convert polar coordinates to 2-D Cartesian coordinates.

    Args:
        rho: radius (scalar or array).
        theta: angle in radians (scalar or array).

    Returns:
        Tuple ``(x, y)`` with ``x = rho*cos(theta)`` and ``y = rho*sin(theta)``.
    """
    return rho * np.cos(theta), rho * np.sin(theta)

In [31]:
# One Cartesian point per (angle, radius) pair; the angle is the outer (slow) index.
points = [
    np.array(convert_spherical_to_cartesian_2d(rad, theta))
    for theta in theta_x
    for rad in r
]

In [32]:
points = np.stack(points).reshape(len(theta_x),len(r),2)

In [33]:
# Target given as (radius, angle) and converted to a Cartesian 2-vector.
target_rad, target_theta = 1.6, 0.0
target_point = np.array(
    convert_spherical_to_cartesian_2d(rho=target_rad, theta=target_theta)
)

In [34]:
def MSE(out, target):
  """Sum of squared differences between ``out`` and ``target`` over the last axis.

  Despite the name, no mean is taken. In this notebook ``out`` is an n x n x 2
  grid and ``target`` a 2-vector that broadcasts against it.
  """
  return ((out - target)**2).sum(-1)

In [35]:
def sq_MSE(out, target):
  """Square root of ``MSE(out, target)``, taken elementwise."""
  squared = MSE(out, target)
  return np.sqrt(squared)

In [36]:
(points@target_point).shape

(500, 500)

In [37]:
def inner_product(out, target):
  """Negative inner product of ``out`` and ``target``; also prints both shapes."""
  print(out.shape, target.shape)
  negated = -1*out
  return negated@target

In [38]:
def cosine_similarity(out, target):
  """Cosine of the angle between each vector in ``out`` and ``target``.

  The inner product over the last axis is divided first by the norm of
  ``out`` and then by the norm of ``target``.
  """
  out_norm = np.linalg.norm(out, axis=-1)
  target_norm = np.linalg.norm(target, axis=-1)
  return (out@target) / out_norm / target_norm

In [39]:
def cosine_similarity_acc(outputs, labels, embedding_matrix):
  """Fraction of rows whose most cosine-similar embedding index equals the label.

  Args:
    outputs: predicted vectors, one per row.
    labels: expected embedding index for each row.
    embedding_matrix: candidate embeddings, one per row.

  Returns:
    Scalar tensor holding the mean of the per-row hits.
  """
  row_norms = outputs.norm(dim=-1).unsqueeze(1)
  col_norms = embedding_matrix.norm(dim=-1)
  cosines = (outputs@embedding_matrix.T) / row_norms / col_norms

  hits = labels == cosines.argmax(-1)
  return hits.float().mean()

In [113]:
def log_cos_loss(out, labels, embedding_matrix):
  """Mean negative cosine similarity between each output and its labelled embedding.

  Note that no logarithm is applied, despite the name.

  Args:
    out: predicted vectors, one per row.
    labels: indices into ``embedding_matrix`` selecting each row's target.
    embedding_matrix: embeddings, one per row.
  """
  targets = embedding_matrix[labels]
  dim = targets.shape[-1]
  # Batched matmul of row vectors with column vectors: one dot product per pair.
  dots = (out.unsqueeze(1) @ targets.view(-1, dim, 1)).squeeze()
  scale = out.norm(dim=-1) * targets.norm(dim=-1)
  return (-1 * (dots / scale)).mean()


In [107]:
rp = torch.randperm(len(embmat))

In [108]:
# First four entries of the random permutation, moved to the GPU, used below as row indices.
# NOTE(review): this rebinds `r`, which In [29] defined as the radius grid that plot() reads
# as a global; calling plot() after this cell will see these indices instead of the grid.
r = rp[:4].cuda()

In [109]:
embs = embmat[r].cuda()

In [110]:
# Overwrite the first index in place. `embs` was gathered from the previous indices, so
# embs[0] and r[0] generally no longer refer to the same row when both are passed to
# log_cos_loss below.
r[0]=4000

In [111]:
embs

tensor([[-0.1232, -0.1259, -0.1238,  ..., -0.0844, -0.0649,  0.0061],
        [-0.0005, -0.0546, -0.0762,  ..., -0.0843, -0.0570, -0.0242],
        [-0.0373, -0.0683, -0.0105,  ..., -0.0162, -0.0273, -0.0354],
        [-0.0971, -0.0339,  0.0378,  ..., -0.0814, -0.0091, -0.0677]],
       device='cuda:0')

In [112]:
log_cos_loss(embs, r, embmat.cuda())

tensor(-0.8691, device='cuda:0')

In [69]:
points.shape, target_point.shape

((500, 500, 2), (2,))

In [638]:
mse_points = MSE(points, target_point)

In [639]:
mse_points.shape

(500, 500)

In [640]:
np.gradient(mse_points)[0].shape

(500, 500)

In [647]:
def plot(points, target_point, fn, radii=None, thetas=None,
         filename='/home/tucker/Downloads/sphere.html'):
  """Render ``fn(points, target_point)`` as a surface over the (radius, angle) grid.

  The surface is coloured by the absolute finite-difference gradient along
  axis 1 (the radius axis of the grid), the global minimum is marked with a
  scatter point, the figure is written with ``plotly.offline.plot`` and a
  short summary of the minimum and the gradient range is printed.

  Args:
    points: grid of points indexed as [angle, radius, coordinate].
    target_point: target vector passed through to ``fn``.
    fn: callable ``fn(points, target_point)`` returning one value per grid cell.
    radii: radius axis values. Defaults to the notebook-level ``r``.
    thetas: angle axis values. Defaults to the notebook-level ``theta_x``.
    filename: output HTML path. Defaults to the previously hard-coded location.

  Returns:
    None.
  """
  # Fall back to the notebook-level grids so existing three-argument calls keep working;
  # pass the axes explicitly if those globals have been rebound since the grid was built.
  if radii is None:
    radii = r
  if thetas is None:
    thetas = theta_x

  dist_points = fn(points, target_point)
  grads_points = np.abs(np.gradient(dist_points)[1])
  data = go.Surface(
      x=radii,
      y=thetas,
      z=dist_points,
      surfacecolor=grads_points,
      showscale=True,
  )
  # For plain functions __qualname__ equals the second token of str(fn); unlike that token
  # it stays meaningful for callables with a different repr.
  fn_label = getattr(fn, '__qualname__', None) or str(fn)
  layout = go.Layout(
      title=fn_label + " colored by grad wrt norm",
      autosize=False,
      width=1200,
      height=1000,
      scene={'xaxis_title':'norm',
             'yaxis_title':'theta',
             'aspectmode':'cube',
             'aspectratio': {'x':1,'y':3}}
  )
  # (angle index, radius index) of the global minimum of the surface.
  p = np.unravel_index(dist_points.argmin(), dist_points.shape)
  minp = {'type':'scatter3d',
          'x':[radii[p[1]]],
          'y':[thetas[p[0]]],
          'z':[dist_points[p]],}
  fig = go.Figure(data=[data, minp], layout=layout)
  plotly.offline.plot(fig, filename=filename)


  print('min_index', p)
  print('min point', points[p])
  print('min r the', radii[p[1]], thetas[p[0]])
  print('min dist:', dist_points[p])
  print('min grad:', grads_points.min())
  print('max grad:', grads_points.max())

In [558]:
plot(points, target_point, MSE)

min_index (250, 155)
min point [1.00158336 0.00630583]
min r the 1.0016032064128255 0.006295776860901103
min dist: 4.227049193364066e-05
min grad: 3.177194153880514e-07
max grad: 0.12591158614034548


In [559]:
plot(points, target_point, sq_MSE)

min_index (250, 155)
min point [1.00158336 0.00630583]
min r the 1.0016032064128255 0.006295776860901103
min dist: 0.006501576111501015
min grad: 1.585451427921214e-07
max grad: 0.012591220998056762


In [560]:
plot(points, target_point, inner_product)

(500, 500, 2) (2,)


min_index (249, 499)
min point [ 4.99990091 -0.03147868]
min r the 5.0 -0.006295776860901547
min dist: -4.999900908311601
min grad: 1.588597076860026e-07
max grad: 0.06295579307017342


In [561]:
plot(points, target_point, cosine_similarity)

min_index (0, 0)
min point [8.00000000e-01 9.79717439e-17]
min r the -0.8 -3.141592653589793
min dist: -1.0
min grad: 7.927099413518324e-05
max grad: 0.012591158614034686


In [562]:
# NOTE(review): plot_grad is not defined in any cell visible in this notebook export; on a
# fresh kernel this call raises NameError unless it is defined elsewhere — confirm.
plot_grad(points, target_point, cosine_similarity)

min_index (125, 80)
min point [ 0.00040878 -0.12985908]
min r the 0.12985971943887775 -1.567648438364446
min dist: -0.012591158614034686


In [563]:
plot(points, target_point, vmf_normed)

min_index (249, 499)
min point [ 4.99990091 -0.03147868]
min r the 5.0 -0.006295776860901547
min dist: -2.6823016447673724e+48
min grad: 1.0268145939661264e+24
max grad: 1.017836965078111e+47


In [564]:
# This is the unregularized version from the paper: it excludes the kappa term (the norm of e) from the exponent / inner-product term.
plot(points, target_point, vmf_normed_2)

min_index (249, 499)
min point [ 4.99990091 -0.03147868]
min r the 5.0 -0.006295776860901547
min dist: -5.530827553339808e+39
min grad: 6.694733599657165e+30
max grad: 9.209360494997154e+37


In [565]:
# NOTE(review): plot_grad is not defined in any cell visible in this notebook export; on a
# fresh kernel this call raises NameError unless it is defined elsewhere — confirm.
plot_grad(points, target_point, vmf_normed_2)

min_index (215, 499)
min point [ 4.53559552 -2.10437004]
min r the 5.0 -0.4344086034021961
min dist: -9.209360494997154e+37


In [566]:
plot(points, target_point, vmf_normed_simp)

min_index (249, 499)
min point [ 4.99990091 -0.03147868]
min r the 5.0 -0.006295776860901547
min dist: -741.9922666034148
min grad: 3.1772510013586275e-10
max grad: 12.354885777490182


In [567]:
plot(points, target_point, vmf_normed_reg1)

min_index (249, 499)
min point [ 4.99990091 -0.03147868]
min r the 5.0 -0.006295776860901547
min dist: -0.7845398323424346
min grad: 1.4121869774326207e-08
max grad: 0.013063343720350146


In [568]:
plot(points, target_point, vmf_normed_reg2)

min_index (249, 86)
min point [ 0.19959524 -0.00125662]
min r the 0.19959919839679352 -0.006295776860901547
min dist: -0.16075837604904103
min grad: 2.5278717452614785e-09
max grad: 0.00018561328788017917


In [648]:
plot(points, target_point, make_vmf_normed_reg12(1, 1))

min_index (0, 315)
min point [1.52264529e+00 3.72940536e-16]
min r the 1.5226452905811625 -6.283185307179586
min dist: -0.237725203688139
min grad: 6.816203498781849e-11
max grad: 0.008846192947442016


In [70]:
# Just a sphere: unit-sphere surface sampled on a size x size (theta, phi) grid.
size = 200
theta = np.linspace(0, 2*np.pi, size)
phi = np.linspace(0, np.pi, size)
# Broadcasting a column against a row gives the same values as np.outer.
x = np.cos(theta)[:, np.newaxis] * np.sin(phi)[np.newaxis, :]
y = np.sin(theta)[:, np.newaxis] * np.sin(phi)[np.newaxis, :]
z = np.ones(size)[:, np.newaxis] * np.cos(phi)[np.newaxis, :]  # note this is 2d now


In [86]:
points = np.stack([x,y,z], 2)

In [87]:
target = np.array([1.0, 1,1])

In [88]:
points.shape, target.shape

((200, 200, 3), (3,))

In [91]:
(points@target).shape

(200, 200)

In [108]:
color = cosine_similarity(points, np.array([1.0, 1,1]))

In [105]:
color = np.exp(cosine_similarity(points, np.array([1.0, 1,1])))

In [109]:

# Sphere surface coloured by `color`, with the colour range fixed to [0, 3].
data = go.Surface(x=x, y=y, z=z, surfacecolor=color, cmin=0, cmax=3)
layout = go.Layout(title='sphere', autosize=False, width=1000, height=1000)
fig = go.Figure(data=[data], layout=layout)
plotly.offline.plot(fig, filename='/home/tucker/Downloads/sphere.html')

'/home/tucker/Downloads/sphere.html'

In [272]:
dims = 5

In [273]:
points.shape

(10002, 100)

In [274]:
-1*np.array([[1.0]*dims])

array([[-1., -1., -1., -1., -1.]])

In [277]:
# Random points from [0, 1)^dims, plus the all-(-1) and all-(+1) vectors so that both
# extremes of the cosine against the all-ones direction are present.
# NOTE(review): no seed is set in the visible cells, so the sample differs between runs.
points = np.random.rand(100*100,dims)
points = np.concatenate([points, -1*np.array([[1.0]*dims]), np.array([[1.0]*dims])], 0)
x = cosine_similarity(points, np.array([1.0]*dims)).reshape(-1)
# Rounding can leave a cosine marginally outside [-1, 1] (e.g. for the two appended
# extreme points), which would make arccos return NaN; clip only the arccos input so
# that `x` itself is left untouched.
y = np.arccos(np.clip(x, -1.0, 1.0))
z = np.exp(x)
z2 = np.exp(x**2)
# Sort every series by cosine so the finite differences taken later are between neighbours.
order = np.argsort(x)
z = z[order]
z2 = z2[order]
x = x[order]
y = y[order]


In [278]:
# Every trace uses the angle `y` (arccos of the cosine) as its horizontal axis.
# The "grad" traces are negated ratios of finite differences against that angle.
data = go.Scatter(x=y, y=x, mode='markers', name='cos')
data2 = go.Scatter(x=y, y=z, mode='markers', name='expcos')
data3 = go.Scatter(x=y, y=-1*np.gradient(z)/np.gradient(y), mode='markers', name='expcos grad')
data4 = go.Scatter(x=y, y=-1*np.gradient(x)/np.gradient(y), mode='markers', name='cos grad')
data5 = go.Scatter(x=y, y=z2, mode='markers', name='new')
data6 = go.Scatter(x=y, y=-1*np.gradient(z2)/np.gradient(y), mode='markers', name='new grad')

layout = go.Layout(title='cos', autosize=False, width=1000, height=1000)
fig = go.Figure(data=[data, data2, data3, data4, data5, data6], layout=layout)
plotly.offline.plot(fig, filename='/home/tucker/Downloads/sphere.html')

'/home/tucker/Downloads/sphere.html'

0.0