In [1]:
import time
import math

import pandas as pd
import numpy as np
from scipy.spatial import distance
from numpy.linalg import norm

In [2]:
EXECUTIONS = 100

In [3]:
file_data = np.genfromtxt('../tools/vectors.csv', delimiter=',')
A,B = np.moveaxis(file_data, 1, 0).astype('f')

In [4]:
results_df = pd.DataFrame(columns=['Implementation', 'Time (ms)', 'Result'])

In [5]:
# Plain Python implementation

def cosine(A, B):
    dot = denom_a = denom_b = 0.0

    for i in range(len(A)):
        dot += A[i] * B[i]
        denom_a += A[i] * A[i]
        denom_b += B[i] * B[i]

    return dot / (math.sqrt(denom_a) * math.sqrt(denom_b))

accum = 0

A_list = list(A)
B_list = list(B)

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A_list,B_list)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))


results_df.loc[len(results_df)] = ['Plain Python', accum/EXECUTIONS, cos_sim]

 306.00303173065186 ms


In [6]:
EXECUTIONS = 10000

In [7]:
# Scipy implementation

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = distance.cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

results_df.loc[len(results_df)] = ['SciPy', accum/EXECUTIONS, cos_sim]

 0.552554440498352 ms


In [8]:
# NumPy implementation

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = 1 - np.dot(A, B)/(norm(A)*norm(B))
    accum +=  (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

results_df.loc[len(results_df)] = ['NumPy', accum/EXECUTIONS, cos_sim]

 0.7099000692367554 ms


In [9]:
# NumPy as Scipy implementation

def cosine(u, v):
    
    uv = np.dot(u, v)
    uu = np.dot(u, u)
    vv = np.dot(v, v)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

results_df.loc[len(results_df)] = ['NumPy as Scipy', accum/EXECUTIONS, cos_sim]

 0.6710227727890015 ms


In [13]:
# SciPy/NumPy implementation

A,B = np.moveaxis(file_data, 1, 0).astype('f')

A = np.asarray(A, dtype='float', order='c')
B = np.asarray(B, dtype='float', order='c')

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = 1 - np.dot(A, B)/(norm(A)*norm(B))
    accum +=  (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

def cosine(u, v):
    
    uv = np.dot(u, v)
    uu = np.dot(u, u)
    vv = np.dot(v, v)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

 0.043782711029052734 ms
 0.04303474426269531 ms


In [44]:
accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = 1 - np.dot(A, B)/(norm(A)*norm(B))
    accum +=  (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

def cosine(u, v):
    
    uv = np.dot(u, v)
    uu = np.dot(u, u)
    vv = np.dot(v, v)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

 0.04124276638031006 ms


In [38]:
import math
import numpy as np

def cosine(u, v):
    
    uv = np.dot(u, v)
    uu = np.dot(u, u)
    vv = np.dot(v, v)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

 0.04785375595092774 ms


In [19]:
######

In [8]:
import math

def cosine(u, v, w=None):
    
    vw, uw = v, u
    uv = np.dot(u, vw)
    uu = np.dot(u, uw)
    vv = np.dot(v, vw)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

In [9]:
cosine(A,B)

1.4990165544834726

In [10]:
accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    dot = np.dot(A, B)
    accum +=  (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

 0.2338648796081543 ms


In [27]:
import dis

dis.dis(distance.correlation)

575           0 RESUME                   0

625           2 LOAD_GLOBAL              1 (NULL + _validate_vector)
             14 LOAD_FAST                0 (u)
             16 PRECALL                  1
             20 CALL                     1
             30 STORE_FAST               0 (u)

626          32 LOAD_GLOBAL              1 (NULL + _validate_vector)
             44 LOAD_FAST                1 (v)
             46 PRECALL                  1
             50 CALL                     1
             60 STORE_FAST               1 (v)

627          62 LOAD_FAST                2 (w)
             64 POP_JUMP_FORWARD_IF_NONE    38 (to 142)

628          66 LOAD_GLOBAL              3 (NULL + _validate_weights)
             78 LOAD_FAST                2 (w)
             80 PRECALL                  1
             84 CALL                     1
             94 STORE_FAST               2 (w)

629          96 LOAD_FAST                2 (w)
             98 LOAD_FAST                2 (w)
       

In [29]:
import scipy.stats
import dis

# Disassemble the function
dis.dis(cosine)

  3           0 RESUME                   0

  6           2 LOAD_GLOBAL              0 (np)
             14 LOAD_METHOD              1 (dot)
             36 LOAD_FAST                0 (u)
             38 LOAD_FAST                0 (u)
             40 PRECALL                  2
             44 CALL                     2
             54 STORE_FAST               3 (uu)

  7          56 LOAD_GLOBAL              0 (np)
             68 LOAD_METHOD              1 (dot)
             90 LOAD_FAST                1 (v)
             92 LOAD_FAST                1 (v)
             94 PRECALL                  2
             98 CALL                     2
            108 STORE_FAST               4 (vv)

  9         110 LOAD_CONST               1 (1)
            112 LOAD_GLOBAL              0 (np)
            124 LOAD_METHOD              1 (dot)
            146 LOAD_FAST                0 (u)
            148 LOAD_FAST                1 (v)
            150 PRECALL                  2
            154 CALL   

In [1]:
def correlation(u, v, w=None, centered=True):
    """
    Compute the correlation distance between two 1-D arrays.

    The correlation distance between `u` and `v`, is
    defined as

    .. math::

        1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                  {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

    where :math:`\\bar{u}` is the mean of the elements of `u`
    and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0
    centered : bool, optional
        If True, `u` and `v` will be centered. Default is True.

    Returns
    -------
    correlation : double
        The correlation distance between 1-D array `u` and `v`.

    Examples
    --------
    Find the correlation between two arrays.

    >>> from scipy.spatial.distance import correlation
    >>> correlation([1, 0, 1], [1, 1, 0])
    1.5

    Using a weighting array, the correlation can be calculated as:

    >>> correlation([1, 0, 1], [1, 1, 0], w=[0.9, 0.1, 0.1])
    1.1

    If centering is not needed, the correlation can be calculated as:

    >>> correlation([1, 0, 1], [1, 1, 0], centered=False)
    0.5
    """
    #u = _validate_vector(u)
    #v = _validate_vector(v)
    if w is not None:
        #w = _validate_weights(w)
        w = w / w.sum()
    if centered:
        if w is not None:
            umu = np.dot(u, w)
            vmu = np.dot(v, w)
        else:
            umu = np.mean(u)
            vmu = np.mean(v)
        u = u - umu
        v = v - vmu
    if w is not None:
        vw = v * w
        uw = u * w
    else:
        vw, uw = v, u
    uv = np.dot(u, vw)
    uu = np.dot(u, uw)
    vv = np.dot(v, vw)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

In [9]:
accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = correlation(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

 0.8782026052474976 ms


In [2]:
import scipy.stats
import dis

# Disassemble the function
dis.dis(correlation)

  1           0 RESUME                   0

 53           2 LOAD_FAST                2 (w)
              4 POP_JUMP_FORWARD_IF_NONE    23 (to 52)

 55           6 LOAD_FAST                2 (w)
              8 LOAD_FAST                2 (w)
             10 LOAD_METHOD              0 (sum)
             32 PRECALL                  0
             36 CALL                     0
             46 BINARY_OP               11 (/)
             50 STORE_FAST               2 (w)

 56     >>   52 LOAD_FAST                3 (centered)
             54 POP_JUMP_FORWARD_IF_FALSE   119 (to 294)

 57          56 LOAD_FAST                2 (w)
             58 POP_JUMP_FORWARD_IF_NONE    55 (to 170)

 58          60 LOAD_GLOBAL              2 (np)
             72 LOAD_METHOD              2 (dot)
             94 LOAD_FAST                0 (u)
             96 LOAD_FAST                2 (w)
             98 PRECALL                  2
            102 CALL                     2
            112 STORE_FAST          

In [23]:
import time
import math

import numpy as np
from scipy.spatial import distance

SIZE = 6400000
EXECUTIONS = 10000

path = "" # From https://github.com/joseprupi/cosine-similarity-comparison/blob/master/tools/vectors.csv
file_data = np.genfromtxt(path, delimiter=',')
A,B = np.moveaxis(file_data, 1, 0).astype('f')

def cosine(u, v, w=None):
    
    uv = np.dot(u, v)
    uu = np.dot(u, u)
    vv = np.dot(v, v)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))
cos_sim_manual = cos_sim

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = distance.cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))
cos_sim_scipy = cos_sim

print(np.isclose(cos_sim_scipy, cos_sim_manual))



 0.5370989799499511 ms
 0.686272120475769 ms
True


In [14]:
t = np.empty((8, 4), 'uint32')[:, :2]
t = np.empty((8, 4), 'uint32')
print(t.flags)

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False



In [10]:
t.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [29]:
def generate_random_vector(size):
    """
    Generate 2 random vectors with the provided size
    and save them in a text file
    """
    A = np.random.normal(loc=1.5, size=(size,))
    B = np.random.normal(loc=-1.5, scale=2.0, size=(size,))
    vectors = np.stack([A, B], axis=1)
    np.savetxt('vectors.csv', vectors, fmt='%f,%f')

In [30]:
generate_random_vector(64000)

In [32]:
file_data = np.genfromtxt('../tools/vectors.csv', delimiter=',')
A,B = np.moveaxis(file_data, 1, 0).astype('f')

In [40]:
A = np.random.normal(loc=1.5, size=(10,2))

In [41]:
A.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [42]:
At = np.moveaxis(A, 1, 0).astype('f')

In [43]:
A.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [51]:
x = np.zeros((3, 4, 5))
T = np.moveaxis(x, 0, -1)
print(T.flags)

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False



  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [25]:
len(file_data)

64000