In [2]:
import time
import math

import pandas as pd
import numpy as np
from scipy.spatial import distance
from numpy.linalg import norm

In [10]:
EXECUTIONS = 100

In [11]:
file_data = np.genfromtxt('../tools/vectors.csv', delimiter=',')
A,B = np.moveaxis(file_data, 1, 0).astype('f')

In [12]:
results_df = pd.DataFrame(columns=['Implementation', 'Time (ms)', 'Result'])

In [13]:
# Plain Python implementation

def cosine(A, B):
    dot = denom_a = denom_b = 0.0
    
    for i in range(len(A)):
        dot += A[i] * B[i]
        denom_a += A[i] * A[i]
        denom_b += B[i] * B[i]

    return 1 - (dot / (math.sqrt(denom_a) * math.sqrt(denom_b)))

accum = 0

A_list = list(A)
B_list = list(B)

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A_list,B_list)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))


results_df.loc[len(results_df)] = ['Plain Python', accum/EXECUTIONS, cos_sim]

 299.15218591690063 ms


In [14]:
# Plain Python implementation. More Pythonic

def cosine(A, B):
    dot = denom_a = denom_b = 0.0

    dot = sum([a*b for a,b in zip(A,B)])
    denom_a = sum([x*x for x in A])
    denom_b = sum([x*x for x in B])

    return 1 - (dot / (math.sqrt(denom_a) * math.sqrt(denom_b)))

accum = 0

A_list = list(A)
B_list = list(B)

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A_list,B_list)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))


results_df.loc[len(results_df)] = ['Plain Python More Pythonic', accum/EXECUTIONS, cos_sim]

 271.6829204559326 ms


In [15]:
EXECUTIONS = 10000

In [7]:
# Scipy implementation

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = distance.cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

results_df.loc[len(results_df)] = ['SciPy', accum/EXECUTIONS, cos_sim]

 0.5493692636489869 ms


In [8]:
# NumPy implementation

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = 1 - np.dot(A, B)/(norm(A)*norm(B))
    accum +=  (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

results_df.loc[len(results_df)] = ['NumPy', accum/EXECUTIONS, cos_sim]

 0.6952577352523803 ms


In [9]:
print(results_df.sort_values(results_df.columns[1]).to_markdown())

|    | Implementation   |   Time (ms) |   Result |
|---:|:-----------------|------------:|---------:|
|  1 | SciPy            |    0.549369 |  1.49902 |
|  2 | NumPy            |    0.695258 |  1.49902 |
|  0 | Plain Python     |  323.389    |  1.49902 |


In [10]:
# NumPy as Scipy implementation

def cosine(u, v):
    
    uv = np.dot(u, v)
    uu = np.dot(u, u)
    vv = np.dot(v, v)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

results_df.loc[len(results_df)] = ['NumPy as Scipy', accum/EXECUTIONS, cos_sim]

 0.6714282274246216 ms


In [11]:
print(results_df.sort_values(results_df.columns[1]).to_markdown())

|    | Implementation   |   Time (ms) |   Result |
|---:|:-----------------|------------:|---------:|
|  1 | SciPy            |    0.549369 |  1.49902 |
|  3 | NumPy as Scipy   |    0.671428 |  1.49902 |
|  2 | NumPy            |    0.695258 |  1.49902 |
|  0 | Plain Python     |  323.389    |  1.49902 |


In [36]:
# SciPy/NumPy implementation

A,B = np.moveaxis(file_data, 1, 0).astype('f')

A = np.asarray(A, dtype='float', order='c')
B = np.asarray(B, dtype='float', order='c')

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = 1 - np.dot(A, B)/(norm(A)*norm(B))
    accum +=  (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

def cosine(u, v):
    
    uv = np.dot(u, v)
    uu = np.dot(u, u)
    vv = np.dot(v, v)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

results_df.loc[len(results_df)] = ['NumPy with contiguous mem', accum/EXECUTIONS, cos_sim]

 0.04193553924560547 ms
 0.04263992309570312 ms


In [13]:
print(results_df.sort_values(results_df.columns[1]).to_markdown())

|    | Implementation            |   Time (ms) |   Result |
|---:|:--------------------------|------------:|---------:|
|  4 | NumPy with contiguous mem |   0.0437653 |  1.49902 |
|  1 | SciPy                     |   0.549369  |  1.49902 |
|  3 | NumPy as Scipy            |   0.671428  |  1.49902 |
|  2 | NumPy                     |   0.695258  |  1.49902 |
|  0 | Plain Python              | 323.389     |  1.49902 |


In [34]:
from scipy.linalg.blas import ddot

# NumPy as Scipy implementation

def cosine(u, v):
    
    uv = ddot(u,v)
    uu = ddot(u,u)
    vv = ddot(v,v)
    dist = 1.0 - uv / math.sqrt(uu * vv)
    # Clip the result to avoid rounding error
    return np.clip(dist, 0.0, 2.0)

accum = 0

for _ in range(EXECUTIONS):
    start_time = time.time()
    cos_sim = cosine(A,B)
    accum += (time.time() - start_time) * 1000

print(" %s ms" % (accum/EXECUTIONS))

results_df.loc[len(results_df)] = ['NumPy as Scipy', accum/EXECUTIONS, cos_sim]

 0.040762686729431154 ms


In [14]:
np.show_config()

Build Dependencies:
  blas:
    detection method: pkgconfig
    found: true
    include directory: /usr/local/include
    lib directory: /usr/local/lib
    name: openblas64
    openblas configuration: USE_64BITINT=1 DYNAMIC_ARCH=1 DYNAMIC_OLDER= NO_CBLAS=
      NO_LAPACK= NO_LAPACKE= NO_AFFINITY=1 USE_OPENMP= HASWELL MAX_THREADS=2
    pc file directory: /usr/local/lib/pkgconfig
    version: 0.3.23.dev
  lapack:
    detection method: internal
    found: true
    include directory: unknown
    lib directory: unknown
    name: dep140213194937296
    openblas configuration: unknown
    pc file directory: unknown
    version: 1.26.4
Compilers:
  c:
    args: -fno-strict-aliasing
    commands: cc
    linker: ld.bfd
    linker args: -Wl,--strip-debug, -fno-strict-aliasing
    name: gcc
    version: 10.2.1
  c++:
    commands: c++
    linker: ld.bfd
    linker args: -Wl,--strip-debug
    name: gcc
    version: 10.2.1
  cython:
    commands: cython
    linker: cython
    name: cython
    versio

In [15]:
import numpy as np
from scipy.linalg.blas import ddot
from timeit import default_timer as timer
v = np.ones(100000)
start = timer()
for k in range(1000000):
    s = ddot(v,v)
exec_time=(timer() - start)
print
print("Execution took", str(round(exec_time, 3)), "seconds")

Execution took 5.066 seconds
