In [None]:
import numpy as np

Calculating pairwise distances between some things where you have to write the distance metric yourself

* Defining custom distance metrics for umap:
    * https://umap-learn.readthedocs.io/en/latest/parameters.html?highlight=numba#metric
* Calculating interactions of some particles

If you want to compute the pairwise distances of an array, you could use the scipy function `pdist`. But let's take a quick look at how that works:

In [None]:
from scipy.spatial.distance import pdist

In [None]:
??pdist

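If you define your own metric as a Python function, `pdist` has to call it once for every pair of rows, which is slow for large inputs.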

In [None]:
import matplotlib.pyplot as plt

In [None]:
# 1000 rows of 4 uniform random values in [0, 1).
data = np.random.rand(1000, 4)

In [None]:
# Plot column 1 against column 2. Passing the whole 4-column array as `c`
# makes matplotlib read each row as an RGBA colour for that point.
plt.scatter(data[:, 1], data[:, 2], c=data)

In [None]:
# NOTE(review): the result is only displayed, never assigned - this looks like
# a leftover scratch cell; confirm whether it is still needed.
np.random.rand(100, 3)

In [None]:
import numba
import numpy as np

In [None]:
@numba.njit
def pairwise(x, metric):
    """Evaluate ``metric`` on every pair of rows of ``x``.

    Parameters
    ----------
    x : 2-D array
        One item per row.
    metric : callable
        Called as ``metric(row_i, row_j)``; its result is stored in the
        output matrix.

    Returns
    -------
    2-D array of shape ``(len(x), len(x))``
        Entry ``[i, j]`` holds the metric value for ``j > i``. The diagonal
        and the lower triangle are left at zero.
    """
    n_rows = x.shape[0]
    result = np.zeros((n_rows, n_rows))
    for row in range(n_rows):
        first = x[row, :]
        # Start one past the diagonal: each unordered pair is visited once.
        for col in range(row + 1, n_rows):
            result[row, col] = metric(first, x[col, :])
    return result

In [None]:
# NOTE(review): `o` is not referenced anywhere else in the visible notebook -
# possibly a leftover; confirm before removing.
o = np.ones((10, 10))

In [None]:
@numba.njit
def euclidean_dist(a, b):
    """Return the Euclidean (L2) distance between arrays ``a`` and ``b``.

    Computed as the square root of the sum of squared element-wise
    differences.
    """
    delta = a - b
    squared_total = np.sum(delta ** 2)
    return squared_total ** 0.5

In [None]:
# Small 5 x 2 random array used to try out `pairwise` below.
a = np.random.rand(5, 2)

In [None]:
# `pairwise` only fills entries with j > i, so the displayed matrix is upper
# triangular with zeros on and below the diagonal.
ds = pairwise(a, euclidean_dist)
ds

# Challenge

In [None]:
# `fetch_20newsgroups` (raw text) is the loader actually called below; the
# original cell only imported the pre-vectorized variant, which left the
# name undefined on a fresh kernel.
from sklearn.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load only two of the newsgroup categories to keep the example small.
data = fetch_20newsgroups(categories=["alt.atheism", "soc.religion.christian"])

In [None]:
# Token counts are stored as 16-bit integers.
vectoriser = CountVectorizer(dtype=np.int16)

In [None]:
# Vectorise the documents and make sure the result is in CSR form; its
# `indices` / `indptr` arrays are used further down.
sparsedata = vectoriser.fit_transform(data.data).tocsr()

# COO format

A sparse matrix in CSR format is stored as three arrays:

```
indices
indptr
data
```

COO format

# Jaccard similarity



$$ Jaccard = \frac{|s_i \cap s_j|}{|s_i \cup s_j|} $$

In [None]:
@numba.njit
def pairwise_sections(sections, metric):
    """Build a sparse table of pairwise metric values between sections.

    Parameters
    ----------
    sections : sequence
        Items to compare; every pair ``(i, j)`` with ``i < j`` is evaluated.
    metric : callable
        Called as ``metric(sections[i], sections[j])``. It is invoked from
        compiled code, so presumably it must itself be numba-compiled.

    Returns
    -------
    dict
        Maps ``(i, j)`` to the metric value, only for pairs whose value is
        strictly greater than zero.
    """
    n_sections = len(sections)
    d = dict()
    for i in range(n_sections):
        # Look the outer section up once instead of on every inner iteration.
        sect1 = sections[i]
        for j in range(i + 1, n_sections):
            v = metric(sect1, sections[j])
            # Zero (or negative) values are dropped to keep the result sparse.
            if v > 0:
                d[(i, j)] = v
    return d

In [139]:
# Split the flat CSR `indices` array into one array of column indices per row.
# `indptr[1:-1]` holds the start offset of every row except the first. The
# final `indptr` entry equals len(indices), so including it would make
# np.split append a spurious empty section after the last row.
sections = np.split(sparsedata.indices, sparsedata.indptr[1:-1])

In [None]:
@numba.njit
def jaccard_similarity(a, b):
    """Jaccard similarity between the sets of distinct values in ``a`` and ``b``.

    The result is ``len(A & B) / len(A | B)``, where ``A`` and ``B`` are the
    sets of values found in ``a`` and ``b``.

    Parameters
    ----------
    a, b : 1-D arrays
        Collections of hashable values (for example column indices of a
        sparse row). Repeated values are counted once.

    Returns
    -------
    float
        Similarity in ``[0, 1]``. Returns ``0.0`` when both inputs are empty,
        instead of dividing by zero.
    """
    set_a = set(a)
    set_b = set(b)
    intersect = len(set_a.intersection(set_b))
    # Use the set sizes (not len(a) / len(b)) so duplicates cannot inflate
    # the union.
    union = len(set_a) + len(set_b) - intersect
    if union == 0:
        # Both inputs are empty: treat them as having no similarity.
        return 0.0
    return intersect / union

In [None]:
%time dists = pairwise_sections(sections, jaccard_similarity);