### Vector search

Background reading: [MongoDB Atlas Vector Search overview](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/#std-term-vector)

In [None]:
import os

# Read the MongoDB connection string from the MONGO_DSN environment variable so
# real credentials never have to be saved in the notebook. The original
# placeholder is kept as the fallback and must be replaced before connecting.
mongo_dsn = os.environ.get("MONGO_DSN", "putYourAddressHere")

In [None]:
pip install matplotlib seaborn scikit-learn

In [None]:
# Third-party libraries used by the analysis.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import offsetbox
from sklearn import manifold, datasets, decomposition
# Import pandas the documented way; `from pandas import pandas` depends on an
# incidental `pandas.pandas` attribute rather than on a public API.
import pandas as pd

## Akinator example

In [None]:
from collections import namedtuple

# Trait vector describing a character (the vectors below use 1 = yes, 0 = no).
# The typename matches the variable name so repr() and pickling resolve it.
Features = namedtuple(
    "Features",
    ["hair", "fat", "slim", "short", "tall", "like_honey", "live_in_forest"],
)

# Known characters, written with keyword arguments so every trait is readable.
kubus = Features(hair=0, fat=1, slim=0, short=1, tall=0, like_honey=1, live_in_forest=1)
prosiaczek = Features(hair=0, fat=0, slim=1, short=1, tall=0, like_honey=0, live_in_forest=1)
osiol = Features(hair=1, fat=1, slim=0, short=0, tall=0, like_honey=0, live_in_forest=1)

# The player's answers, encoded on the same 0/1 scale as the characters.
answers = Features(hair=1, fat=1, slim=0, short=0, tall=1, like_honey=1, live_in_forest=1)

# A possible graded answer scale (the vectors above only use 0 and 1):
# [no, probably_no, idk, probably_yes, yes] = [-1, -0.5, 0, 0.5, 1]


In [None]:
from scipy.spatial.distance import cosine

## No distance

In [None]:
# Cosine distance of a vector to itself: identical vectors, so no distance.
cosine(kubus, kubus)

## Some distance

In [None]:
# Cosine distance between two different characters (kubus vs prosiaczek).
cosine(kubus, prosiaczek)

In [None]:
# Cosine distance between the given answers and kubus.
cosine(answers, kubus)

In [None]:
# Cosine distance between the given answers and prosiaczek.
cosine(answers, prosiaczek)

In [None]:
# Candidate characters as (name, feature vector) pairs.
characters = [
    ('kubus', kubus),
    ('prosiaczek', prosiaczek),
    ('osiol', osiol),
]

def guess_character(answer, candidates=None):
    """Return a message naming the character closest to ``answer``.

    Closeness is the cosine distance (``scipy.spatial.distance.cosine``)
    between ``answer`` and each candidate's feature vector. The candidate with
    the smallest distance wins.

    Args:
        answer: Feature vector describing the character being guessed.
        candidates: Optional iterable of ``(name, features)`` pairs. When
            omitted, the module-level ``characters`` list is used.

    Returns:
        The string ``"Your character is...: <name>"``.

    Raises:
        ValueError: If there are no candidates to compare against.
    """
    pool = characters if candidates is None else list(candidates)
    if not pool:
        raise ValueError("no characters to compare the answer against")
    # min() finds the nearest candidate in a single pass; a full sort is not needed.
    best_name, _ = min(pool, key=lambda pair: cosine(answer, pair[1]))
    return "Your character is...: {}".format(best_name)

In [None]:
# Guess which known character the `answers` vector is closest to.
guess_character(answers)

## Other vectors

In [None]:
# Load the digits dataset through sklearn.datasets, requesting 10 classes.
digits = datasets.load_digits(n_class=10)

In [None]:
# Feature matrix and labels taken from the loaded dataset.
X, y = digits.data, digits.target
# X.shape is unpacked as (number of samples, number of features).
n_samples, n_features = X.shape
n_neighbors = 30  # NOTE(review): not referenced by the cells visible here

In [None]:
# One row per sample: 'vec' holds the sample's row of X, 'digit' its label from y.
df = pd.DataFrame(zip(X, y), columns = ['vec', 'digit'])

In [None]:
# Display the frame to inspect the 'vec' and 'digit' columns.
df

In [None]:
# 'vec' entries of the first three rows labelled 0
# (the unpacking requires that at least three such rows exist).
_0_1 ,_0_2, _0_3 = df[df['digit'] == 0][:3]['vec']

In [None]:
from scipy.spatial.distance import cosine

In [None]:
# Cosine distance between two samples that are both labelled 0.
cosine(_0_1, _0_2)

In [None]:
# Same comparison with the third sample labelled 0.
cosine(_0_1, _0_3)

In [None]:
# 'vec' entries of the first three rows labelled 1
# (the unpacking requires that at least three such rows exist).
_1_1,_1_2,_1_3 = df[df['digit'] == 1][:3]['vec']

In [None]:
# Cosine distance between two samples that are both labelled 1.
cosine(_1_1, _1_2)

In [None]:
# Same comparison with the third sample labelled 1.
cosine(_1_1, _1_3)

In [None]:
# Cosine distance across classes: a sample labelled 0 vs a sample labelled 1.
cosine(_0_1, _1_2)

In [None]:
# Sanity check for these samples: the 0-vs-1 distance must exceed the 0-vs-0 distance.
assert cosine(_0_1, _1_1) > cosine(_0_1, _0_2)

### Drawing

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# 64 hard-coded pixel intensities, reshaped into an 8x8 grid so they can be drawn as an image.
# NOTE(review): presumably copied from one sample of the digits dataset — confirm which one.
data = np.array([
    0.,  0.,  1.,  9., 15., 11.,  0.,  0.,  0.,  0., 11., 16.,  8.,
    14.,  6.,  0.,  0.,  2., 16., 10.,  0.,  9.,  9.,  0.,  0.,  1.,
    16.,  4.,  0.,  8.,  8.,  0.,  0.,  4., 16.,  4.,  0.,  8.,  8.,
     0.,  0.,  1., 16.,  5.,  1., 11.,  3.,  0.,  0.,  0., 12., 12.,
    10., 10.,  0.,  0.,  0.,  0.,  1., 10., 13.,  3.,  0.,  0.
]).reshape(8, 8)

def draw_digit(data):
    """Render pixel data as an image with matplotlib.

    Args:
        data: Pixel intensities. Either an image array that ``plt.imshow``
            accepts directly (e.g. an 8x8 grid), or a flat vector whose length
            is a perfect square (e.g. 64 values), which is first reshaped into
            a square grid.

    Raises:
        ValueError: If ``data`` is 1-D and its length is not a perfect square.
    """
    grid = np.asarray(data)
    if grid.ndim == 1:
        # imshow rejects 1-D input, so fold a flat vector back into a square
        # image before drawing it.
        side = int(round(grid.size ** 0.5))
        if side * side != grid.size:
            raise ValueError(
                "cannot reshape {} values into a square image".format(grid.size)
            )
        grid = grid.reshape(side, side)
    plt.figure(figsize=(5, 5))
    plt.imshow(grid, cmap='binary')
    plt.show()

# Draw the hard-coded 8x8 sample held in `data`.
draw_digit(data)

In [None]:
# Embed every row of X into 3 components with t-SNE (PCA initialisation,
# random_state fixed to 0).
X_tsne = manifold.TSNE(
    n_components=3,
    init='pca',
    random_state=0,
).fit_transform(X)

In [None]:
# One row per sample: 'vec' holds the sample's row of X_tsne, 'digit' its label from y.
tsneDF = pd.DataFrame(zip(X_tsne, y), columns=['vec', 'digit'])

In [None]:
# Display the t-SNE frame to inspect the 'vec' and 'digit' columns.
tsneDF

In [None]:
# t-SNE vectors of the first three rows labelled 0
# (the unpacking requires that at least three such rows exist).
a0_1 ,a0_2, a0_3 = tsneDF[tsneDF['digit'] == 0][:3]['vec']

In [None]:
# t-SNE vectors of the first three rows labelled 1
# (the unpacking requires that at least three such rows exist).
b1_1,b1_2,b1_3 = tsneDF[tsneDF['digit'] == 1][:3]['vec']

In [None]:
# Cosine distance between two t-SNE vectors that are both labelled 0.
cosine(a0_1, a0_2)

In [None]:
# Cosine distance between two t-SNE vectors that are both labelled 1.
cosine(b1_1, b1_2)

In [None]:
# Cosine distance across classes: a t-SNE vector labelled 0 vs one labelled 1.
cosine(a0_1, b1_1)

In [None]:
# Sanity check on the embedded vectors: this 0-vs-1 pair must be farther apart
# than the 1-vs-1 pair.
assert cosine(a0_2, b1_2) > cosine(b1_1, b1_2)

In [None]:
pip install pymongo

In [None]:
import pymongo
# Connect to your Atlas cluster using the connection string held in mongo_dsn
mongo_client = pymongo.MongoClient(mongo_dsn)
# Work with the "digits" collection of the "ds" database
db = mongo_client["ds"]
collection = db["digits"]

In [None]:
# Send a 'ping' command to the admin database to check that the connection works.
mongo_client.admin.command('ping')

In [None]:
# Peek at the first ten rows of the t-SNE frame.
tsneDF[:10]

In [None]:
# Add a 'float_vec' column holding each vector as a plain list of Python floats.
# NOTE(review): presumably needed so pymongo can serialise the vectors — confirm.
tsneDF['float_vec'] = tsneDF['vec'].apply(lambda vec: [float(value) for value in vec])

In [None]:
# Display the frame again, now including the 'float_vec' column.
tsneDF

In [None]:
try:
    from itertools import batched
except ImportError:
    # itertools.batched only exists on Python 3.12+; provide an equivalent so
    # the cell also runs on older kernels.
    from itertools import islice

    def batched(iterable, n):
        """Yield successive tuples of up to ``n`` items from ``iterable``."""
        if n < 1:
            raise ValueError("n must be at least one")
        iterator = iter(iterable)
        chunk = tuple(islice(iterator, n))
        while chunk:
            yield chunk
            chunk = tuple(islice(iterator, n))

# Number of documents sent to MongoDB per insert_many call.
INSERT_BATCH_SIZE = 20

# Store every row as a {"digit", "embedding"} document, one batch at a time.
# NOTE: re-running this cell inserts the same documents again.
for batch in batched(tsneDF.iterrows(), INSERT_BATCH_SIZE):
    to_be_inserted = [
        # int() guarantees a plain Python integer whatever scalar type the row yields.
        {"digit": int(row['digit']), "embedding": row['float_vec']}
        for _, row in batch
    ]
    collection.insert_many(to_be_inserted)

In [None]:
from pymongo.operations import SearchIndexModel
# Create your index model, then create the search index.
# The model describes a "vectorSearch" index named "vector_index" on this collection.
search_index_model = SearchIndexModel(
  definition = {
    "fields": [
      {
        "type": "vector",
        # Document field that holds the vector (the insert cell writes it as "embedding").
        "path": "embedding",
        # Similarity function used to compare vectors.
        "similarity": "cosine",
        # Alternative metric, left disabled:
        #"similarity": "euclidean",
        # Length of the stored vectors; the t-SNE embedding was computed with n_components = 3.
        "numDimensions": 3
      }
    ]
  },
  name="vector_index",
  type="vectorSearch",
)
# NOTE(review): the index is presumably built asynchronously on the server, so
# searches may return nothing until it is ready — confirm.
collection.create_search_index(model=search_index_model)


In [None]:
# Open a cursor over the collection using an empty filter (matches every document).
rows = collection.find({})

In [None]:
from itertools import islice

# Show up to the first ten documents from the cursor. islice simply stops when
# fewer than ten are available, whereas calling next() inside a generator
# expression turns the exhausted cursor's StopIteration into a RuntimeError.
for doc in islice(rows, 10):
    display(doc)

In [None]:
# Aggregation pipeline: a $vectorSearch stage against the "vector_index" index,
# followed by a $project stage that keeps the digit, the embedding and the
# search score while dropping _id.
pipeline = [
    {
        "$vectorSearch": {
            "index": "vector_index",
            "path": "embedding",
            "queryVector": [0.48, -17, 2.60],
            "numCandidates": 200,
            "limit": 200,
        },
    },
    {
        "$project": {
            "_id": 0,
            "digit": 1,
            "score": {"$meta": "vectorSearchScore"},
            "embedding": 1,
        },
    },
]

# Execute the pipeline and print every returned document.
result = collection.aggregate(pipeline)
for i in result:
    print(i)