# Mixedbread Embedding Creation Test

State-of-the-art sentence embeddings from mixedbread.ai. 

https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1.

In [2]:
# Packages
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
# NLTK for sentence tokenization
import nltk
nltk.download('punkt')
import pickle
# timing (admittedly on CPU)
import time

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ijyli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Pre-provided Code

In [3]:
# 1. load model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

In [4]:
# For retrieval you need to pass this prompt.
query = 'Represent this sentence for searching relevant passages: A man is eating a piece of bread'

docs = [
    query,
    "A man is eating food.",
    "A man is eating pasta.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
]

# 2. Encode
embeddings = model.encode(docs)

similarities = cos_sim(embeddings[0], embeddings[1:])
print('similarities:', similarities)

similarities: tensor([[0.7920, 0.6369, 0.1651, 0.3621]])


In [5]:
print('embedding 1 dimension')
print(embeddings[0].shape)

embedding 1 dimension
(1024,)


## A Terms of Service Document

In [6]:
# Load in '../Text Data/Terms of Service Corpus/text/AppleServices_PrivacyPolicy.txt'
with open('../Text Data/Terms of Service Corpus/text/AppleServices_PrivacyPolicy.txt', 'r', encoding='utf-8') as f:
    apple_privacy_policy_text = f.read()

# Convert unicode to ascii
apple_privacy_policy_text = apple_privacy_policy_text.encode('ascii', 'ignore').decode('ascii')

print(apple_privacy_policy_text)


         Privacy Policy The Apple Privacy Policy was updated on December 31, 2019.
Your privacy is important to Apple so weve developed a Privacy Policy that covers how we collect, use, disclose, transfer, and store your personal information.
In addition to this Privacy Policy, we provide data and privacy information imbedded in our products connected with our Data &amp.
Privacy Icon for certain features that ask to use your personal information.



You can review this information before enabling these features, in Settings related to those features and/or online at apple.com/legal/privacy.
Please take a moment to familiarize yourself with our privacy practices and contact us if you have any questions.

Your California Privacy Disclosures
Information Regarding Commercial Electronic Messages in Canada
Apple Health Research Apps Privacy Policy




 Collection and Use of PersonalInformation
Personal information is data that can be used to identify or contact a single person.
You may be a

In [7]:
# Sentence tokenize text
apple_sentences = nltk.sent_tokenize(apple_privacy_policy_text)
# Append "Apple" to the beginning of each sentence
apple_sentences = ['Apple ' + sentence for sentence in apple_sentences]
# # Split on \n as well
# split_on_newline = []
# for sentence in apple_sentences:
#     split_on_newline.extend(sentence.split('\n'))
# split_on_newline
apple_sentences

['Apple \n         Privacy Policy The Apple Privacy Policy was updated on December 31, 2019.',
 'Apple Your privacy is important to Apple so weve developed a Privacy Policy that covers how we collect, use, disclose, transfer, and store your personal information.',
 'Apple In addition to this Privacy Policy, we provide data and privacy information imbedded in our products connected with our Data &amp.',
 'Apple Privacy Icon for certain features that ask to use your personal information.',
 'Apple You can review this information before enabling these features, in Settings related to those features and/or online at apple.com/legal/privacy.',
 'Apple Please take a moment to familiarize yourself with our privacy practices and contact us if you have any questions.',
 'Apple Your California Privacy Disclosures\nInformation Regarding Commercial Electronic Messages in Canada\nApple Health Research Apps Privacy Policy\n\n\n\n\n Collection and Use of PersonalInformation\nPersonal information is d

In [8]:
# Encode
# Start timer
start = time.time()
apple_embeddings = model.encode(apple_sentences)
end = time.time()
print('Time to encode:', end - start)

In [9]:
# Basic search - embeddings most similar to query "Will Apple disclose non-personal information?"
query = "Represent this sentence for searching relevant passages: Will Apple disclose non-personal information?"
query_embedding = model.encode(query)

similarities = cos_sim(query_embedding, apple_embeddings)
print('similarities:', similarities)

# Top 5 most similar sentences
sims = similarities.tolist()[0]
print(sims)
top5_idx = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:5]
print(top5_idx)
for idx in top5_idx:
    print(apple_sentences[idx])
    print(sims[idx])

similarities: tensor([[0.7324, 0.7428, 0.7376, 0.7222, 0.7295, 0.7321, 0.7454, 0.7974, 0.7400,
         0.6526, 0.7576, 0.7435, 0.7592, 0.7477, 0.6067, 0.7475, 0.7432, 0.7240,
         0.6045, 0.6961, 0.7295, 0.6652, 0.6439, 0.7034, 0.7194, 0.6346, 0.7325,
         0.7326, 0.7426, 0.6824, 0.6696, 0.7396, 0.7438, 0.6765, 0.7406, 0.7287,
         0.7502, 0.6388, 0.5685, 0.6277, 0.7595, 0.7931, 0.7427, 0.7478, 0.6326,
         0.7863, 0.6935, 0.6346, 0.6703, 0.7088, 0.7808, 0.6307, 0.5525, 0.7593,
         0.6938, 0.7656, 0.5954, 0.6240, 0.6194, 0.5875, 0.6338, 0.7148, 0.6303,
         0.6403, 0.5360, 0.6675, 0.6513, 0.5810, 0.6627, 0.5474, 0.5689, 0.6007,
         0.5599, 0.6332, 0.5847, 0.7381, 0.5755, 0.5256, 0.5756, 0.6273, 0.5003,
         0.6952, 0.7698, 0.6193, 0.7796, 0.6298, 0.7057, 0.7125, 0.6973, 0.7791,
         0.7884, 0.7729, 0.7554, 0.6938, 0.7238, 0.6727, 0.6723, 0.6353, 0.7540,
         0.7726, 0.7514, 0.6527, 0.7343, 0.7323, 0.6680, 0.6581, 0.7186, 0.5626,
         0.649

In [10]:
# Basic search - embedding most similar to query "Is my privacy important to Apple?"
query = "Represent this sentence for searching relevant passages: Is my privacy important to Apple?"
query_embedding = model.encode(query)

similarities = cos_sim(query_embedding, apple_embeddings)
print('similarities:', similarities)

# Top 5 most similar sentences
sims = similarities.tolist()[0]
print(sims)
top5_idx = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:5]
print(top5_idx)
for idx in top5_idx:
    print(apple_sentences[idx])
    print(sims[idx])

similarities: tensor([[0.7168, 0.7763, 0.7256, 0.7707, 0.7446, 0.7450, 0.7270, 0.7831, 0.7217,
         0.6154, 0.7393, 0.7071, 0.7376, 0.7016, 0.6167, 0.7290, 0.6950, 0.7225,
         0.6339, 0.7057, 0.7462, 0.6570, 0.6534, 0.6698, 0.7251, 0.6416, 0.7374,
         0.7536, 0.7266, 0.6283, 0.6646, 0.6986, 0.7245, 0.6824, 0.7031, 0.7094,
         0.6967, 0.6217, 0.5411, 0.5740, 0.6961, 0.7025, 0.6801, 0.7135, 0.6254,
         0.7138, 0.6627, 0.6188, 0.6781, 0.6950, 0.7177, 0.6680, 0.5930, 0.7105,
         0.6756, 0.6984, 0.5907, 0.6496, 0.5896, 0.6010, 0.6232, 0.7501, 0.6696,
         0.6682, 0.5650, 0.7022, 0.6901, 0.6443, 0.7137, 0.6119, 0.6007, 0.6462,
         0.5571, 0.6105, 0.5957, 0.6749, 0.5900, 0.5636, 0.5823, 0.6648, 0.5210,
         0.6878, 0.7268, 0.5689, 0.7147, 0.6186, 0.7121, 0.7118, 0.7367, 0.7502,
         0.7487, 0.7167, 0.6953, 0.6492, 0.7905, 0.6956, 0.7050, 0.6355, 0.7677,
         0.7691, 0.7050, 0.6948, 0.7095, 0.6882, 0.6689, 0.7037, 0.7338, 0.6147,
         0.721

## Save Apple Embeddings to Disk to Estimate Size

In [11]:
# Pickle object
with open('../Test Embeddings/apple_embeddings.pkl', 'wb') as file:
    pickle.dump(apple_embeddings, file)