# Mixedbread Embedding Creation Test

State-of-the-art sentence embeddings from mixedbread.ai. 

Model card: https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1

In [2]:
# Packages
# SentenceTransformer loads the embedding model; cos_sim scores embeddings against
# each other by cosine similarity.
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
# NLTK for sentence tokenization
import nltk
# Fetch the 'punkt' tokenizer data needed for sentence tokenization.
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ijyli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Pre-provided Code

In [3]:
# 1. Load the mixedbread embedding model by its Hugging Face model id.
model = SentenceTransformer(
    "mixedbread-ai/mxbai-embed-large-v1",
)

In [4]:
# Retrieval queries carry this instruction prefix; the candidate passages do not.
query = 'Represent this sentence for searching relevant passages: A man is eating a piece of bread'

candidate_passages = [
    "A man is eating food.",
    "A man is eating pasta.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
]
# The query goes first so a single encode() call embeds it together with the passages.
docs = [query, *candidate_passages]

# 2. Encode
embeddings = model.encode(docs)

# Row 0 is the query embedding; every remaining row is a candidate passage.
query_vector = embeddings[0]
passage_vectors = embeddings[1:]
similarities = cos_sim(query_vector, passage_vectors)
print('similarities:', similarities)

similarities: tensor([[0.7920, 0.6369, 0.1651, 0.3621]])


In [5]:
# Show the shape of the first embedding (label and shape on separate lines).
print('embedding 1 dimension', embeddings[0].shape, sep='\n')

embedding 1 dimension
(1024,)


## A Terms of Service Document

In [6]:
# Load '../Text Data/Terms of Service Corpus/text/AppleServices_PrivacyPolicy.txt'
# as plain ASCII text.
#
# The encoding is passed explicitly so the result does not depend on the platform's
# default locale encoding (which differs between machines and can make the read fail
# or decode differently). Decoding as ASCII with errors='ignore' drops every
# non-ASCII byte while reading, which yields the same ASCII-only text as decoding
# first and then stripping non-ASCII characters in a separate step.
# NOTE: non-ASCII characters (e.g. curly apostrophes) are removed, not transliterated.
with open(
    '../Text Data/Terms of Service Corpus/text/AppleServices_PrivacyPolicy.txt',
    'r',
    encoding='ascii',
    errors='ignore',
) as file:
    apple_privacy_policy_text = file.read()

print(apple_privacy_policy_text)


         Privacy Policy The Apple Privacy Policy was updated on December 31, 2019.
Your privacy is important to Apple so weve developed a Privacy Policy that covers how we collect, use, disclose, transfer, and store your personal information.
In addition to this Privacy Policy, we provide data and privacy information imbedded in our products connected with our Data &amp.
Privacy Icon for certain features that ask to use your personal information.



You can review this information before enabling these features, in Settings related to those features and/or online at apple.com/legal/privacy.
Please take a moment to familiarize yourself with our privacy practices and contact us if you have any questions.

Your California Privacy Disclosures
Information Regarding Commercial Electronic Messages in Canada
Apple Health Research Apps Privacy Policy




 Collection and Use of PersonalInformation
Personal information is data that can be used to identify or contact a single person.
You may be a

In [7]:
# Break the policy text into a list of sentences, then display the list.
apple_sentences = nltk.tokenize.sent_tokenize(apple_privacy_policy_text)
apple_sentences

['\n         Privacy Policy The Apple Privacy Policy was updated on December 31, 2019.',
 'Your privacy is important to Apple so weve developed a Privacy Policy that covers how we collect, use, disclose, transfer, and store your personal information.',
 'In addition to this Privacy Policy, we provide data and privacy information imbedded in our products connected with our Data &amp.',
 'Privacy Icon for certain features that ask to use your personal information.',
 'You can review this information before enabling these features, in Settings related to those features and/or online at apple.com/legal/privacy.',
 'Please take a moment to familiarize yourself with our privacy practices and contact us if you have any questions.',
 'Your California Privacy Disclosures\nInformation Regarding Commercial Electronic Messages in Canada\nApple Health Research Apps Privacy Policy\n\n\n\n\n Collection and Use of PersonalInformation\nPersonal information is data that can be used to identify or contac

In [8]:
# Embed every policy sentence in one call (no query prefix: these are the passages).
apple_embeddings = model.encode(
    apple_sentences,
)

In [9]:
# Basic search: find the policy sentence whose embedding is most similar to the
# question "Will Apple disclose non-personal information?"
query = "Represent this sentence for searching relevant passages: Will Apple disclose non-personal information?"
query_embedding = model.encode(query)

# Score the query against every sentence embedding.
similarities = cos_sim(query_embedding, apple_embeddings)
print('similarities:', similarities)

# The position of the highest score indexes the best-matching sentence.
best_match_index = similarities.argmax()
print('closest sentence:', apple_sentences[best_match_index])

similarities: tensor([[0.7491, 0.7452, 0.5799, 0.5975, 0.7161, 0.5799, 0.6985, 0.8029, 0.7492,
         0.4846, 0.5817, 0.7509, 0.7690, 0.7444, 0.4755, 0.7464, 0.5463, 0.7225,
         0.4415, 0.5295, 0.5639, 0.4770, 0.4720, 0.5261, 0.5439, 0.6310, 0.5517,
         0.7259, 0.7424, 0.5071, 0.6814, 0.7336, 0.7451, 0.6593, 0.7458, 0.5512,
         0.5570, 0.4902, 0.4065, 0.5108, 0.5967, 0.6263, 0.7059, 0.7285, 0.4398,
         0.6263, 0.5184, 0.4475, 0.4946, 0.6132, 0.6140, 0.6317, 0.3748, 0.6054,
         0.5351, 0.6046, 0.6006, 0.6154, 0.5238, 0.5455, 0.5272, 0.7136, 0.6398,
         0.6399, 0.3473, 0.5087, 0.5976, 0.5468, 0.6450, 0.4608, 0.4435, 0.5987,
         0.4020, 0.4719, 0.3841, 0.7243, 0.5745, 0.3490, 0.4239, 0.4974, 0.3927,
         0.5292, 0.7708, 0.4071, 0.7605, 0.6151, 0.7062, 0.7182, 0.7097, 0.7834,
         0.5952, 0.6022, 0.5180, 0.5337, 0.7264, 0.6474, 0.6533, 0.5999, 0.7447,
         0.6034, 0.5785, 0.4725, 0.6950, 0.5155, 0.6107, 0.6440, 0.5519, 0.4468,
         0.632

In [10]:
# Basic search: find the policy sentence whose embedding is most similar to the
# question "Is my privacy important to Apple?"
query = "Represent this sentence for searching relevant passages: Is my privacy important to Apple?"
query_embedding = model.encode(query)

# Score the query against every sentence embedding.
similarities = cos_sim(query_embedding, apple_embeddings)
print('similarities:', similarities)

# The position of the highest score indexes the best-matching sentence.
best_match_index = similarities.argmax()
print('closest sentence:', apple_sentences[best_match_index])

similarities: tensor([[0.7330, 0.7747, 0.5608, 0.6449, 0.7320, 0.5847, 0.6802, 0.7835, 0.7294,
         0.4360, 0.5574, 0.7158, 0.7557, 0.6944, 0.4868, 0.7292, 0.4952, 0.7070,
         0.4632, 0.5287, 0.5784, 0.4681, 0.4740, 0.4850, 0.5426, 0.6320, 0.5556,
         0.7457, 0.7216, 0.4466, 0.6666, 0.6938, 0.7292, 0.6594, 0.7039, 0.5186,
         0.4939, 0.4694, 0.3765, 0.4503, 0.5202, 0.5082, 0.6460, 0.6930, 0.4256,
         0.5471, 0.4741, 0.4306, 0.4952, 0.6000, 0.5353, 0.6661, 0.4131, 0.5382,
         0.5176, 0.5234, 0.5947, 0.6487, 0.5077, 0.5516, 0.5144, 0.7494, 0.6675,
         0.6657, 0.3886, 0.5430, 0.6360, 0.6151, 0.6994, 0.5161, 0.4697, 0.6436,
         0.3968, 0.4402, 0.3968, 0.6657, 0.5808, 0.3824, 0.4204, 0.5316, 0.4105,
         0.5169, 0.7332, 0.3455, 0.7075, 0.6025, 0.7124, 0.7185, 0.7390, 0.7603,
         0.5531, 0.5490, 0.4486, 0.4834, 0.7886, 0.6823, 0.6995, 0.5960, 0.7606,
         0.5922, 0.5353, 0.5324, 0.6650, 0.4620, 0.6097, 0.6936, 0.5612, 0.5018,
         0.702