# ToS Mixedbread Embedding Creation

State-of-the-art sentence embeddings from mixedbread.ai. 

https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1.

In [52]:
# Packages
from sentence_transformers import SentenceTransformer
# NLTK for sentence tokenization
import nltk
nltk.download('punkt')
# Torch to move to GPU
import torch
import os
import pandas as pd
import time

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ijyli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
# Flag for if this is a sample run or not
sample_run = True

## Load Model

In [54]:
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# Move to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
# Print model
print(model)
# Print device
print(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
cpu


## Function to Get Sentences and Embeddings from Doc

In [55]:
def get_doc_sentences_embeddings(filename):
    '''
    Get sentence embeddings from a document.
    '''
    # Read file
    with open('../Text Data/Terms of Service Corpus/text/' + filename, 'r', encoding='utf-8') as f:
        doc = f.read()
    # Strip all unicode characters
    doc = ''.join([char if ord(char) < 128 else ' ' for char in doc])
    # Strip lines that do not contain a period, exclamation point, or question mark, and at least one letter
    doc = '\n'.join([line for line in doc.split('\n') if ('.' in line or '!' in line or '?' in line) and any([char.isalpha() for char in line])])
    print('number of lines in doc:', len(doc.split('\n')))
    # Parse company name by taking the first part of filename before underscore
    company_name = filename.split('_')[0]
    # Sentence tokenize the document
    sentences = nltk.sent_tokenize(doc)
    # Append company name to the beginning of each sentence
    sentences = [company_name + ' ' + sentence for sentence in sentences]
    # Encode sentences and return
    embeddings = model.encode(sentences)
    return sentences, embeddings
    

## Get List of files

In [56]:
filenames = os.listdir('../Text Data/Terms of Service Corpus/text')
print('length of filenames:', len(filenames))
print(filenames[:5])

length of filenames: 9491
['115media_CookiePolicy.txt', '115media_privacy.txt', '115media_Terms.txt', '11soundboards_PrivacyPolicy.txt', '123tvlive_PrivacyPolicy.txt']


## Clear pre-existing folder

In [62]:
# Delete files in '../Text Data Embeddings' directory
for file in os.listdir('../Text Data Embeddings'):
    os.remove('../Text Data Embeddings/' + file)

## Encode Documents and Create Parquet Files

In [57]:
def create_parquet(filename):
    '''
    Save parquet file of sentences and embeddings for a document.
    '''
    # Get sentences and embeddings
    # Start timer
    start = time.time()
    sentences, embeddings = get_doc_sentences_embeddings(filename)
    # End timer
    end = time.time()
    print('Time to get embeddings:', end - start)
    # New timer
    start = time.time()
    # Create dataframe
    df = pd.DataFrame()
    # Sentences are a column
    df['sentence'] = sentences
    # Add embeddings array on to the dataframe
    df = pd.concat([df, pd.DataFrame(embeddings)], axis=1)
    # Set column names
    column_names = ['sentence'] + [f'embed_element_{i}' for i in range(embeddings.shape[1])]
    df.columns = column_names
    # add filename column
    df['filename'] = filename # note parquet compression handles this constant value well
    # Copy dataframe 40 times to test sharding
    #df = pd.concat([df]*40, ignore_index=True)
    # If more than 1000 rows, split into 1000 row chunks
    if df.shape[0] > 1000:
        for i in range(0, df.shape[0], 1000):
            df.iloc[i:i+1000].to_parquet('../Text Data Embeddings/' + filename.split('.')[0] + f'_{i // 1000}.parquet')
    else:
        # Save to parquet
        df.to_parquet('../Text Data Embeddings/' + filename.split('.')[0] + '.parquet')
    # End timer
    end = time.time()
    print('Time to save parquet:', end - start)

In [58]:
# If this is a sample run, run for one filename + the largest file, otherwise, run for all
if sample_run:
    create_parquet(filenames[0])
    #create_parquet('Honeywell_CookieNotice.txt')

number of lines in doc: 47
Time to get embeddings: 8.653027296066284
Time to save parquet: 0.8176708221435547


In [59]:
# Load a file to check
test_df = pd.read_parquet('../Text Data Embeddings/' + filenames[0].split('.')[0] + '.parquet')
test_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../Text Data Embeddings/115media_CookiePolicy.parquet'

In [None]:
# Print all values of sentence column
for value in test_df['sentence'].values:
    print(value)

115media Careers &amp.
115media Why Work for 1105?
115media Apply Today!
115media How Can We Help?
115media 1105 Media uses industry standard cookie technology (examples below) in order to maximize customer experience and operational efficiency.&nbsp.
115media When you visit one of 1105 Media's websites, we may send you a cookie.
115media A cookie is a small file which is placed on your computer or device.
115media These cookies are essential for the operation of our websites.
115media Without the use of these cookies, parts of our websites would not function.
115media For example, we use cookies to help us identify which of our readers have previously registered in order to access premium content on our websites.
115media We use these types of cookies to monitor our websites' performance and how users may interact with it.
115media These cookies provide us with information that helps us provide better products to our users and also to identify any areas that may need maintenance.
115m

In [None]:
# Load Honeywell file to check
test_df = pd.read_parquet('../Text Data Embeddings/Honeywell_CookieNotice.parquet')
test_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../Text Data Embeddings/Honeywell_CookieNotice.parquet'