# ToS Mixedbread Embedding Creation

State-of-the-art sentence embeddings from mixedbread.ai, computed for each sentence of the Terms of Service corpus and saved as parquet files.

Model card: https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1

In [1]:
# Packages
from sentence_transformers import SentenceTransformer
# NLTK for sentence tokenization
import nltk
nltk.download('punkt')
# Torch to move to GPU
import torch
import os
import pandas as pd
#import time

  from .autonotebook import tqdm as notebook_tqdm




[nltk_data] Downloading package punkt to
[nltk_data]     /accounts/grad/ijyliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Flag for whether this is a sample run.
# When True, only the first entry of `filenames` is embedded;
# when False, every file in `filenames` is processed.
sample_run = False

## Load Model

In [3]:
# Choose the compute device first: GPU when CUDA is available, otherwise CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the mixedbread embedding model and place it on the chosen device
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1").to(device)
# Show the model architecture, followed by the device in use
print(model)
print(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
cuda


## Function to Get Sentences and Embeddings from Doc

In [4]:
def get_doc_sentences_embeddings(filename):
    '''
    Read one corpus document, split it into sentences, and embed each sentence.

    Returns a tuple (sentences, embeddings): the list of sentence strings, each
    prefixed with the company name parsed from the filename, and the result of
    model.encode() on that list.
    '''
    # Load the raw document text
    doc_path = '../Text Data/Terms of Service Corpus/text/' + filename
    with open(doc_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    # Replace every non-ASCII character with a single space
    ascii_text = ''.join(char if ord(char) < 128 else ' ' for char in raw_text)
    # Keep only lines with sentence punctuation (. ! ?) and at least one letter
    kept_lines = []
    for line in ascii_text.split('\n'):
        has_punctuation = '.' in line or '!' in line or '?' in line
        if has_punctuation and any(char.isalpha() for char in line):
            kept_lines.append(line)
    # Company name is the part of the filename before the first underscore
    prefix = filename.split('_')[0] + ': '
    # Sentence tokenize the filtered text and prepend the company name
    # Format example: "Apple: We take your privacy seriously."
    sentences = [prefix + sentence for sentence in nltk.sent_tokenize('\n'.join(kept_lines))]
    # Encode sentences and return
    embeddings = model.encode(sentences)
    return sentences, embeddings
    

## Get List of Files

In [5]:
# Read the spreadsheet of selected documents and pull out the filename column
selected_files = pd.read_excel('selected_list_of_files.xlsx')
filenames = selected_files['filename'].tolist()
# Report how many files were listed and preview the first five
print('length of filenames:', len(filenames))
print(filenames[:5])

length of filenames: 1623
['1Password_AgileBitsPrivacyPolicy.txt', '1Password_TermsofService.txt', '23andMe_CenforceProfessional.txt', '23andMe_CookiePolicy.txt', '23andMe_PrivacyPolicy.txt']


## Clear pre-existing folder

In [6]:
# Delete files in '../Text Data Embeddings' directory so stale parquet files
# from a previous run do not linger next to the new ones
output_dir = '../Text Data Embeddings'
# Create the directory if it is missing (e.g. fresh checkout) instead of
# failing in os.listdir
os.makedirs(output_dir, exist_ok=True)
for file in os.listdir(output_dir):
    file_path = os.path.join(output_dir, file)
    # os.remove cannot delete directories, so leave any subdirectory in place
    if os.path.isdir(file_path):
        continue
    os.remove(file_path)

## Encode Documents and Create Parquet Files

In [7]:
def create_parquet(filename, output_dir='../Text Data Embeddings', chunk_size=1000):
    '''
    Save parquet file(s) of sentences and embeddings for a document.

    Parameters
    ----------
    filename : str
        Document name, passed on to get_doc_sentences_embeddings.
    output_dir : str
        Directory the parquet file(s) are written to.
    chunk_size : int
        Maximum number of rows per parquet file.

    Each output row holds one sentence: a 'sentence' column, one
    'embed_element_{i}' column per embedding dimension, and a 'filename'
    column. A document with more than chunk_size rows is written as
    '<stem>_0.parquet', '<stem>_1.parquet', ...; otherwise a single
    '<stem>.parquet' is written.
    '''
    # Get sentences and embeddings
    sentences, embeddings = get_doc_sentences_embeddings(filename)
    # Sentences are the first column, followed by one column per embedding element
    df = pd.concat([pd.DataFrame({'sentence': sentences}), pd.DataFrame(embeddings)], axis=1)
    # Set column names
    df.columns = ['sentence'] + [f'embed_element_{i}' for i in range(embeddings.shape[1])]
    # add filename column
    df['filename'] = filename # note parquet compression handles this constant value well
    # Strip only the final extension. Splitting on the first '.' would truncate
    # names containing other dots (e.g. 'Foo.com_Terms.txt' -> 'Foo'), letting
    # different documents overwrite each other's output.
    stem = os.path.splitext(filename)[0]
    n_rows = df.shape[0]
    if n_rows > chunk_size:
        # If more than chunk_size rows, split into chunk_size row files
        for start in range(0, n_rows, chunk_size):
            shard_path = os.path.join(output_dir, stem + f'_{start // chunk_size}.parquet')
            df.iloc[start:start + chunk_size].to_parquet(shard_path)
    else:
        # Save to a single parquet file
        df.to_parquet(os.path.join(output_dir, stem + '.parquet'))

In [8]:
# If this is a sample run, embed only the first file; otherwise run for all files
if sample_run:
    create_parquet(filenames[0])
    # Uncomment to also exercise a large document (presumably the largest file)
    #create_parquet('Honeywell_CookieNotice.txt')
else:
    errored_files = []
    for filename in filenames:
        try:
            create_parquet(filename)
        except Exception as error:
            # Catch Exception rather than using a bare except, so interrupting
            # the kernel (KeyboardInterrupt) still stops this long-running loop.
            # Report the reason so failures can be diagnosed later.
            print(f'Error creating parquet for {filename}: {error!r}')
            errored_files.append(filename)
    # save to disk if errored files
    if len(errored_files) > 0:
        with open('errored_files.txt', 'w') as f:
            for file in errored_files:
                f.write(f'{file}\n')
        print(f'{len(errored_files)} file(s) failed; names written to errored_files.txt')
    else:
        print('No errors in creating parquet files.')

In [9]:
# Read back the parquet written for the first document as a sanity check
check_path = '../Text Data Embeddings/' + filenames[0].split('.')[0] + '.parquet'
test_df = pd.read_parquet(check_path)
test_df.head()

Unnamed: 0,sentence,embed_element_0,embed_element_1,embed_element_2,embed_element_3,embed_element_4,embed_element_5,embed_element_6,embed_element_7,embed_element_8,...,embed_element_1015,embed_element_1016,embed_element_1017,embed_element_1018,embed_element_1019,embed_element_1020,embed_element_1021,embed_element_1022,embed_element_1023,filename
0,"1Password: Last updated: July 3, 2019Y...",0.157638,0.035604,-0.132186,0.116704,0.202227,0.174079,0.023847,0.144862,0.061449,...,-0.349894,0.019318,0.302244,0.805569,0.556341,0.426748,-0.556337,-0.527679,0.086249,1Password_AgileBitsPrivacyPolicy.txt
1,1Password: This privacy policy explains the pe...,-0.513479,-0.104573,-0.18725,-0.235499,0.34794,0.123198,0.182524,-0.426857,0.014124,...,-1.274716,-0.30128,0.124505,0.074597,0.33194,-0.104734,-0.752499,0.130692,0.443632,1Password_AgileBitsPrivacyPolicy.txt
2,1Password: This privacy policy further describ...,-0.383306,-0.126484,-0.179484,-0.245022,0.331592,0.367131,-0.024387,-0.076363,0.560197,...,-0.779433,-0.310576,0.222113,0.243597,0.913351,0.318467,-0.656026,-0.199383,0.536777,1Password_AgileBitsPrivacyPolicy.txt
3,1Password: This policy applies to the interact...,-0.094262,0.144847,-0.40861,-0.216016,0.23568,0.081539,0.139658,-0.325716,-0.317717,...,-1.312816,-0.222076,0.094938,-0.063733,0.473961,0.280864,-0.03398,-0.089776,0.264955,1Password_AgileBitsPrivacyPolicy.txt
4,"1Password: After all, it is impossible to lose...",-0.31396,0.025345,0.037957,0.217967,0.260413,0.109066,-0.167638,-0.239258,0.489414,...,-0.315917,-0.435738,0.344863,0.264083,0.918006,0.565772,0.225252,-0.190404,0.488587,1Password_AgileBitsPrivacyPolicy.txt


In [10]:
# Show every sentence stored for the checked document, one per line
for sentence in test_df['sentence']:
    print(sentence)

1Password:         Last updated: July 3, 2019Your privacy is important to us.
1Password: This privacy policy explains the personal data that AgileBits collects and processes, how it processes data and for what purposes it is collected and processed.
1Password: This privacy policy further describes our commitment to preserving the privacy and security of your personal data.
1Password: This policy applies to the interactions that AgileBits has with you through your use of AgileBits 1Password products and services.Brief overview of our commitment to privacyAt AgileBits, we believe that the less information we know about you, the better.
1Password: After all, it is impossible to lose, misuse, or abuse information we don   t have.
1Password: To the extent that we have control over your data or data about you, we see ourselves as custodians of that data on your behalf.
1Password: We use your data solely to provide you with services in which you enroll.
1Password: Our business is providing 1P

In [11]:
# Load Honeywell file to check
#test_df = pd.read_parquet('../Text Data Embeddings/Honeywell_CookieNotice.parquet')
#test_df.head()