# Preprocessing and Embeddings

In [7]:
import os
import openai

In [8]:
# Read the API key from the environment instead of hard-coding it in the notebook.
# The subscript lookup raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

## Preprocessing:

In [10]:
import pandas as pd
import re
import string

In [11]:
# Load the scraped sections; the cells below read the 'url', 'title', 'section' and 'text' columns.
df = pd.read_csv('2_scraped_sections.csv')

In [12]:
## Data Cleaning
def remove_unwanted_text(df, unwanted_text):
    """
    Strip `unwanted_text` from the start of every entry in df['text'].

    Only a leading occurrence is removed; the same text appearing later in
    an entry is kept. Non-string cells are left untouched.

    Parameters:
        df: DataFrame with a 'text' column. The column is overwritten in place.
        unwanted_text: prefix to remove.

    Returns:
        The same DataFrame object, with the cleaned 'text' column.
    """
    def _strip_prefix(value):
        # Guard on str so that missing values (e.g. NaN floats) do not raise AttributeError.
        if isinstance(value, str) and value.startswith(unwanted_text):
            return value[len(unwanted_text):]
        return value

    df['text'] = df['text'].apply(_strip_prefix)
    return df

def search_and_remove_patterns(df, pattern_list, verbose=False):
    """
    Repeatedly searches every entry of df['text'] for the regular expressions
    in `pattern_list` and deletes what they match.

    For each match, every occurrence of the matched string is replaced by ''
    and the text is stripped. The patterns are re-applied until none of them
    matches any more. A summary of the removed strings is printed at the end.

    Parameters:
        df: DataFrame with a 'text' column of strings. The column is
            overwritten in place. Rows are processed by position, so any
            index (not only the default 0..n-1) is supported.
        pattern_list: iterable of regular expressions accepted by re.search.
        verbose: when True, prints a header per row and every match found.

    Returns:
        The same DataFrame object, with the cleaned 'text' column.
    """
    words_removed = []
    cleaned_texts = []
    for i, text in enumerate(df['text']):
        if verbose: print(f'\n\n**************Text {i}****************')
        matched_search = True
        while matched_search:
            matched_search = False
            for pattern in pattern_list:
                result = re.search(pattern, text, flags=0)
                # An empty match removes nothing; treating it as a hit would loop forever.
                if result is None or result[0] == '':
                    continue
                if verbose: print(f'match: {result[0]}')
                matched_search = True
                words_removed.append(result[0])
                text = text.replace(result[0], '').strip() # strip removes whitespaces accumulated at the start/end of the string
        cleaned_texts.append(text)
    # Assign positionally so the result does not depend on the index labels.
    df['text'] = cleaned_texts
    print(f'{len(words_removed)} words were removed ({len(set(words_removed))} different words): \n{set(words_removed)}')
    return df


def preprocessing_transformations(text):
    r"""
    Cleans one text entry.

    Removes, in this order:
      * PDF link annotations: '(PDF)', '(PDF, <size> KB)' and '(PDF, <size> MB)',
        where <size> may be an integer or a decimal number;
      * numeric reference tags matching r'\[\d+\]', e.g. '[12]';
      * excessive whitespace: every run matching r'\s+' (line breaks included)
        becomes a single space, and the result is stripped.

    Parameters:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    # One pattern covers the bare tag and both size units; the decimal part is optional
    # so that integer sizes such as '(PDF, 325 KB)' are removed too.
    pdf_pattern = r'\(PDF(?:, \d+(?:\.\d*)? (?:KB|MB))?\)'
    # Remove PDF annotations
    text = re.sub(pdf_pattern, '', text)
    # Remove reference tags
    text = re.sub(r'\[\d+\]', '', text)
    # Collapse whitespace; '\s' also matches '\n', so no separate line-break pass is needed
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def preprocess(df):
    '''
    Prepares the scraped frame for embedding.

    Adds an 'aggregate_title' column (the 'title' and 'section' columns joined
    by a single space) and replaces the 'text' column with the output of
    preprocessing_transformations applied to each entry.
    The frame is modified in place and also returned.
    '''
    title_with_section = df['title'] + ' ' + df['section']
    df['aggregate_title'] = title_with_section
    cleaned_text = df['text'].apply(preprocessing_transformations)
    df['text'] = cleaned_text
    return df

# Clean the text, then keep only the columns used downstream, in this order.
df_processed = preprocess(df).loc[:, ['url', 'title', 'section', 'aggregate_title', 'text']]
# Save the cleaned sections without the row index.
df_processed.to_csv('4_clean_text.csv', index=False)

## Encoding:

In [13]:
#!pip install openai
#!pip install tiktoken
import tiktoken

Using tiktoken, OpenAI's tokenizer, I am going to calculate the number of tokens I will be feeding to the embedding model per request. Each model has a maximum token limit; for text-embedding-ada-002 this limit is 8191 tokens.

In [14]:
def encoding(text):
    """Tokenizes `text` with tiktoken's 'cl100k_base' encoding and returns the encoded tokens."""
    # The encoding is named explicitly; tiktoken.encoding_for_model("gpt-4") would choose it for us
    tokenizer = tiktoken.get_encoding('cl100k_base')
    return tokenizer.encode(text)

def decoding(tokenized_text):
    """Turns tokens produced with the 'cl100k_base' encoding back into text."""
    tokenizer = tiktoken.get_encoding('cl100k_base')
    return tokenizer.decode(tokenized_text)

# Maximum number of tokens per request for text-embedding-ada-002 (see the note above this cell).
max_token_limit = 8191

def get_tokens(df):
    """
    Adds a 'tokenized_text' column holding the tokens of each 'text' entry.

    Every entry of df['text'] is passed through encoding(). The frame is
    modified in place and also returned.
    """
    df['tokenized_text'] = [encoding(entry) for entry in df['text']]
    return df

def get_n_tokens(df):
    """
    Adds an 'n_tokens' column with the length of each 'tokenized_text' entry.

    The frame is modified in place and also returned.
    """
    df['n_tokens'] = [len(tokens) for tokens in df['tokenized_text']]
    return df

In [15]:
# Tokenize a copy of the cleaned frame; this creates the 'tokenized_text' column
df_encoded = df_processed.copy().pipe(get_tokens)
# Record the number of tokens per row (length of the tokenized text) in 'n_tokens'
df_encoded = df_encoded.pipe(get_n_tokens)

Now we need to split the text of those requests whose number of tokens exceeds the maximum token limit.

In [16]:
def sliding_window_2(df, max_token_limit=8191):
    '''
    To avoid hitting the Max Token Limit error when using embedding models such as
    OpenAI's Ada-02, this function splits every document that is over the limit
    into two windows, each stored in its own row.

    Window 0 keeps the first `max_token_limit` tokens and window 1 keeps the last
    `max_token_limit` tokens. For documents up to twice the limit the two windows
    overlap. For longer documents they do not overlap and the tokens in between
    are dropped; a warning is printed in that case.

    Parameters:
        df: DataFrame with 'text', 'tokenized_text' and 'n_tokens' columns.
            It is modified in place. New rows are inserted under the label
            `i + 0.5`, so the index labels must be numeric.
        max_token_limit: maximum number of tokens allowed per row.

    Returns:
        A DataFrame with a 'window' column (0 for the first or only window,
        1 for the second). When windows were created, the index is reset and
        'tokenized_text' / 'n_tokens' are recomputed for every row.
    '''
    # Create the column up front so the returned schema is the same whether or not any split happens
    df['window'] = 0
    index_of_sections_over_limit = df[df['n_tokens'] > max_token_limit].index.tolist()
    if len(index_of_sections_over_limit) > 0:
        print(f'{len(index_of_sections_over_limit)} documents exceded the Max Token Limit, w/ indexes: {index_of_sections_over_limit}')
        # Prepare df to slide the windows by duplicating the rows of interest
        for i in index_of_sections_over_limit:
            if df.loc[i, 'n_tokens'] > 2 * max_token_limit:
                # Two windows cannot cover the whole document; make the loss visible
                print(f'Warning: document {i} has more than {2 * max_token_limit} tokens, the tokens between the two windows are dropped.')
            # Create windows in different rows of course.
            df.loc[i + 0.5] = df.loc[i].copy() # Duplicate row
            df.loc[i + 0.5, 'window'] = 1 # Mark Window index
            # Decode truncated Tokens to get truncated text
            df.loc[i, 'text'] = decoding(df.loc[i, 'tokenized_text'][0:max_token_limit]) # window 0
            df.loc[i + 0.5, 'text'] = decoding(df.loc[i + 0.5, 'tokenized_text'][-max_token_limit:]) # window 1
            # Update Encodings for the truncated text using df.at because we are dealing with a column of lists
            df.at[i, 'tokenized_text'] = encoding(df.loc[i, 'text']) # window 0
            df.at[i + 0.5, 'tokenized_text'] = encoding(df.loc[i + 0.5, 'text']) # window 1

        # Reset indexes because we have inserted rows using float index (i+0.5)
        df = df.sort_index().reset_index(drop=True)
        # Update the length of the resulting encodings as we created 2 windows for the same text
        df = get_tokens(df)
        df = get_n_tokens(df)
    else:
        print('No windows needed.')

    return df

# Pass a copy so that df_encoded keeps the original rows; the default max_token_limit (8191) is used.
df_windows = sliding_window_2(df_encoded.copy())

2 documents exceded the Max Token Limit, w/ indexes: [37, 46]


We don't really need to tokenize the text beforehand, as tokenization is already integrated in the embedding model.

We can parallelize the embedding transformation using swifter.

In [18]:
#!pip install swifter
import swifter

In [24]:
def get_embedding(text, model="text-embedding-ada-002"):
    """
    Requests an embedding for `text` from openai.Embedding.create.

    Line breaks in `text` are replaced by spaces before the request. Returns
    the 'embedding' field of the first item in the response's 'data'.
    """
    single_line_text = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line_text], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

# Embed the aggregated titles (one request per row)
df_windows['embedded_title'] = df_windows['aggregate_title'].swifter.apply(
    lambda title: get_embedding(title, model='text-embedding-ada-002')
)
# Embed the section texts (one request per row)
df_windows['embedded_text'] = df_windows['text'].swifter.apply(
    lambda body: get_embedding(body, model='text-embedding-ada-002')
)

Pandas Apply:   0%|          | 0/198 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/198 [00:00<?, ?it/s]

In [30]:
# Show the resulting frame for a visual check of the new embedding columns.
df_windows

Unnamed: 0,url,title,section,aggregate_title,text,tokenized_text,n_tokens,window,embedded_title,embedded_text
0,https://www.uscis.gov/working-in-the-united-st...,Working in the United States,Topics,Working in the United States Topics,Many noncitizens want to come to the United St...,"[8607, 2536, 54961, 30060, 1390, 311, 2586, 31...",651,0,"[-0.008848007768392563, -0.02041037194430828, ...","[0.00019687559688463807, -0.003950582817196846..."
1,https://www.uscis.gov/working-in-the-united-st...,Petition Process Overview,"Form I-129, Petition for Nonimmigrant Worker","Petition Process Overview Form I-129, Petition...",If you would like to come to the United States...,"[2746, 499, 1053, 1093, 311, 2586, 311, 279, 3...",267,0,"[-0.0002513806102797389, 0.008037553168833256,...","[0.00017619578284211457, -0.00466615054756403,..."
2,https://www.uscis.gov/working-in-the-united-st...,Petition Process Overview,"Form I-140, Immigrant Petition for Alien Workers","Petition Process Overview Form I-140, Immigran...","Below is the list of Form I-140, Immigrant Pet...","[39314, 374, 279, 1160, 315, 3459, 358, 12, 68...",136,0,"[0.0035785867366939783, 0.005471774842590094, ...","[-0.0015572593547403812, 0.007850014604628086,..."
3,https://www.uscis.gov/working-in-the-united-st...,Petition Process Overview,"Form I-360, Petition for Amerasian, Widow(er),...","Petition Process Overview Form I-360, Petition...","Below is the list of Form I-360, Petition for ...","[39314, 374, 279, 1160, 315, 3459, 358, 12, 68...",39,0,"[-0.005390014499425888, 0.009419900365173817, ...","[-0.013340615667402744, 0.001092378981411457, ..."
4,https://www.uscis.gov/working-in-the-united-st...,Petition Process Overview,"Form I-526, Immigrant Petition by Alien Investor","Petition Process Overview Form I-526, Immigran...","Below is the list of Form I-526, Immigrant Pet...","[39314, 374, 279, 1160, 315, 3459, 358, 12, 22...",167,0,"[0.0012103705666959286, -0.01343610230833292, ...","[-0.008694147691130638, -0.011578872799873352,..."
...,...,...,...,...,...,...,...,...,...,...
193,https://www.uscis.gov/working-in-the-united-st...,EB-5 What's New,EB-5 What's New,EB-5 What's New EB-5 What's New,This page provides the latest information on t...,"[2028, 2199, 5825, 279, 5652, 2038, 389, 279, ...",1459,0,"[-0.019623059779405594, -0.01592286303639412, ...","[-0.006295501254498959, -0.0017567670438438654..."
194,https://www.uscis.gov/working-in-the-united-st...,EB-5 Regional Center Compliance Reviews,Compliance Review Team Tasks,EB-5 Regional Center Compliance Reviews Compli...,This page in Simplified Chinese. Regional cent...,"[2028, 2199, 304, 62342, 1908, 8620, 13, 26361...",246,0,"[-0.0015812900383025408, -0.023748794570565224...","[0.011764729395508766, -0.0029344288632273674,..."
195,https://www.uscis.gov/working-in-the-united-st...,EB-5 Regional Center Compliance Reviews,Preparing for a Compliance Review,EB-5 Regional Center Compliance Reviews Prepar...,"Before the site assessment, regional centers s...","[10438, 279, 2816, 15813, 11, 15481, 19169, 12...",128,0,"[0.007674211170524359, -0.014007426798343658, ...","[-0.003573805559426546, 0.009384548291563988, ..."
196,https://www.uscis.gov/working-in-the-united-st...,EB-5 Regional Center Compliance Reviews,After Completing the Review,EB-5 Regional Center Compliance Reviews After ...,The review team will document the results in a...,"[791, 3477, 2128, 690, 2246, 279, 3135, 304, 2...",63,0,"[0.0009468668140470982, -0.013154685497283936,...","[0.004156357143074274, -0.0028048832900822163,..."


In [32]:
# Persist the frame together with its embeddings.
# Unlike the '4_clean_text.csv' export, index=False is not passed here, so the row index
# is written as an extra first column.
# NOTE(review): confirm whether the code reading '5_embeddings.csv' expects that index column.
df_windows.to_csv('5_embeddings.csv')