# NLTK
___


#### 📦 One-Time Setup (NLTK Resources)

In [None]:
# NLTK resources only need to be downloaded once per environment
import nltk
nltk.download( 'punkt' )      # tokenizer models used by word_tokenize
nltk.download( 'stopwords' )  # stopword lists read via nltk.corpus.stopwords
nltk.download( 'wordnet' )    # lexical database behind WordNetLemmatizer
nltk.download( 'omw-1.4' )    # Open Multilingual Wordnet data

##### Load Dependencies

In [8]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import re
import os
from pathlib import Path
import openai
import pandas as pd
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sentence_transformers
import sqlite3
import numpy as np
import pickle
import unicodedata
from bs4 import BeautifulSoup
from textblob import Word
from collections import Counter


# Text Processing

#### ✅ Checklist

| Step | Task                     | Function/Library               |
|------|--------------------------|--------------------------------|
| 1    | Load Text                | `open()`, `pandas.read_csv()` |
| 2    | Convert to Lowercase     | `.lower()`                     |
| 3    | Remove Punctuation       | `string.punctuation`           |
| 4    | Remove Numbers           | `re.sub()`                     |
| 5    | Trim Whitespaces         | `' '.join()`                   |
| 6    | Tokenization             | `nltk.word_tokenize()`         |
| 7    | Remove Stopwords         | `nltk.corpus.stopwords`        |
| 8    | Lemmatization/Stemming   | `WordNetLemmatizer`, `SnowballStemmer` |
| 9    | Reconstruct Clean Text   | `' '.join()`                   |
| 10   | (Optional) Spellcheck    | `TextBlob.correct()`           |
| 11   | Vectorization for ML     | `TfidfVectorizer`, `CountVectorizer`, Word Embeddings |


___

### 1.  Load File

In [None]:
# === Load Raw Text ===
file_path = '<url to file>'
_rawtext = ''


def load_text( file_path ):
    """
    Read a whole UTF-8 text file and return its contents.

    Parameters:
    -----------
    file_path : str
        Path of the file to read.

    Returns:
    --------
    str
        The complete file contents. The module-level `_rawtext` is not
        modified; the text is only returned.
    """
    with open( file_path, mode='r', encoding='utf-8' ) as handle:
        contents = handle.read( )
    return contents


### 2.  Clean Whitespace
- Consecutive whitespace reduced to a single space
- Leading/trailing spaces removed
- Blank lines removed

In [None]:
def clean_whitespace( text: str ) -> str:
    """
    Collapse runs of spaces/tabs and drop blank lines, keeping line breaks.

    Parameters:
    -----------
    text : str
        The raw input text.

    Returns:
    --------
    str
        The text with each run of spaces/tabs reduced to one space, every
        line stripped of leading/trailing whitespace, empty lines removed,
        and the surviving lines joined with a single newline.
    """
    # Only spaces and tabs are collapsed here; newlines must survive so the
    # text can still be processed line by line below.
    collapsed = re.sub( r'[ \t]+', ' ', text )

    stripped_rows = ( row.strip( ) for row in collapsed.splitlines( ) )

    return '\n'.join( row for row in stripped_rows if row )

### 3. Normalize

In [None]:
def normalize( text: str ) -> str:
    """
    Lowercase the text, strip accents, and squeeze all whitespace.

    Parameters:
    -----------
    text : str
        The raw input text.

    Returns:
    --------
    str
        Lowercased ASCII text with every whitespace run (including
        newlines) replaced by one space and no leading/trailing whitespace.
        Characters with no ASCII form after decomposition are dropped.
    """
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then discards the marks (e.g. 'é' -> 'e').
    decomposed = unicodedata.normalize( 'NFKD', text.lower( ) )
    ascii_only = decomposed.encode( 'ascii', 'ignore' ).decode( 'utf-8' )

    return re.sub( r'\s+', ' ', ascii_only ).strip( )

### 4. Remove Punctuation

In [None]:
def remove_punctuation( text: str ) -> str:
    """
    Delete every ASCII punctuation character from the text.

    Parameters:
    -----------
    text : str
        The input text.

    Returns:
    --------
    str
        The text without any character listed in `string.punctuation`.
        Non-ASCII punctuation (e.g. curly quotes) is left untouched.
    """
    # Mapping a character to None in a translation table deletes it
    deletion_table = str.maketrans( dict.fromkeys( string.punctuation ) )
    return text.translate( deletion_table )

### 5. Trim Whitespace

- Removes leading and trailing whitespace
- Replaces multiple internal spaces with a single space

In [None]:
def trim_whitespace( text: str ) -> str:
    """
    Strip the ends of the text and squeeze internal whitespace.

    Parameters:
    -----------
    text : str
        The input text, possibly padded or containing whitespace runs.

    Returns:
    --------
    str
        The text with no leading/trailing whitespace and every internal run
        of whitespace (spaces, tabs, newlines) replaced by a single space.
    """
    return re.sub( r'\s+', ' ', text.strip( ) )

### 6. Lemmatize
- Reduces words to their base or root form.
- Converts text to lowercase
- Tokenizes the text into words
- Lemmatizes each token using WordNetLemmatizer
- Reconstructs the lemmatized tokens into a single string

In [None]:
def lemmatize(text: str) -> str:
    """
    Lowercase, tokenize, and lemmatize the text with WordNet.

    Parameters:
    -----------
    text : str
        The input text.

    Returns:
    --------
    str
        The lemmatized tokens joined with single spaces. Each token is
        lemmatized with the lemmatizer's default part of speech.
    """
    wordnet = WordNetLemmatizer()

    words = word_tokenize( text.lower( ) )

    return ' '.join( map( wordnet.lemmatize, words ) )

### 7. Tokenize

- Converts text to lowercase
- Uses NLTK's word_tokenize to split the text into words and punctuation tokens

In [None]:
def tokenize( text: str ) -> list:
    """
    Lowercase the text and split it into NLTK word tokens.

    Parameters:
    -----------
    text : str
        The raw input text.

    Returns:
    --------
    list
        The tokens produced by `word_tokenize` on the lowercased text;
        punctuation marks come back as tokens of their own.
    """
    return word_tokenize( text.lower( ) )


### 8. Remove Special Characters
- Retains only alphanumeric characters and whitespace
- Removes symbols like @, #, $, %, &, etc.
- Preserves letters, numbers, and spaces

In [None]:
def remove_special( text: str ) -> str:
    """
    Keep only ASCII letters, digits, and whitespace.

    Parameters:
    -----------
    text : str
        The input text, possibly containing symbols such as @, #, $, %.

    Returns:
    --------
    str
        The text with every character outside A-Z, a-z, 0-9 and whitespace
        deleted. Accented and other non-ASCII letters are removed as well.
    """
    disallowed = re.compile( r'[^A-Za-z0-9\s]' )
    return disallowed.sub( '', text )

### 9. Remove HTML
- Parses the text as HTML
- Extracts and returns only the visible content without tags

In [None]:
def remove_html_tags( text: str ) -> str:
    """
    Strip HTML markup and return only the text content.

    Parameters:
    -----------
    text : str
        The input text containing HTML tags.

    Returns:
    --------
    str
        The text of the parsed document, with each text fragment stripped
        and the fragments joined by a single space.
    """
    # 'html.parser' is the parser bundled with the Python standard library.
    # The previous name, 'raw_html.parser', is not a parser Beautiful Soup
    # knows, so construction failed before any text was extracted.
    soup = BeautifulSoup( text, 'html.parser' )
    cleaned_text = soup.get_text( separator=' ', strip=True )

    return cleaned_text

### 10. Chunk Tokens
- Accepts text that has already been tokenized (a list of token lists)
- Flattens the tokens and groups them into consecutive chunks of `chunk_size`
- Returns a list of chunks, each a list of tokens

In [None]:
def chunk_tokens( text: list, chunk_size: int = 50) -> list:
    """
    Group already-tokenized text into fixed-size chunks of tokens.

    Parameters:
    -----------
    text : list
        Either a flat list of tokens (strings) or a list of token lists,
        e.g. one list per sentence. Nested lists are flattened in order.

    chunk_size : int, optional (default=50)
        Number of tokens per chunk; must be positive.

    Returns:
    --------
    list
        A list of chunks, each a list of at most `chunk_size` tokens. Only
        the last chunk may be shorter. Empty input yields an empty list.

    Raises:
    -------
    ValueError
        If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError( 'chunk_size must be a positive integer' )

    # A string element is a token in its own right. Iterating over it, as a
    # naive flatten does, would split it into single characters.
    all_tokens = [ ]
    for item in text:
        if isinstance( item, str ):
            all_tokens.append( item )
        else:
            all_tokens.extend( item )

    return [
        all_tokens[ i:i + chunk_size ]
        for i in range( 0, len( all_tokens ), chunk_size )
    ]

### 11. Chunk Text

- Converts text to lowercase
- Tokenizes text using NLTK's word_tokenize
- Breaks tokens into chunks of a specified size
- Optionally joins tokens into strings (for transformer models)

In [None]:
def chunk_text( text: str, chunk_size: int = 50, return_as_string: bool = True ) -> list:
    """
    Lowercase and tokenize the text, then cut the tokens into chunks.

    Parameters:
    -----------
    text : str
        The cleaned input text.

    chunk_size : int, optional (default=50)
        Number of tokens per chunk.

    return_as_string : bool, optional (default=True)
        If True, each chunk is its tokens joined with single spaces;
        otherwise each chunk is a list of tokens.

    Returns:
    --------
    list
        The chunks in document order; only the last may be shorter than
        `chunk_size`.
    """
    # Requests the 'punkt' tokenizer resource on every call
    nltk.download( 'punkt', quiet=True )

    words = word_tokenize( text.lower( ) )
    starts = range( 0, len( words ), chunk_size )

    if not return_as_string:
        return [ words[ start:start + chunk_size ] for start in starts ]

    return [ ' '.join( words[ start:start + chunk_size ] ) for start in starts ]

### 12. Remove Errors
- Converts text to lowercase
- Tokenizes the text into words
- Filters out words not recognized as valid English using TextBlob
- Returns a string with only correctly spelled words

In [None]:
def remove_errors( text: str ) -> str:
    """
    Drop tokens that TextBlob's spellchecker does not recognise as words.

    Parameters:
    -----------
    text : str
        The input text to clean.

    Returns:
    --------
    str
        The lowercased tokens that the spellchecker returns unchanged with
        confidence above 0.9, joined with single spaces.
    """
    # Requests the 'punkt' tokenizer resource on every call
    nltk.download('punkt', quiet=True)

    tokens = word_tokenize(text.lower())

    cleaned_tokens = []
    for word in tokens:
        # spellcheck() returns (suggestion, confidence) pairs, best first.
        # High confidence alone is not enough: a misspelling with one
        # obvious correction also scores high. The token is valid only when
        # the best suggestion is the token itself.
        suggestion, confidence = Word(word).spellcheck()[0]
        if suggestion == word and confidence > 0.9:
            cleaned_tokens.append(word)

    return ' '.join(cleaned_tokens)

### 13. Correct Errors
- Converts text to lowercase
- Tokenizes the text into words
- Applies spelling correction using TextBlob
- Reconstructs and returns the corrected text

In [None]:
def correct_errors(text: str) -> str:
    """
    Replace each token with TextBlob's spelling correction for it.

    Parameters:
    -----------
    text : str
        The input text, possibly containing spelling mistakes.

    Returns:
    --------
    str
        The corrected, lowercased tokens joined with single spaces.
    """
    # Requests the 'punkt' tokenizer resource on every call
    nltk.download('punkt', quiet=True)

    words = word_tokenize(text.lower())

    return ' '.join(str(Word(candidate).correct()) for candidate in words)

### 14.  Remove Headers
- Assumes repeated lines at the top or bottom (like titles, page numbers)
- Removes lines that are common across multiple pages (heuristic)
- Returns cleaned text with main body content only

In [None]:
def remove_headers_footers(text: str) -> str:
    """
    Remove lines that repeat often enough to look like page headers/footers.

    A line counts as repeated when it occurs more times than
    max(1, 1% of the non-blank line count). Every occurrence of such a line
    is dropped, wherever it appears.

    Parameters:
    -----------
    text : str
        The input text, possibly containing repeated headers/footers.

    Returns:
    --------
    str
        The remaining lines, stripped, with blank lines removed, joined by
        newlines.
    """
    stripped = (raw.strip() for raw in text.splitlines())
    rows = [row for row in stripped if row]

    # In short documents the cutoff is 1, so any line seen twice is dropped
    cutoff = max(1, int(len(rows) * 0.01))
    frequency = Counter(rows)

    return '\n'.join(row for row in rows if frequency[row] <= cutoff)


### 15.  Remove Formatting
- Strips HTML tags
- Removes Markdown syntax (e.g., *, #, [], etc.)
- Collapses whitespace (newlines, tabs)
- Optionally removes special characters for clean unformatted text

In [None]:
def remove_formatting( text: str ) -> str:
    """
    Remove HTML tags, Markdown syntax, and excess whitespace from text.

    Parameters:
    -----------
    text : str
        The formatted input text.

    Returns:
    --------
    str
        The text on a single line with formatting removed. Markdown links
        and images are deleted entirely, link text included. The characters
        ` _ * # ~ > - are removed wherever they occur, so hyphens and
        underscores inside ordinary words disappear too.
    """
    # 'html.parser' is the stdlib-backed parser; 'raw_html.parser' is not a
    # parser Beautiful Soup knows, so the original call failed.
    text = BeautifulSoup( text, 'html.parser' ).get_text( separator=' ', strip=True )

    # Images must go before links: '![alt](url)' contains a link-shaped
    # '[alt](url)', and removing that first leaves a stray '!'.
    text = re.sub( r'!\[.*?\]\(.*?\)', '', text )        # Markdown images
    text = re.sub( r'\[.*?\]\(.*?\)', '', text )         # Markdown links
    text = re.sub( r'[`_*#~>-]', '', text )              # Markdown chars

    # \s+ also covers \r, \n and \t, so one pass flattens everything
    text = re.sub( r'\s+', ' ', text ).strip( )

    return text

### 16. Remove Stopwords
- Tokenizes the input text
- Removes common stopwords (e.g., "the", "is", "and", etc.)
- Returns the text with only meaningful words

In [None]:
def remove_stopwords(text: str) -> str:
    """
    Remove English stopwords and non-alphanumeric tokens from the text.

    Parameters:
    -----------
    text : str
        The input text.

    Returns:
    --------
    str
        The lowercased tokens that are purely alphanumeric and not in
        NLTK's English stopword list, joined with single spaces. Punctuation
        tokens are dropped by the alphanumeric check.
    """
    # Requests both resources on every call
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)

    english_stopwords = set(stopwords.words('english'))

    kept = []
    for candidate in word_tokenize(text.lower()):
        if not candidate.isalnum():
            continue
        if candidate in english_stopwords:
            continue
        kept.append(candidate)

    return ' '.join(kept)


### 🔍 Usage

In [None]:
def clean_text( text: str ) -> str:
    """
    Flatten a page-oriented Public Law text dump into one clean line.

    Parameters:
    -----------
    text : str
        The raw document text.

    Returns:
    --------
    str
        The text with line endings unified, 'PUBLIC LAW 118–32' header
        lines and standalone page numbers removed, words hyphenated across
        line breaks rejoined, and all whitespace collapsed to single spaces.
    """
    # Unify CRLF / CR line endings to LF before any line-based matching
    result = text.replace( '\r\n', '\n' ).replace( '\r', '\n' )

    # Applied strictly in this order: (pattern, replacement, flags)
    substitutions = (
        ( r'PUBLIC LAW 118–32.*?\n', '', re.IGNORECASE ),   # page header lines
        ( r'\n\s*\d+\s*\n', '\n', 0 ),                      # page numbers on their own line
        ( r'(\w+)-\n(\w+)', r'\1\2', 0 ),                   # 'appropria-\ntion' -> 'appropriation'
        ( r'(?<!\n)\n(?![\n])', ' ', 0 ),                   # single line breaks inside a paragraph
        ( r'\s+', ' ', 0 ),                                 # any remaining whitespace run
    )
    for pattern, replacement, flags in substitutions:
        result = re.sub( pattern, replacement, result, flags=flags )

    return result.strip( )

### 🔍 Pipeline

In [None]:
# === Preprocessing Configuration ===
# Module-level singletons, created once and read by preprocess_line
EN_STOPWORDS = set( stopwords.words( 'english' ) )  # set gives fast membership tests
LEMMATIZER = WordNetLemmatizer( )
STEMMER = SnowballStemmer( 'english' )

def preprocess_line( line, lower=True, punctuation=True,
                     stopwords=False, lemmatize=True, stem=False ):
    '''
    Tokenize one line of text and apply the selected cleaning steps.

    Parameters:
    -----------
    line : str
        The line to process.
    lower : bool
        Lowercase each token.
    punctuation : bool
        Drop tokens made up entirely of ASCII punctuation.
    stopwords : bool
        Drop English stopwords (compared case-insensitively).
    lemmatize : bool
        Lemmatize each remaining token with WordNet.
    stem : bool
        Stem each remaining token with the Snowball stemmer.

    Returns:
    --------
    str
        The surviving tokens joined with single spaces.

    Note: inside this function the `stopwords` and `lemmatize` parameters
    shadow the module-level names of the same spelling; they are plain flags.
    '''
    punctuation_chars = set( string.punctuation )
    processed = [ ]
    for token in word_tokenize( line ):
        if lower:
            token = token.lower( )

        # Check every character. `token in string.punctuation` is a substring
        # test, so it missed multi-character tokens such as '...', '``', "''"
        # and '--' that the tokenizer emits.
        if punctuation and all( ch in punctuation_chars for ch in token ):
            continue

        # EN_STOPWORDS is lowercase, so compare in lowercase even when
        # lower=False leaves the token capitalised.
        if stopwords and token.lower( ) in EN_STOPWORDS:
            continue

        if lemmatize:
            token = LEMMATIZER.lemmatize( token )

        if stem:
            token = STEMMER.stem( token )

        processed.append( token )

    return ' '.join( processed )


def process_file( file_path, **preprocess_kwargs ):
    '''
    Run `preprocess_line` over every line of a UTF-8 text file.

    Parameters:
    -----------
    file_path : str
        Path of the file to read.
    **preprocess_kwargs
        Keyword arguments forwarded unchanged to `preprocess_line`.

    Returns:
    --------
    list
        One cleaned string per input line, in file order. Blank input
        lines are kept as empty strings.
    '''
    with open( file_path, 'r', encoding='utf-8' ) as source:
        return [ preprocess_line( raw_line, **preprocess_kwargs ) for raw_line in source ]


#  Vectorization (Preparation for ML Models)
___





##### Load Dependencies

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from transformers import BertTokenizer, BertModel
import torch

In [None]:
# === Load Raw Text ===
file_path = '<url to file>'
_rawtext = ''

def load_text( file_path ):
    """
    Read a whole UTF-8 text file and return its contents.

    Parameters:
    -----------
    file_path : str
        Path of the file to read.

    Returns:
    --------
    str
        The complete file contents. The module-level `_rawtext` is not
        modified; the text is only returned.
    """
    with open( file_path, mode='r', encoding='utf-8' ) as handle:
        contents = handle.read( )
    return contents

### 🧮 1. Bag of Words (BoW) using CountVectorizer

In [None]:


# Toy corpus: each string is one document
corpus = ['Bro loves clean code.', 'Code is life.']
vectorizer = CountVectorizer()
# Learns the vocabulary and builds the document-term count matrix in one call
X = vectorizer.fit_transform(corpus)

# Vocabulary terms, in the column order of X
print( vectorizer.get_feature_names_out( ) )
# Dense view of the counts: one row per document
print( X.toarray() )


### 📊 2. TF-IDF using TfidfVectorizer

In [None]:


# Toy corpus: each string is one document (rebinds corpus, vectorizer and X)
corpus = [ 'Bro writes awesome code.', 'Code must be clean and clear.' ]
vectorizer = TfidfVectorizer()
# Learns the vocabulary and IDF weights, then returns the TF-IDF matrix
X = vectorizer.fit_transform(corpus)

# Vocabulary terms, in the column order of X
print(vectorizer.get_feature_names_out())
# Dense view of the TF-IDF weights: one row per document
print(X.toarray())


### 🧠 3. Word2Vec using gensim

In [None]:


# Training input: a list of sentences, each already split into tokens
sentences = [ [ 'bro', 'loves', 'python'], ['clean', 'code', 'rocks' ] ]
# min_count=1 keeps every word, which this tiny corpus needs
model = Word2Vec( sentences, vector_size=100, window=5, min_count=1, workers=4 )

# Vector for the word 'bro'
vector = model.wv[ 'bro' ]
print(vector)


### 🌍 4. GloVe using gensim (with pre-trained vectors)


In [None]:


# Load GloVe vectors (convert .txt to .word2vec format beforehand if needed)
# The file is expected in the working directory; binary=False reads it as text
glove_file = 'glove.6B.100d.word2vec.txt'
# Rebinds `model`, replacing the Word2Vec model from the previous cell
model = KeyedVectors.load_word2vec_format(glove_file, binary=False)

# Vector for the word 'code'
vector = model['code']
print(vector)


### 🤖 5. BERT / Transformer-based Embeddings using transformers + torch


In [None]:


# Load the pretrained tokenizer and encoder for 'bert-base-uncased'
# (rebinds `model`, replacing the vectors loaded in the previous cell)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

sentence = "Bro's code always works."
# return_tensors='pt' yields PyTorch tensors that unpack straight into the model
inputs = tokenizer(sentence, return_tensors='pt')
# NOTE(review): not wrapped in torch.no_grad(), so autograd stays enabled here
outputs = model(**inputs)

# Get the vector for [CLS] token (sentence embedding)
# Index 0 on the second axis selects the first token of each sequence
sentence_embedding = outputs.last_hidden_state[:, 0, :]
print(sentence_embedding.shape)


#### Clean Document

In [None]:
def clean_text( text: str ) -> str:
    """
    Flatten a page-oriented Public Law text dump into one clean line.

    Parameters:
    -----------
    text : str
        The raw document text.

    Returns:
    --------
    str
        The text with line endings unified, 'PUBLIC LAW 118–32' header
        lines and standalone page numbers removed, words hyphenated across
        line breaks rejoined, and all whitespace collapsed to single spaces.
    """
    # Unify CRLF / CR line endings to LF before any line-based matching
    result = text.replace( '\r\n', '\n' ).replace( '\r', '\n' )

    # Applied strictly in this order: (pattern, replacement, flags)
    substitutions = (
        ( r'PUBLIC LAW 118–32.*?\n', '', re.IGNORECASE ),   # page header lines
        ( r'\n\s*\d+\s*\n', '\n', 0 ),                      # page numbers on their own line
        ( r'(\w+)-\n(\w+)', r'\1\2', 0 ),                   # 'appropria-\ntion' -> 'appropriation'
        ( r'(?<!\n)\n(?![\n])', ' ', 0 ),                   # single line breaks inside a paragraph
        ( r'\s+', ' ', 0 ),                                 # any remaining whitespace run
    )
    for pattern, replacement, flags in substitutions:
        result = re.sub( pattern, replacement, result, flags=flags )

    return result.strip( )


#### 3. Chunk File

In [None]:
# Simple chunking by words, assuming ~1.3 tokens per English word
def chunk_text( text, max_tokens=512 ):
    """
    Split text into whitespace-delimited word chunks sized to a token budget.

    Parameters:
    -----------
    text : str
        The text to split.
    max_tokens : int, optional (default=512)
        Approximate token budget per chunk.

    Returns:
    --------
    list
        Chunks of words joined with single spaces, in document order.
        Empty or whitespace-only text yields an empty list.
    """
    words = text.split( )

    # English text averages more tokens than words (~1.3 tokens per word),
    # so a budget of max_tokens holds about max_tokens / 1.3 words.
    # Multiplying instead would overshoot the budget by roughly 70%.
    # At least one word per chunk keeps the range() step non-zero.
    chunk_size = max( 1, int( max_tokens / 1.3 ) )

    return [ ' '.join( words[ i:i + chunk_size ] ) for i in range( 0, len( words ), chunk_size ) ]




#  OpenAI Embedding
___

##### API key

In [None]:
# Create client
# Only the `openai` module is imported, so the class is reached through it.
# The key is passed to the constructor rather than assigned afterwards.
client = openai.OpenAI( api_key=os.getenv( 'OPENAI_API_KEY' ) )

#### 1. Define embedding function

In [None]:
def embed_texts( texts, model='text-embedding-3-small', batch_size=10, sleep=1, max_retries=3 ):
    """
    Embed a list of texts with the OpenAI embeddings endpoint, in batches.

    Parameters:
    -----------
    texts : list
        The strings to embed.
    model : str
        OpenAI embedding model id.
    batch_size : int
        Number of texts sent per request.
    sleep : float
        Seconds to wait after a failed request before the next attempt.
    max_retries : int
        Attempts per batch before it is given up.

    Returns:
    --------
    list
        One entry per input text, in the same order. Texts from a batch
        that failed every attempt are represented by None, so the result
        always lines up index-for-index with `texts`.
    """
    embeddings = [ ]
    for i in range( 0, len( texts ), batch_size ):
        batch = texts[ i:i + batch_size ]
        for _attempt in range( max_retries ):
            try:
                response = openai.embeddings.create( input=batch, model=model )
            except Exception as e:
                print( f'Error at batch {i}: {e}' )
                # Back off before retrying, e.g. to ride out a rate limit
                time.sleep( sleep )
            else:
                embeddings.extend( item.embedding for item in response.data )
                break
        else:
            # Skipping the batch would shift every later embedding onto the
            # wrong text, so pad with placeholders instead.
            embeddings.extend( [ None ] * len( batch ) )

    return embeddings


#### 2. Embed chunks

In [None]:
# 2. Embed chunks
# NOTE(review): `chunks` is not assigned at module level in any cell visible
# above (only inside functions); presumably the output of chunk_text(). Confirm.
embeddings = embed_texts( chunks )

#### 3.  Create DataFrame

In [None]:
# 3. Create DataFrame: one row per chunk, with its embedding alongside.
# Explicit column names are required. The previous `{ chunks, embeddings }`
# was a set literal of two lists, which raises TypeError (lists are unhashable).
df_embeddings = pd.DataFrame( { 'chunk': chunks, 'embedding': embeddings } )


#### 3. Save


In [None]:
# 3. Save
# Written relative to the working directory; index=False omits the row index
df_embeddings.to_parquet( 'public_law_118_32_embeddings.parquet', index=False )


#### 4. Preview

In [None]:
# 4. Preview
# Shows the first two rows as the cell output
df_embeddings.head(2)


### 3. Generate Embeddings
- Use a language model (e.g., OpenAI, HuggingFace) to create vector representations of each chunk.

In [None]:


# The class is reached through the imported `sentence_transformers` module;
# the bare name `SentenceTransformer` is only imported in a later cell.
model = sentence_transformers.SentenceTransformer( 'all-MiniLM-L6-v2' )
# Rebinds `embeddings` to the locally computed vectors, one per chunk
embeddings = model.encode( chunks, show_progress_bar=True )


### 4. Create SQLite Database

- Design a table that links text chunks to their embeddings.

In [None]:
sql_create = '''
CREATE TABLE IF NOT EXISTS law_embeddings
(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    chunk TEXT NOT NULL,
    embedding BLOB NOT NULL
)
'''
sql_insert = 'INSERT INTO law_embeddings ( chunk, embedding ) VALUES (?, ?)'

conn = sqlite3.connect( 'embeddings.values' )
try:
    cursor = conn.cursor( )
    cursor.execute( sql_create )

    # Each embedding is stored as a pickled BLOB next to its chunk text.
    # Re-running this cell appends the rows again; the table is not cleared.
    cursor.executemany(
        sql_insert,
        ( ( chunk, pickle.dumps( vector ) ) for chunk, vector in zip( chunks, embeddings ) )
    )

    conn.commit( )
finally:
    # Close the connection even if an insert fails, so the file is released
    conn.close( )


###  Retrieval (Vector Search in SQLite)

- You can perform semantic search by encoding a query and comparing via cosine similarity


In [None]:
def cosine_similarity( a, b ):
    """
    Return the cosine similarity of two 1-D vectors.

    Parameters:
    -----------
    a, b : array-like
        Vectors of equal length.

    Returns:
    --------
    float
        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero norm.
    """
    denominator = np.linalg.norm( a ) * np.linalg.norm( b )

    # A zero vector has no direction. Dividing by zero would yield NaN,
    # which also breaks sorting results by similarity.
    if denominator == 0:
        return 0.0

    return np.dot( a, b ) / denominator

In [None]:
query = 'Appropriations for Department of Defense'
# encode() takes a list of texts; [ 0 ] picks the single query vector
query_vec = model.encode( [ query ] )[ 0 ]

conn = sqlite3.connect( 'embeddings.values' )
try:
    cursor = conn.cursor( )
    cursor.execute( 'SELECT id, chunk, embedding FROM law_embeddings' )
    rows = cursor.fetchall( )
finally:
    # Rows are already in memory, so the connection can be released now
    conn.close( )

results = []
# The row's text is named `chunk`, not `chunk_text`: a top-level variable
# with that name would overwrite the chunk_text() function for later cells.
for _chunk_id, chunk, blob in rows:
    # NOTE: pickle.loads can execute arbitrary code. This is acceptable only
    # because the database is written by this notebook; never point it at an
    # untrusted file.
    stored_vec = pickle.loads( blob )
    sim = cosine_similarity( query_vec, stored_vec )
    results.append( ( sim, chunk ) )

# Sort and get top N
top_matches = sorted( results, key=lambda x: x[ 0 ], reverse=True )[ :5 ]


#   Embedding-Pipeline Script
___

##### Load Dependencies



In [3]:
import re
import sqlite3
import pickle
import numpy as np
from tqdm import tqdm
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter


#### Configuration

In [None]:
# Define paths
TEXT_FILE = 'PublicLaw_118-42.txt'
DB_FILE = 'law_embeddings.values'
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


#### Load and Clean Raw Text

In [None]:
def load_and_clean_text( filepath ):
    """
    Read a UTF-8 text file and flatten it to a single line.

    Parameters:
    -----------
    filepath : str
        Path of the file to read.

    Returns:
    --------
    str
        The file contents with form feeds and newlines turned into spaces,
        runs of two or more whitespace characters collapsed to one space,
        and surrounding whitespace stripped.
    """
    with open( filepath, 'r', encoding='utf-8' ) as source:
        flattened = source.read( )

    # Applied in order: form feeds (page breaks), line breaks, leftover runs
    for pattern in ( r'\f+', r'\n+', r'\s{2,}' ):
        flattened = re.sub( pattern, ' ', flattened )

    return flattened.strip( )


#### Generate Embeddings

In [None]:
# Default OpenAI embedding model. Its 1536-dimensional output matches the
# placeholder length used for failed requests in embed_chunks.
OPENAI_MODEL = 'text-embedding-3-small'


def get_embedding( text, model=OPENAI_MODEL ):
    """
    Request the embedding vector for one text from the OpenAI API.

    Parameters:
    -----------
    text : str
        The text to embed.
    model : str
        OpenAI embedding model id.

    Returns:
    --------
    list
        The embedding of `text` as a list of floats.
    """
    response = openai.embeddings.create( input=text, model=model )

    # One input gives one result; the vector is in its `embedding` attribute
    return response.data[ 0 ].embedding


def embed_chunks( chunks ):
    """
    Embed every chunk with `get_embedding`, showing a progress bar.

    Parameters:
    -----------
    chunks : list
        The text chunks to embed.

    Returns:
    --------
    list
        One vector per chunk, in order. A chunk whose request raised is
        represented by a 1536-element zero vector, so the output stays
        aligned with the input.
    """
    vectors = [ ]
    for piece in tqdm( chunks, desc='EmbeddingRequest chunks via OpenAI' ):
        try:
            vector = get_embedding( piece )
        except Exception as e:
            print( f'Error embedding chunk: {e}' )
            vector = [ 0.0 ] * 1536  # Placeholder for failed requests
        vectors.append( vector )
    return vectors


##### Create SQLite DB



In [None]:
def create_and_populate_db( chunks, embeddings, db_path ):
    """
    Store chunk/embedding pairs in a SQLite database.

    Creates the `law_embeddings` table if needed and appends one row per
    pair. Existing rows are kept, so calling this twice duplicates the data.

    Parameters:
    -----------
    chunks : iterable of str
        The text chunks.
    embeddings : iterable
        The vector for each chunk, in the same order; each is pickled into
        a BLOB. Pairing stops at the shorter of the two inputs.
    db_path : str
        Path of the SQLite file, created if it does not exist.
    """
    sql_create = '''
    CREATE TABLE IF NOT EXISTS law_embeddings
    (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        chunk TEXT NOT NULL,
        embedding BLOB NOT NULL
    )
    '''
    sql_insert = 'INSERT INTO law_embeddings ( chunk, embedding ) VALUES ( ?, ? )'

    conn = sqlite3.connect( db_path )
    try:
        # The connection as a context manager commits on success and rolls
        # back on error, so a failed run never leaves a half-written batch.
        with conn:
            conn.execute( sql_create )
            conn.executemany(
                sql_insert,
                ( ( chunk, pickle.dumps( vector ) ) for chunk, vector in zip( chunks, embeddings ) )
            )
    finally:
        # `with conn` does not close the connection; always release the file
        conn.close( )


#### Script

In [None]:
# === MAIN ===
def main():
    """
    Run the embedding pipeline end to end.

    Loads and cleans TEXT_FILE, splits it with `chunk_text`, embeds the
    chunks with `embed_chunks`, and writes the pairs to DB_FILE, printing a
    progress message before each stage.
    """
    print('Step 1: Load and clean documents')
    document = load_and_clean_text(TEXT_FILE)

    print('Step 2: Chunking documents')
    chunks = chunk_text(document)
    print(f'Total chunks: {len(chunks)}')

    print('Step 3: EmbeddingRequest with OpenAI API')
    vectors = embed_chunks(chunks)

    print('Step 4: Saving to SQLite')
    create_and_populate_db(chunks, vectors, DB_FILE)

    print(f'Pipeline complete. Embeddings stored in: {DB_FILE}')


# Run the pipeline when this code is executed as the main module
if __name__ == '__main__':
    main()

#### Local Embeddings (Sentence-Transformers)

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# === 1. Load Model ===
# You can try other models like 'all-MiniLM-L6-v2', 'all-mpnet-base-v2', or 'multi-qa-MiniLM-L6-cos-v1'
# Rebinds the module-level `model` name used by the cells that follow
model = SentenceTransformer( 'all-MiniLM-L6-v2' )

# === 2. Embed Chunks ===
def embed_with_sentence_transformers( texts, model ):
    """
    Encode texts with the given model and return the result.

    Parameters:
    -----------
    texts : list
        The strings to embed.
    model : object
        Any object exposing `encode(texts, show_progress_bar=...,
        convert_to_numpy=...)`, e.g. a SentenceTransformer.

    Returns:
    --------
    Whatever `model.encode` returns when asked for NumPy output with a
    progress bar.
    """
    encode_options = { 'show_progress_bar': True, 'convert_to_numpy': True }
    return model.encode( texts, **encode_options )

# NOTE(review): `chunks` must already exist at module level; it is not
# assigned in this cell. Confirm which earlier cell provides it.
local_embeddings = embed_with_sentence_transformers( chunks, model )

# === 3. Save in a DataFrame ===
# One row per chunk; each cell of 'embedding' holds that chunk's vector
df_local = pd.DataFrame({
    'chunk': chunks,
    'embedding': list( local_embeddings )  # numpy arrays to list for DataFrame compatibility
})

# === 4. Save to Disk ===
# Written relative to the working directory; index=False omits the row index
df_local.to_parquet( 'public_law_118_32_local_embeddings.parquet', index=False )

# === 5. Preview ===
# Shows the first two rows as the cell output
df_local.head( 2 )



## Fine-Tuning

In [None]:
import json
import openai
import os
import pandas as pd
from pprint import pprint


In [None]:

# OpenAI client used by the upload and fine-tuning cells below.
# The key is read from the environment; the organization and project values
# are placeholders to replace with real ids.
client = openai.OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY'),
    organization='<org id>',
    project='<project id>',
)

In [None]:
# Read in the dataset we'll use for this task.
# This will be the RecipesNLG dataset, which we've cleaned to only contain documents from www.cookbooks.com
# The path is relative to the working directory. Later cells read the
# 'title', 'ingredients' and 'NER' columns.
recipe_df = pd.read_csv('data/cookbook_recipes_nlg_10k.csv')

# Shows the first rows as the cell output
recipe_df.head()

In [None]:
system_message = 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'


def create_user_message(row):
    """
    Build the user prompt for one recipe.

    Parameters:
    -----------
    row : mapping
        A record with 'title' and 'ingredients' entries (e.g. a DataFrame
        row or a dict).

    Returns:
    --------
    str
        The prompt text, ending with 'Generic ingredients: ' for the model
        to complete.
    """
    # The outer quotes differ from the ones used for the keys: reusing the
    # same quote inside an f-string is a SyntaxError before Python 3.12.
    return f"Title: {row['title']}\n\nIngredients: {row['ingredients']}\n\nGeneric ingredients: "


def prepare_example_conversation(row):
    """
    Turn one recipe row into a chat-format fine-tuning example.

    Parameters:
    -----------
    row : mapping
        A record with 'title', 'ingredients' and 'NER' entries.

    Returns:
    --------
    dict
        {'messages': [system, user, assistant]}, where the assistant turn
        carries the row's 'NER' value as the expected answer.
    """
    system_turn = {'role': 'system', 'content': system_message}
    user_turn = {'role': 'user', 'content': create_user_message(row)}
    assistant_turn = {'role': 'assistant', 'content': row['NER']}

    return {'messages': [system_turn, user_turn, assistant_turn]}



In [None]:
# Use the leading rows of the dataset for training.
# NOTE: .loc slices by label and includes both endpoints, so with the default
# 0..N-1 index (TODO confirm) this selects 101 rows (labels 0 through 100).
training_df = recipe_df.loc[0:100]

# apply the prepare_example_conversation function to each row of the training_df
training_data = training_df.apply(prepare_example_conversation, axis=1).tolist()

# Print a few examples to eyeball the conversation format
for example in training_data[:5]:
    print(example)

In [None]:
# Rows labelled 101 through 200 (.loc is end-inclusive) form the validation
# split; the range starts right after the last training label (100).
validation_df = recipe_df.loc[101:200]
# Same conversion as the training rows: one chat example per row
validation_data = validation_df.apply(
    prepare_example_conversation, axis=1).tolist()

In [None]:
def write_jsonl(data_list: list, filename: str) -> None:
    """
    Write records to a JSON Lines file, one JSON document per line.

    Parameters:
    -----------
    data_list : list
        JSON-serializable records, written in order.
    filename : str
        Output path; an existing file is overwritten.
    """
    with open(filename, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data_list)

In [None]:
# Write both splits as JSONL files in the working directory; the upload
# cell below reads them back by these names.
training_file_name = 'tmp_recipe_finetune_training.jsonl'
write_jsonl(training_data, training_file_name)

validation_file_name = 'tmp_recipe_finetune_validation.jsonl'
write_jsonl(validation_data, validation_file_name)

In [None]:
def upload_file(file_name: str, purpose: str) -> str:
    """
    Upload a local file through the module-level OpenAI `client`.

    Parameters:
    -----------
    file_name : str
        Path of the file to upload; it is opened in binary mode.
    purpose : str
        The purpose tag passed to the Files API (e.g. 'fine-tune').

    Returns:
    --------
    str
        The id of the uploaded file.
    """
    with open(file_name, mode='rb') as handle:
        uploaded = client.files.create(file=handle, purpose=purpose)
        file_id = uploaded.id
    return file_id

In [None]:
# Upload both JSONL files; the returned ids are passed to the fine-tuning job
training_file_id = upload_file(training_file_name, 'fine-tune')
validation_file_id = upload_file(validation_file_name, 'fine-tune')

In [None]:
# Base model to fine-tune
MODEL = 'gpt-4o-mini-2024-07-18'

# Create the fine-tuning job from the uploaded training and validation files
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model=MODEL,
    suffix='recipe-ner',
)

# Keep the job id from the response for later reference
job_id = response.id