# NLTK
___


#### 📦 One-Time Setup (NLTK Resources)

In [None]:

import nltk
# Fetch the NLTK data packages the preprocessing cells below rely on.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases; 'punkt' is kept so older installations keep working.
for resource in ( 'punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4' ):
    nltk.download( resource )


##### Load Dependencies

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer

#### 🧠 Full Pipeline

In [None]:
# === Preprocessing Configuration ===
# Module-level singletons shared by every call to preprocess_line.
# Stored as a set so the per-token `in` test is a hash lookup.
EN_STOPWORDS = set( stopwords.words( 'english' ) )
# WordNet-based lemmatizer applied when `lemmatize=True`.
LEMMATIZER = WordNetLemmatizer( )
# English Snowball stemmer applied when `stem=True`.
STEMMER = SnowballStemmer( "english" )

def preprocess_line( line, lower=True, punctuation=True,
                     stopwords=False, lemmatize=True, stem=False ):
    """
    Tokenize a single line of text and apply the enabled cleaning steps.

    Steps run per token, in this order:
    - lower:       lowercase the token
    - punctuation: drop tokens made up entirely of ASCII punctuation
    - stopwords:   drop English stopwords (matched case-insensitively)
    - lemmatize:   WordNet lemmatization
    - stem:        Snowball stemming (applied after lemmatization)

    Returns the surviving tokens joined by single spaces.
    """
    processed = [ ]
    for token in word_tokenize( line ):
        if lower:
            token = token.lower( )

        # `token in string.punctuation` would be a substring test, which lets
        # multi-character punctuation tokens such as '``', "''", '...' or
        # '--' slip through. Check every character instead.
        if punctuation and all( ch in string.punctuation for ch in token ):
            continue

        # The stopword list is lowercase, so compare a lowercased copy;
        # otherwise "The" survives whenever lower=False.
        if stopwords and token.lower( ) in EN_STOPWORDS:
            continue

        if lemmatize:
            token = LEMMATIZER.lemmatize( token )

        if stem:
            token = STEMMER.stem( token )

        processed.append( token )

    return ' '.join( processed )


def process_file( file_path, **preprocess_kwargs ):
    """
        Clean a UTF-8 text file one line at a time.

        Every line is passed through `preprocess_line` together with
        `preprocess_kwargs`; the cleaned lines are returned as a list in
        the same order as they appear in the file.
    """
    with open( file_path, 'r', encoding='utf-8' ) as handle:
        return [ preprocess_line( raw, **preprocess_kwargs ) for raw in handle ]


#### 🔍 Usage

In [None]:
file_path = 'path/to/Public_Law_118-32.txt'
# Keyword names must match preprocess_line's parameters
# (lower, punctuation, stopwords, lemmatize, stem); any other name is
# forwarded by process_file and raises TypeError.
cleaned_lines = process_file( file_path, lower=True, punctuation=True,
    stopwords=False, lemmatize=True, stem=True )

print( f"Total lines: {len( cleaned_lines )}" )
# Guard the preview so an empty input file does not raise IndexError.
if cleaned_lines:
    print( "Example cleaned line:", cleaned_lines[ 0 ] )

## Text Cleaner for PL 118-32

##### Load Dependencies

In [6]:
import re
import os
from pathlib import Path
import openai
import pandas as pd
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter



#### 1. Load File

In [None]:
# === 1. Load the Raw Text ===
def load_text( file_path ):
    """Return the entire contents of the UTF-8 text file at `file_path`."""
    with open( file_path, mode='r', encoding='utf-8' ) as handle:
        contents = handle.read( )
    return contents

#### 2. Clean Document

In [None]:
def clean_text( text: str ) -> str:
    """
    Clean the raw text of Public Law 118-32 for downstream chunking.

    Removes running page headers and numeric-only page-number lines,
    re-joins words hyphenated across line breaks, merges hard-wrapped lines
    into flowing text, and normalizes whitespace. Paragraph breaks (blank
    lines) are kept as exactly one blank line.

    Returns the cleaned text with leading/trailing whitespace stripped.
    """
    # Step 1: Normalize line endings to '\n'
    text = text.replace( '\r\n', '\n' ).replace( '\r', '\n' )

    # Step 2: Remove page headers and footers (Public Law-specific)
    text = re.sub( r'PUBLIC LAW 118–32.*?\n', '', text, flags=re.IGNORECASE )
    # Drop lines holding only a page number. Only horizontal whitespace is
    # consumed and the closing newline is a lookahead, so neighbouring blank
    # lines survive and consecutive numeric lines are all removed.
    text = re.sub( r'\n[^\S\n]*\d+[^\S\n]*(?=\n)', '', text )

    # Step 3: Remove hyphenation at line breaks (e.g., "appropria-\ntion")
    text = re.sub( r'(\w+)-\n(\w+)', r'\1\2', text )

    # Step 4: Merge broken lines where sentence continues
    # (a lone newline becomes a space; runs of 2+ newlines are left alone)
    text = re.sub( r'(?<!\n)\n(?![\n])', ' ', text )

    # Step 5: Collapse excessive whitespace without discarding the paragraph
    # breaks preserved by Step 4 (a plain r'\s+' would flatten them too).
    text = re.sub( r'\s*\n\s*', '\n\n', text )
    text = re.sub( r'[^\S\n]+', ' ', text )

    return text.strip( )


#### 3. Chunk File

In [None]:
# Simple chunking by words assuming ~1.3 tokens per word
def chunk_text( text, max_tokens=512, tokens_per_word=1.3 ):
    """
    Split `text` into whitespace-delimited word chunks that fit a token budget.

    Parameters:
        text:            the text to split.
        max_tokens:      approximate token budget per chunk.
        tokens_per_word: estimated tokens produced per word (English prose
                         averages roughly 1.3 tokens per word).

    Returns a list of strings, each holding at most
    int( max_tokens / tokens_per_word ) words (never fewer than one word),
    in original order. Empty or whitespace-only text yields an empty list.
    """
    words = text.split( )
    # A word costs MORE than one token, so the word budget is the token
    # budget divided by the ratio; multiplying would overshoot max_tokens.
    # max( 1, ... ) keeps the range step non-zero for tiny budgets.
    chunk_size = max( 1, int( max_tokens / tokens_per_word ) )
    return [ ' '.join( words[ i:i + chunk_size ] )
             for i in range( 0, len( words ), chunk_size ) ]


##### 🔍 Example

In [None]:
# End-to-end example: load -> clean -> chunk, then print a short summary.
file_path = 'path_to/Public_Law_118-32.txt'
raw_text = load_text( file_path )
cleaned_text = clean_text( raw_text )
chunks = chunk_text( cleaned_text )
print( f'Total Chunks: {len( chunks )}' )
# Preview only the first 1000 characters of the first chunk.
# NOTE(review): chunks[ 0 ] raises IndexError if the document produced no chunks.
print( 'Sample chunk:\n', chunks[ 0 ][ :1000 ] )



#  OpenAI Embedding
___

##### API key

In [None]:
# Create client
# Only `import openai` is in scope, so the class must be referenced through
# the module. The key is passed to the constructor because the client
# validates its credentials at construction time.
client = openai.OpenAI( api_key=os.getenv( 'OPENAI_API_KEY' ) )

#### 1. Define embedding function

In [None]:
def embed_texts( texts, model='text-embedding-3-small', batch_size=10, sleep=1, max_retries=3 ):
    """
    Embed `texts` with the OpenAI embeddings endpoint, `batch_size` at a time.

    Parameters:
        texts:       sequence of strings to embed.
        model:       embedding model name.
        batch_size:  number of texts sent per request.
        sleep:       seconds to wait after a failed request before retrying.
        max_retries: attempts per batch before giving up on that batch.

    Returns a list with exactly one entry per input text, in input order.
    Texts whose batch failed on every attempt get `None`, so the result
    always lines up index-for-index with `texts`.
    """
    embeddings = [ ]
    for start in range( 0, len( texts ), batch_size ):
        batch = texts[ start:start + batch_size ]
        batch_embeddings = None
        for attempt in range( 1, max_retries + 1 ):
            try:
                response = openai.embeddings.create( input=batch, model=model )
                batch_embeddings = [ item.embedding for item in response.data ]
                break
            except Exception as e:
                print( f'Error at batch {start} (attempt {attempt}/{max_retries}): {e}' )
                # Back off before the next attempt to avoid rate limits.
                time.sleep( sleep )

        if batch_embeddings is None:
            # Skipping the batch would shift every later embedding onto the
            # wrong text; pad with None to keep positions aligned.
            batch_embeddings = [ None ] * len( batch )

        embeddings.extend( batch_embeddings )

    return embeddings


#### 2. Embed chunks

In [None]:
# 2. Embed chunks (uses embed_texts' default model and batch size)
embeddings = embed_texts( chunks )

#### 3.  Create DataFrame

In [None]:
# 3. Create DataFrame
# One row per chunk with its embedding alongside. This must be a dict of
# named columns: `{ chunks, embeddings }` is a set literal of two lists and
# raises TypeError (unhashable type: 'list').
df_embeddings = pd.DataFrame( { 'chunk': chunks, 'embedding': embeddings } )


#### 3. Save


In [None]:
# 3. Save the chunk/embedding table to a Parquet file, without the index
df_embeddings.to_parquet( 'public_law_118_32_embeddings.parquet', index=False )


#### 4. Preview

In [None]:
# 4. Preview the first two rows
df_embeddings.head(2)

# Text Cleaning
___

##### Load Dependencies

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import sentence_transformers
import sqlite3
import numpy as np
import pickle

RuntimeError: Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
cannot import name 'float8_e4m3b11fnuz' from 'tensorflow.python.framework.dtypes' (C:\Users\terry\source\compilers\py\anaconda\Lib\site-packages\tensorflow\python\framework\dtypes.py)

#### 1. Strip irrelevant content while preserving structure

In [None]:
def clean_text( text ):
    """
    Flatten raw document text into a single normalized line.

    Form feeds and newline runs become spaces, repeated whitespace is
    collapsed, and section headings such as "SECTION 1." or "SEC. 101."
    are rewritten to "Section 1:" / "Section 101:".

    Returns the cleaned text with leading/trailing whitespace stripped.
    """
    # Remove form feeds, line breaks, etc.
    text = re.sub( r'\f+', ' ', text )
    text = re.sub( r'\n+', ' ', text )
    text = re.sub( r'\s{2,}', ' ', text )

    # Normalize Section markers, Title headers, etc.
    # Accept "SECTION 1.", "SEC. 101." and the previously required
    # "SECTION. 1." form; requiring a period right after "SECTION" never
    # matched headings written as "SECTION 1.".
    text = re.sub( r'\bSEC(?:TION)?\.?\s+(\d+)\.', r'Section \1:', text )
    return text.strip( )


#### 2. Chunk the Text

- Large documents need to be chunked (for context window limits during embedding).
- Use semantic or structural chunking.

In [None]:


# Separators are tried in order, from paragraph breaks down to single spaces.
# NOTE(review): the clean_text defined in this section replaces every newline
# run with a space, so '\n\n' and '\n' will not occur in `cleaned`; splitting
# then effectively happens on '.' and ' ' — confirm this is intended.
seps =[ '\n\n', '\n', '.', ' ' ]
splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, separators=seps )
cleaned = clean_text( raw_text )
chunks = splitter.split_text( cleaned )


#### 3. Generate Embeddings

- Use a language model (e.g., OpenAI, HuggingFace) to create vector representations of each chunk.

In [None]:


# This section imports the package as `import sentence_transformers`, so the
# class has to be referenced through the module; the bare name
# `SentenceTransformer` is not in scope here.
model = sentence_transformers.SentenceTransformer( 'all-MiniLM-L6-v2' )
# Encode every chunk into an embedding vector.
embeddings = model.encode( chunks, show_progress_bar=True )


#### 4. Create SQLite Database

- Design a table that links text chunks to their embeddings.

In [None]:
conn = sqlite3.connect( 'embeddings.db' )
sql_create = '''
CREATE TABLE IF NOT EXISTS law_embeddings
(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    chunk TEXT NOT NULL,
    embedding BLOB NOT NULL
)
'''
sql_insert = 'INSERT INTO law_embeddings ( chunk, embedding ) VALUES (?, ?)'

try:
    # `with conn` commits when the block succeeds and rolls back if it
    # raises, so a failure cannot leave a half-populated table behind.
    with conn:
        cursor = conn.cursor( )
        cursor.execute( sql_create )
        # Each vector is pickled into a BLOB; the retrieval cell reverses
        # this with pickle.loads.
        cursor.executemany(
            sql_insert,
            ( ( chunk, pickle.dumps( vector ) ) for chunk, vector in zip( chunks, embeddings ) ),
        )
finally:
    # Always release the database handle, even when an insert fails.
    conn.close( )


####  Retrieval (Vector Search in SQLite)

- You can perform semantic search by encoding a query and comparing it against each stored chunk embedding via cosine similarity:

In [None]:
def cosine_similarity( a, b ):
    """
    Return the cosine similarity of vectors `a` and `b`.

    If either vector has zero length the similarity is undefined; 0.0 is
    returned in that case instead of NaN, so all-zero placeholder
    embeddings rank as "no similarity" and do not disturb sorting.
    """
    denom = np.linalg.norm( a ) * np.linalg.norm( b )
    if denom == 0:
        return 0.0
    return np.dot( a, b ) / denom

In [None]:
query = 'Appropriations for Department of Defense'
query_vec = model.encode( [ query ] )[ 0 ]

# Read every stored row, then close the connection before scoring
# (it was previously left open).
conn = sqlite3.connect( 'embeddings.db' )
try:
    cursor = conn.cursor( )
    cursor.execute( 'SELECT id, chunk, embedding FROM law_embeddings' )
    rows = cursor.fetchall( )
finally:
    conn.close( )

results = []
# The row's text is bound to `chunk_body`, not `chunk_text`: at notebook top
# level the latter would overwrite the chunk_text() function.
for _, chunk_body, blob in rows:
    # NOTE: pickle.loads executes arbitrary code from the blob; only use it
    # on a database this notebook produced itself.
    stored_vec = pickle.loads( blob )
    sim = cosine_similarity( query_vec, stored_vec )
    results.append( ( sim, chunk_body ) )

# Sort and get top N
top_matches = sorted( results, key=lambda x: x[ 0 ], reverse=True )[ :5 ]


#   Embedding-Pipeline Script
___

##### Load Dependencies



In [3]:
import re
import sqlite3
import pickle
import numpy as np
from tqdm import tqdm
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter

##### Configuration

In [None]:
# Define paths
# Input document and output SQLite database used by main().
TEXT_FILE = 'PublicLaw_118-42.txt'
DB_FILE = 'law_embeddings.db'
# NOTE(review): 'all-MiniLM-L6-v2' is the name loaded through
# SentenceTransformer elsewhere in this notebook — confirm it is the model
# this pipeline should use.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Chunking parameters (presumably characters per chunk and characters shared
# between neighbouring chunks — verify against the splitter configuration).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


##### Load and Clean Raw Text

In [None]:
def load_and_clean_text( filepath ):
    """
    Read the UTF-8 file at `filepath` and flatten it to one line of text.

    Form-feed runs, newline runs and any run of two or more whitespace
    characters are each replaced by a single space, in that order; the
    result is returned stripped of leading/trailing whitespace.
    """
    with open( filepath, mode='r', encoding='utf-8' ) as handle:
        text = handle.read( )

    # Basic normalization: the order matters, whitespace collapse runs last.
    for pattern in ( r'\f+', r'\n+', r'\s{2,}' ):
        text = re.sub( pattern, ' ', text )

    return text.strip( )

##### Chunk the Clean Text


In [None]:
def chunk_text( clean_text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP ):
    """
    Split cleaned text into overlapping chunks.

    Parameters:
        clean_text:    the text to split.
        chunk_size:    chunk size handed to the splitter; defaults to the
                       CHUNK_SIZE configuration constant.
        chunk_overlap: overlap handed to the splitter; defaults to the
                       CHUNK_OVERLAP configuration constant.

    Returns the list of chunk strings produced by
    RecursiveCharacterTextSplitter.split_text.
    """
    # Sizes come from the configuration constants rather than duplicated
    # literals, so editing the configuration cell actually takes effect.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=[ '\n\n', '\n', '.', ' ' ],
    )

    return splitter.split_text( clean_text )


##### Generate Embeddings

In [None]:
def get_embedding( text, model='text-embedding-3-small' ):
    """
    Return the embedding vector for `text` from the OpenAI embeddings endpoint.

    Parameters:
        text:  the string to embed.
        model: embedding model name. The default is the model used by
               embed_texts earlier in this notebook.

    Returns the embedding as a list of floats. Errors raised by the client
    propagate to the caller.
    """
    # `openai.Embedding.create` was removed in openai>=1.0; use the same
    # `openai.embeddings.create` call as the rest of this notebook.
    response = openai.embeddings.create( input=text, model=model )
    # The response is an object, not a dict: take the first (only) result.
    return response.data[ 0 ].embedding


def embed_chunks( chunks ):
    """
    Embed each chunk via get_embedding, showing a tqdm progress bar.

    A chunk whose request raises is reported on stdout and represented by a
    1536-element zero vector, so the returned list always has one entry per
    input chunk, in input order.
    """
    vectors = [ ]
    progress = tqdm( chunks, desc='EmbeddingRequest chunks via OpenAI' )
    for piece in progress:
        try:
            vector = get_embedding( piece )
        except Exception as e:
            print( f'Error embedding chunk: {e}' )
            vector = [ 0.0 ] * 1536  # Placeholder for failed requests
        vectors.append( vector )
    return vectors

##### Create SQLite DB



In [None]:
def create_and_populate_db( chunks, embeddings, db_path ):
    """
    Store chunk/embedding pairs in the SQLite database at `db_path`.

    Creates the `law_embeddings` table if it does not exist, then inserts
    one row per ( chunk, embedding ) pair. Each embedding is serialized
    with pickle into the BLOB column.

    Parameters:
        chunks:     iterable of chunk strings.
        embeddings: iterable of embedding vectors, parallel to `chunks`.
        db_path:    path of the SQLite database file.

    All inserts run in one transaction: either every row is committed or,
    on error, none is. The connection is closed in every case.
    """
    sql_create = '''
    CREATE TABLE IF NOT EXISTS law_embeddings
    (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        chunk TEXT NOT NULL,
        embedding BLOB NOT NULL
    )
    '''
    sql_insert = 'INSERT INTO law_embeddings ( chunk, embedding ) VALUES ( ?, ? )'

    conn = sqlite3.connect( db_path )
    try:
        # `with conn` commits on success and rolls back if anything raises.
        with conn:
            cursor = conn.cursor( )
            cursor.execute( sql_create )
            cursor.executemany(
                sql_insert,
                ( ( chunk, pickle.dumps( vector ) ) for chunk, vector in zip( chunks, embeddings ) ),
            )
    finally:
        # Release the file handle even when an insert fails.
        conn.close( )


#### Script

In [None]:
# === MAIN ===
def main():
    """Run the pipeline: load/clean TEXT_FILE, chunk, embed, store in DB_FILE."""
    print('Step 1: Load and clean text')
    text = load_and_clean_text(TEXT_FILE)

    print('Step 2: Chunking text')
    pieces = chunk_text(text)
    print(f'Total chunks: {len(pieces)}')

    print('Step 3: EmbeddingRequest with OpenAI API')
    vectors = embed_chunks(pieces)

    print('Step 4: Saving to SQLite')
    create_and_populate_db(pieces, vectors, DB_FILE)

    print(f'Pipeline complete. Embeddings stored in: {DB_FILE}')


# Run the pipeline only when executed as a script / top-level cell.
if __name__ == '__main__':
    main()

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# === 1. Load Model ===
# Alternatives worth trying: 'all-mpnet-base-v2', 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer( 'all-MiniLM-L6-v2' )


# === 2. Embed Chunks ===
def embed_with_sentence_transformers( texts, model ):
    """Encode `texts` with `model`, returning the vectors as a NumPy array."""
    vectors = model.encode( texts, show_progress_bar=True, convert_to_numpy=True )
    return vectors


local_embeddings = embed_with_sentence_transformers( chunks, model )

# === 3. Save in a DataFrame ===
# list( ... ) turns the 2-D array into one array per row so each cell of the
# 'embedding' column holds a single vector.
df_local = pd.DataFrame( { 'chunk': chunks, 'embedding': list( local_embeddings ) } )

# === 4. Save to Disk ===
df_local.to_parquet( 'public_law_118_32_local_embeddings.parquet', index=False )

# === 5. Preview ===
df_local.head( 2 )



## Fine-Tuning

In [None]:
import json
import openai
import os
import pandas as pd
from pprint import pprint


In [None]:

# API client for the fine-tuning cells; the key is read from the environment,
# the organization and project ids are placeholders to fill in.
client = openai.OpenAI(
    organization="<org id>",
    project="<project id>",
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [None]:
# Read in the dataset we'll use for this task.
# This will be the RecipesNLG dataset, which we've cleaned to only contain documents from www.cookbooks.com
# The cells below read the 'title', 'ingredients' and 'NER' columns.
recipe_df = pd.read_csv("data/cookbook_recipes_nlg_10k.csv")

# Preview the first rows.
recipe_df.head()

In [None]:
# System prompt shared by every fine-tuning example.
system_message = "You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided."


def create_user_message(row):
    """Build the user prompt for one recipe from its 'title' and 'ingredients'."""
    title = row['title']
    ingredients = row['ingredients']
    return f"Title: {title}\n\nIngredients: {ingredients}\n\nGeneric ingredients: "


def prepare_example_conversation(row):
    """Return one chat-format example: system prompt, user prompt, and the row's 'NER' value as the assistant reply."""
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": create_user_message(row)}
    assistant_turn = {"role": "assistant", "content": row["NER"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}



In [None]:
# use the first 100 rows of the dataset for training
# .iloc is positional and end-exclusive, so this is exactly 100 rows; a label
# slice such as .loc[0:100] includes label 100 and would yield 101 rows.
training_df = recipe_df.iloc[:100]

# apply the prepare_example_conversation function to each row of the training_df
training_data = training_df.apply(prepare_example_conversation, axis=1).tolist()

# print a few examples as a sanity check
for example in training_data[:5]:
    print(example)

In [None]:
# Validation split: rows labelled 101 through 200 (a .loc label slice
# includes both endpoints), converted to chat-format examples.
validation_df = recipe_df.loc[101:200]
validation_data = validation_df.apply(
    prepare_example_conversation, axis=1).tolist()

In [None]:
def write_jsonl(data_list: list, filename: str) -> None:
    """Write each item of `data_list` to `filename` as one JSON document per line."""
    with open(filename, "w") as sink:
        sink.writelines(json.dumps(record) + "\n" for record in data_list)

In [None]:
# Write the training and validation examples to local JSONL files.
training_file_name = "tmp_recipe_finetune_training.jsonl"
write_jsonl(training_data, training_file_name)

validation_file_name = "tmp_recipe_finetune_validation.jsonl"
write_jsonl(validation_data, validation_file_name)

In [None]:
def upload_file(file_name: str, purpose: str) -> str:
    """Upload the local file `file_name` through `client.files.create` with the given `purpose` and return the id of the created file."""
    with open(file_name, "rb") as handle:
        uploaded = client.files.create(file=handle, purpose=purpose)
    return uploaded.id

In [None]:
# Upload both JSONL files with purpose "fine-tune" and keep the returned ids.
training_file_id = upload_file(training_file_name, "fine-tune")
validation_file_id = upload_file(validation_file_name, "fine-tune")

In [None]:
# Base model to fine-tune.
MODEL = "gpt-4o-mini-2024-07-18"

# Create the fine-tuning job from the uploaded training/validation files.
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model=MODEL,
    suffix="recipe-ner",
)

# Keep the id of the created job.
job_id = response.id