# NLTK
___


#### 📦 One-Time Setup (NLTK Resources)

In [None]:
import io
import nltk
from nltk.corpus import words
from isort.format import remove_whitespace

# Fetch, in order, every NLTK resource this notebook downloads during setup.
for _resource in ( 'punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4', 'words' ):
	nltk.download( _resource )


##### Load Dependencies

In [170]:
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import re
import os
from pathlib import Path
import openai as OpenAI
import pandas as pd
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sqlite3
import numpy as np
import pickle
import unicodedata
from bs4 import BeautifulSoup
from textblob import Word
from collections import Counter
from typing import Any, List, Tuple, Optional, Union, Dict
import ipywidgets as widgets, IPython, platform, ipywidgets, jupyterlab
from importlib import reload


In [171]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Text Processing

#### ✅ Checklist

| Step | Task                     | Function/Library               |
|------|--------------------------|--------------------------------|
| 1    | Load Text                | `open()`, `pandas.read_csv()` |
| 2    | Convert to Lowercase     | `.lower()`                     |
| 3    | Remove Punctuation       | `string.punctuation`           |
| 4    | Remove Numbers           | `re.sub()`                     |
| 5    | Trim Whitespaces         | `' '.join()`                   |
| 6    | Tokenization             | `nltk.word_tokenize()`         |
| 7    | Remove Stopwords         | `nltk.corpus.stopwords`        |
| 8    | Lemmatization/Stemming   | `WordNetLemmatizer`, `SnowballStemmer` |
| 9    | Reconstruct Clean Text   | `' '.join()`                   |
| 10   | (Optional) Spellcheck    | `TextBlob.correct()`           |
| 11   | Vectorization for ML     | `TfidfVectorizer`, `CountVectorizer`, Word Embeddings |


___

### 1.  Load File

In [172]:
# === Load Raw Text ===
file_path = '<url to file>'
_rawtext = ''


def load_text( file_path ):
	with open( file_path, 'r', encoding='utf-8' ) as f:
		_rawtext = f.read( )
		return _rawtext


### 2.  Clean Space
- Consecutive whitespace reduced to a single space
- Leading/trailing spaces removed
- Blank lines removed

In [422]:
def clean_space( text: str ) -> str:
	"""

		This function:
		_____________
		Removes extra spaces and blank lines from the text.

		Parameters:
		-----------
		text : str
			The raw text to be cleaned.

		Returns:
		--------
		str
			The cleaned text with:
				- Consecutive whitespace (spaces, tabs, newlines) reduced to a single space
				- Leading/trailing spaces removed
				- Blank lines removed

	"""
	# '\s+' already covers tabs and newlines, so a single pass collapses every
	# whitespace run; strip( ) then removes the lone space that a leading or
	# trailing run would otherwise leave behind.
	return re.sub( r'\s+', ' ', text ).strip( )

### 3. Normalize

In [174]:
def normalize( text: str ) -> str:
	"""

		This function: Reduces the text to plain ASCII.
		  - Decomposes characters with Unicode NFKD normalization
		  - Drops every character that cannot be encoded as ASCII,
		    which removes accents (e.g., é -> e)
		  - Leaves letter case and whitespace exactly as they were

		Parameters:
		-----------
		text : str
			The raw text to be normalized.

		Returns:
		--------
		str
			The ASCII-only version of the text.

	"""
	decomposed = unicodedata.normalize( 'NFKD', text )
	ascii_bytes = decomposed.encode( 'ascii', 'ignore' )
	return ascii_bytes.decode( 'utf-8' )

### 4. Remove Punctuation

In [301]:
def remove_punctuation( text: str ) -> str:
    """

		Replace every character that is not an ASCII letter, digit or
		whitespace with a space, then tidy the spacing.

		Args:
			text (str): The text string to clean.

		Returns:
			str: The text with only ASCII alphanumerics, separated by single
			spaces and without leading or trailing whitespace.

    """
    spaced = re.sub( r'[^a-zA-Z0-9\s]', ' ', text )
    collapsed = re.sub( r'\s+', ' ', spaced )
    return collapsed.strip( )

### 5. Trim Whitespace

- Removes leading and trailing whitespace
- Replaces multiple internal spaces with a single space

In [408]:
def trim_whitespace( text: str ) -> str:
	"""

		Purpose:
		---------
		This function:
		  - Removes leading and trailing whitespace
		  - Replaces every internal run of whitespace (spaces, tabs, newlines)
		    with a single space

		Parameters:
		-----------
		text : str
			The raw text with potential extra whitespace.

		Returns:
		--------
		str
			The text with trimmed and normalized whitespace.

	"""
	# Collapsing alone leaves one space at either end when the input starts or
	# ends with whitespace, so the ends are trimmed explicitly.
	return re.sub( r'\s+', ' ', text ).strip( )

### 6. Lemmatize
- Reduces words to their base or root form.
- Converts text to lowercase
- Tokenizes the text into words
- Lemmatizes each token using WordNetLemmatizer
- Reconstructs the lemmatized tokens into a single string

In [177]:
def lemmatize( text: str ) -> str:
	"""

		Performs lemmatization on the text.

		This function:
		  - Converts the text to lowercase
		  - Tokenizes it with word_tokenize
		  - Lemmatizes each token using a WordNetLemmatizer created for the call
		  - Joins the lemmatized tokens with single spaces

		Parameters:
		-----------
		text : str
			The text to be lemmatized.

		Returns:
		--------
		str
			The space-joined lemmatized tokens.

	"""
	wordnet = WordNetLemmatizer( )
	lowered_tokens = word_tokenize( text.lower( ) )
	return ' '.join( wordnet.lemmatize( item ) for item in lowered_tokens )

### 7. Tokenize

- Tokenizes each input string separately (letter case is left unchanged)
- Uses NLTK's word_tokenize to split each string into word and punctuation tokens

In [178]:
def tokenize( words: List[ str ] ) -> List[ List[ str ] ]:
	"""

		This function:
		  - Tokenizes every string in `words` independently
		  - Uses NLTK's word_tokenize to split each string into word and punctuation tokens
		  - Leaves letter case unchanged

		Parameters:
		-----------
		words : List[ str ]
			A list of strings to be tokenized.

		Returns:
		--------
		List[ List[ str ] ]
			One list of tokens per input string, in input order.

	"""
	# One token list per input string; the result is a list of lists, not a
	# flat list of tokens.
	return [ nltk.word_tokenize( w ) for w in words ]


### 8. Remove Special
- Retains letters, digits, spaces, and the dollar sign `$`
- Removes symbols like @, #, %, &, etc.
- Replaces hyphens with a space

In [441]:
def remove_special( text: str ) -> str:
	"""

		Remove special characters from the text string.

		Kept unchanged:
			- Letters, digits and the space character
			- The dollar sign '$'
			- '.', ';' and ':' when immediately followed by a space
		Replaced by one space:
			- '-' and the two-character dash '--'
		Every other character (symbols, tabs, newlines) is dropped.

		Args:
			text (str): The text string to process.

		Returns:
			str: The processed string with special characters removed.

	"""
	punctuation = ( '.', ';', ':' )
	cleaned = [ ]
	index = 0
	length = len( text )
	while index < length:
		char = text[ index ]
		if char == '-':
			# Consume '--' as one separator so it yields one space, not two.
			index += 2 if text.startswith( '--', index ) else 1
			cleaned.append( ' ' )
			continue
		if char == '$' or char == ' ' or char.isalnum( ):
			cleaned.append( char )
		elif char in punctuation and text.startswith( ' ', index + 1 ):
			# A single character can never equal a two-character string such
			# as '. ', so the following character is inspected instead.
			cleaned.append( char )
		index += 1

	return ''.join( cleaned )

In [346]:
def remove_numerals( chunks: List[ str ] ) -> str:
	"""

		Lowercase every chunk, blank out the ones that are Roman numerals,
		and concatenate the result.

		A chunk is replaced by a single space when its lowercase form equals
		one of: ' i', 'ii', 'iii', 'iv', ' v', 'vi', 'vii', 'viii', 'ix'.
		The entries for 'i' and 'v' include a leading space, so a bare 'i' or
		'v' chunk is not treated as a numeral.

		Args:
			chunks (List[ str ]): The text pieces to process.

		Returns:
			str: All processed pieces joined without any separator.

	"""
	numerals = ( ' i', 'ii', 'iii', 'iv', ' v', 'vi', 'vii', 'viii', 'ix' )
	lowered = ( piece.lower( ) for piece in chunks )
	return ''.join( ' ' if piece in numerals else piece for piece in lowered )

In [180]:
def remove_special_characters( text: str, keep_spaces: bool=True ) -> str:
	"""

		Purpose:
		_______
		Remove special characters from the text.

		Args:
			text (str): Input string to clean.
			keep_spaces (bool): If True, whitespace is preserved and every other
				non-alphanumeric character is replaced by a space. If False,
				every non-alphanumeric character, whitespace included, is
				removed outright.

		Returns:
			str: Cleaned text with only alphanumeric characters (and optionally spaces).
	"""
	if keep_spaces:
		return re.sub( r'[^a-zA-Z0-9\s]', ' ', text )
	# Substituting a space here would put back the very characters the caller
	# asked to drop, so the matches are deleted.
	return re.sub( r'[^a-zA-Z0-9]', '', text )

### 9. Remove HTML
- Parses the text as HTML
- Extracts and returns only the visible content without tags

In [181]:
def remove_html_tags( text: str ) -> str:
	"""

		This function:
		Removes HTML tags from the text.
		  - Parses the text as HTML
		  - Extracts and returns only the text content without tags

		Parameters:
		-----------
		text : str
			The text containing HTML tags.

		Returns:
		--------
		str
			The text content, with the individual text nodes stripped and
			joined by single spaces.

	"""
	# 'html.parser' is the parser bundled with Python; 'raw_html.parser' is not
	# a parser name BeautifulSoup recognizes.
	soup = BeautifulSoup( text, "html.parser" )
	return soup.get_text( separator=' ', strip=True )

### 10. Chunk Words
- Tokenizes the text into words
- Groups them into consecutive word chunks
- Returns a list of strings (each chunk_words)

In [182]:
def chunk_words( text: List[ List[ str ] ], chunk_size: int=50 ) -> List[ List[ str ] ]:
	"""

		Breaks a list of token lists into chunks of a fixed number of tokens.

		This function:
		  - Flattens the list of token lists into one sequence of tokens
		  - Groups the tokens into consecutive chunks of `chunk_size`
		    (the last chunk may be shorter)
		  - Returns a list of lists of tokens

		Parameters:
		-----------
		text : List[ List[ str ] ]
			A list where each element is a list of tokens (words).

		chunk_size : int, optional (default=50)
			Number of tokens per chunk. Must be at least 1.

		Returns:
		--------
		List[ List[ str ] ]
			The token chunks. Each chunk is a list of tokens.

		Raises:
		-------
		ValueError
			If `chunk_size` is smaller than 1.

	"""
	# A negative step would make range( ) silently produce no chunks at all.
	if chunk_size < 1:
		raise ValueError( 'chunk_size must be a positive integer' )

	# Flatten the list of token lists into a single list
	all_tokens = [ token for sublist in text for token in sublist ]

	return [ all_tokens[ i : i + chunk_size ] for i in range( 0, len( all_tokens ), chunk_size ) ]

### 10. Chunk Pages

In [424]:
def chunk_pages( text: List[ str ], chunk_size: int=250 ) -> List[ str ]:
	"""

		Regroups the items of every element of `text` into fixed-size strings.

		This function:
		  - Iterates each element of `text` item by item, so a plain string
		    contributes its individual characters
		  - Groups the collected items into consecutive chunks of `chunk_size`
		    (the last chunk may be shorter)
		  - Concatenates the items of each chunk without a separator

		Parameters:
		-----------
		text : List[ str ]
			The pages to regroup.

		chunk_size : int, optional (default=250)
			Number of items per chunk.

		Returns:
		--------
		List[ str ]
			One concatenated string per chunk.

	"""
	pieces = [ ]
	for page in text:
		pieces.extend( page )

	joined_chunks = [ ]
	for start in range( 0, len( pieces ), chunk_size ):
		joined_chunks.append( ''.join( pieces[ start : start + chunk_size ] ) )
	return joined_chunks

### 11. Chunk Text

- Converts text to lowercase
- Tokenizes text using NLTK's word_tokenize
- Breaks tokens into chunks of a specified size
- Optionally joins tokens into strings (for transformer models)

In [184]:
def chunk_text( text: str, chunk_size: int=50,
		return_as_string: bool=True ) -> Union[ List[ str ], List[ List[ str ] ] ]:
	"""

		This function:
		Tokenizes the text and breaks it into chunks.
		  - Tokenizes the text using NLTK's word_tokenize (case is unchanged)
		  - Breaks the tokens into consecutive chunks of `chunk_size`
		    (the last chunk may be shorter)
		  - Optionally joins each chunk's tokens into one string

		Parameters:
		-----------
		text : str
			The text to be tokenized and chunked.

		chunk_size : int, optional (default=50)
			Number of tokens per chunk.

		return_as_string : bool, optional (default=True)
			If True, each chunk is returned as a space-joined string;
			otherwise each chunk is returned as a list of tokens.

		Returns:
		--------
		list
			The chunks, as strings or as token lists depending on
			`return_as_string`.

	"""
	tokens = nltk.word_tokenize( text )
	token_chunks = [ tokens[ i:i + chunk_size ] for i in range( 0, len( tokens ), chunk_size ) ]
	if not return_as_string:
		# Hand back the raw token lists when the caller asks for them.
		return token_chunks
	return [ ' '.join( chunk ) for chunk in token_chunks ]

### 12. Remove Errors
- Converts text to lowercase
- Tokenizes the text into words and numbers with a regular expression
- Filters out words not found in the NLTK `words` English vocabulary (all-digit tokens are kept)
- Returns a string with only the recognized words and numbers

In [185]:
_ENGLISH_VOCABULARY = None


def _english_vocabulary( ):
    """Return the lowercased NLTK word list as a set, building it on first use only."""
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        _ENGLISH_VOCABULARY = frozenset( w.lower( ) for w in words.words( ) )
    return _ENGLISH_VOCABULARY


def remove_errors( text: str ) -> str:
    """

		Purpose:
		________
		Remove all non-English
		words but preserve whole numbers
		using the NLTK English vocabulary.

		The text is lowercased and split with the pattern r'\\b[\\w.]+\\b'.
		A token is kept when it is in the vocabulary or consists only of
		digits. Punctuation is never emitted, because that pattern only yields
		tokens that begin and end with a word character.

		NOTE(review): a decimal such as '3.14' is a single token that is
		neither all digits nor a vocabulary word, so it is dropped; confirm
		whether decimals should be preserved.

		Args:
			text (str): The text string to filter.

		Returns:
			str: The kept tokens, lowercased and joined by single spaces.

    """
    # The vocabulary is cached at module level; rebuilding the set from the
    # whole corpus on every call dominated the running time.
    vocabulary = _english_vocabulary( )

    # Tokenize: includes words and numbers
    tokens = re.findall( r'\b[\w.]+\b', text.lower( ) )

    return ' '.join( tok for tok in tokens if tok in vocabulary or tok.isdigit( ) )

### 13. Correct Errors
- Converts text to lowercase
- Tokenizes the text into words
- Applies spelling correction using TextBlob
- Reconstructs and returns the corrected text

In [186]:
def correct_errors( text: str ) -> str:
	"""

		Corrects misspelled words in the text.

		This function:
		  - Converts the text to lowercase
		  - Tokenizes it with word_tokenize
		  - Runs TextBlob's Word.correct( ) on every token
		  - Joins the corrected tokens with single spaces

		Parameters:
		-----------
		text : str
			The text with potential spelling mistakes.

		Returns:
		--------
		str
			The space-joined corrected tokens.

	"""
	fixed = [ ]
	for token in word_tokenize( text.lower( ) ):
		fixed.append( str( Word( token ).correct( ) ) )
	return ' '.join( fixed )

### 14.  Remove Headers
- Assumes repeated lines at the top or bottom (like titles, page numbers)
- Removes lines that are common across multiple pages (heuristic)
- Returns cleaned text with main body content only

In [187]:
def remove_headers_footers( text: str ) -> str:
	"""
    Removes frequently repeated lines (a heuristic for headers and footers).

    This function:
      - Splits the text into lines, strips each one and discards empty lines
      - Counts how often each remaining line occurs
      - Drops every line whose count exceeds max( 1, 1% of the line count )
      - Joins the surviving lines with newlines

    Parameters:
    -----------
    text : str
        The text potentially containing headers/footers.

    Returns:
    --------
    str
        The text without the repeated lines and without blank lines.
    """
	stripped = ( raw.strip( ) for raw in text.splitlines( ) )
	lines = [ line for line in stripped if line ]

	# A line is kept only while its frequency stays at or below this cutoff
	cutoff = max( 1, int( len( lines ) * 0.01 ) )
	frequency = Counter( lines )

	kept = [ ]
	for line in lines:
		if frequency[ line ] <= cutoff:
			kept.append( line )
	return '\n'.join( kept )


### 15.  Remove Formatting
- Strips HTML tags
- Removes Markdown syntax (e.g., *, #, [], etc.)
- Collapses whitespace (newlines, tabs)
- Optionally removes special characters for clean unformatted text

In [188]:
def remove_formatting( text: str ) -> str:
	"""

		Removes formatting artifacts (HTML, Markdown, extra whitespace) from text.

		This function:
		  - Strips HTML tags
		  - Removes Markdown images and links (including their link text)
		  - Removes the Markdown characters ` _ * # ~ > -
		  - Collapses whitespace (newlines, tabs) and trims the ends

		Parameters:
		-----------
		text : str
			The formatted text.

		Returns:
		--------
		str
			The text with formatting removed.

	"""
	# Remove HTML tags ('html.parser' is the parser bundled with Python;
	# 'raw_html.parser' is not a parser name BeautifulSoup recognizes)
	text = BeautifulSoup( text, "html.parser" ).get_text( separator=' ', strip=True )

	# Remove Markdown syntax. Images go first: the link pattern also matches
	# the '[alt](url)' part of an image and would leave a stray '!' behind.
	text = re.sub( r'!\[.*?\]\(.*?\)', '', text )  # Markdown images
	text = re.sub( r'\[.*?\]\(.*?\)', '', text )  # Markdown links
	text = re.sub( r'[`_*#~>-]', '', text )  # Markdown chars

	# Remove and normalize whitespace
	text = re.sub( r'[\r\n\t]+', ' ', text )  # Newlines, tabs
	text = re.sub( r'\s+', ' ', text ).strip( )  # Collapse multiple spaces

	return text

### 16. Remove Stopwords
- Tokenizes the input text
- Removes common stopwords (e.g., "the", "is", "and", etc.)
- Returns the text with only meaningful words

### 17. Split Sentences

In [189]:
def split_sentences( text: str ) -> List[ str ]:
    """

		Purpose:
		________
		Split the text into sentences by delegating to
		nltk.sent_tokenize.

		Parameters
		----------
		text : str
			The raw text string to be segmented into sentences.

		Returns
		-------
		List[str]
			The sentences returned by nltk.sent_tokenize for the text.

    """
    sentences = nltk.sent_tokenize( text )
    return sentences

### 🔍 Usage

In [190]:
def clean_text( text: str ) -> str:
	"""

		Flatten extracted document text into one clean line.

		Steps, in order:
		  - Normalize '\\r\\n' and '\\r' line endings to '\\n'
		  - Drop lines that contain only a number (page numbers)
		  - Re-join words hyphenated across a line break
		  - Turn single line breaks into spaces
		  - Collapse every whitespace run into one space and trim the ends

		Parameters:
		-----------
		text : str
			The raw document text.

		Returns:
		--------
		str
			The cleaned, single-line text without leading or trailing whitespace.

	"""
	_first = text.replace( '\r\n', '\n' ).replace( '\r', '\n' )
	_second = re.sub( r'\n\s*\d+\s*\n', '\n', _first )
	_third = re.sub( r'(\w+)-\n(\w+)', r'\1\2', _second )
	_fourth = re.sub( r'(?<!\n)\n(?![\n])', ' ', _third )
	_retval = re.sub( r'\s+', ' ', _fourth )
	# Trimmed so the result matches the other clean_text definition in this
	# notebook, which strips its return value.
	return _retval.strip( )

### 🔍 Pipeline

In [19]:
# === Preprocessing Configuration ===
# Module-level resources built once and reused by preprocess_line for every token.
EN_STOPWORDS = set( stopwords.words( 'english' ) )  # a set, so membership tests are fast
LEMMATIZER = WordNetLemmatizer( )
STEMMER = SnowballStemmer( 'english' )


def preprocess_line( line, lower=True, punctuation=True,
                     stopwords=False, lemmatize=True, stem=False ):
	'''
		Process a single line of text with optional steps, applied per token
		in this order:
		- lower: lowercase the token
		- punctuation: drop tokens made up solely of punctuation characters
		- stopwords: drop English stopwords (matched case-insensitively)
		- lemmatize: lemmatize with the shared WordNet lemmatizer
		- stem: stem with the shared Snowball stemmer
		Returns the surviving tokens joined by single spaces.
    '''
	processed = [ ]
	for token in word_tokenize( line ):
		if lower:
			token = token.lower( )

		# `token in string.punctuation` is a substring test, so multi-character
		# punctuation tokens ("''", "...", "--") slipped through; every
		# character is checked instead.
		if punctuation and all( char in string.punctuation for char in token ):
			continue

		# The stopword set is lowercase; compare case-insensitively so the
		# filter also works when lower=False.
		if stopwords and token.lower( ) in EN_STOPWORDS:
			continue

		if lemmatize:
			token = LEMMATIZER.lemmatize( token )

		if stem:
			token = STEMMER.stem( token )

		processed.append( token )

	return ' '.join( processed )


def process_file( file_path, **preprocess_kwargs ):
	'''
        Read a UTF-8 text file line by line and run `preprocess_line` on
        every line.

        Any keyword arguments are forwarded unchanged to `preprocess_line`.
        Returns a list with one cleaned string per input line, in file order.
    '''
	with open( file_path, 'r', encoding='utf-8' ) as source:
		return [ preprocess_line( raw, **preprocess_kwargs ) for raw in source ]


#  Vectorization (Preparation for ML Models)
___





##### Load Dependencies

In [27]:
# === Load Raw Text ===
file_path = '<url to file>'
_rawtext = ''


def load_text( file_path ):
	with open( file_path, 'r', encoding='utf-8' ) as f:
		_rawtext = f.read( )
		return _rawtext

### 🧮 1. Bag of Words (BoW) using CountVectorizer

In [29]:
# Two-sentence toy corpus for the bag-of-words demonstration
corpus = [ 'Bro loves clean code.', 'Code is life.' ]
# CountVectorizer with its default settings
vectorizer = CountVectorizer( )
X = vectorizer.fit_transform( corpus )

# Print the feature names, then the per-document counts as a dense array
print( vectorizer.get_feature_names_out( ) )
print( X.toarray( ) )


['bro' 'clean' 'code' 'is' 'life' 'loves']
[[1 1 1 0 0 1]
 [0 0 1 1 1 0]]


### 📊 2. TF-IDF using TfidfVectorizer

In [30]:
# Two-sentence toy corpus for the TF-IDF demonstration
corpus = [ 'Bro writes awesome code.', 'Code must be clean and clear.' ]
# TfidfVectorizer with its default settings
vectorizer = TfidfVectorizer( )
X = vectorizer.fit_transform( corpus )

# Print the feature names, then the per-document TF-IDF weights as a dense array
print( vectorizer.get_feature_names_out( ) )
print( X.toarray( ) )


['and' 'awesome' 'be' 'bro' 'clean' 'clear' 'code' 'must' 'writes']
[[0.         0.53404633 0.         0.53404633 0.         0.
  0.37997836 0.         0.53404633]
 [0.4261596  0.         0.4261596  0.         0.4261596  0.4261596
  0.30321606 0.4261596  0.        ]]


### 🧠 3. Word2Vec using gensim

In [31]:
# Training input: each sentence is already a list of tokens
sentences = [ [ 'bro', 'loves', 'python' ], [ 'clean', 'code', 'rocks' ] ]
# min_count=1 keeps words that occur only once, which every word in this toy corpus does
model = Word2Vec( sentences, vector_size=100, window=5, min_count=1, workers=4 )

# VectorStore for the word 'bro'
vector = model.wv[ 'bro' ]
print( vector )


[-8.7274825e-03  2.1301615e-03 -8.7354420e-04 -9.3190884e-03
 -9.4281426e-03 -1.4107180e-03  4.4324086e-03  3.7040710e-03
 -6.4986930e-03 -6.8730675e-03 -4.9994122e-03 -2.2868442e-03
 -7.2502876e-03 -9.6033178e-03 -2.7436293e-03 -8.3628409e-03
 -6.0388758e-03 -5.6709289e-03 -2.3441375e-03 -1.7069972e-03
 -8.9569986e-03 -7.3519943e-04  8.1525063e-03  7.6904297e-03
 -7.2061159e-03 -3.6668312e-03  3.1185520e-03 -9.5707225e-03
  1.4764392e-03  6.5244664e-03  5.7464195e-03 -8.7630618e-03
 -4.5171441e-03 -8.1401607e-03  4.5956374e-05  9.2636338e-03
  5.9733056e-03  5.0673080e-03  5.0610625e-03 -3.2429171e-03
  9.5521836e-03 -7.3564244e-03 -7.2703874e-03 -2.2653891e-03
 -7.7856064e-04 -3.2161034e-03 -5.9258583e-04  7.4888230e-03
 -6.9751858e-04 -1.6249407e-03  2.7443992e-03 -8.3591007e-03
  7.8558037e-03  8.5361041e-03 -9.5840869e-03  2.4462664e-03
  9.9049713e-03 -7.6658037e-03 -6.9669187e-03 -7.7365171e-03
  8.3959233e-03 -6.8133592e-04  9.1444086e-03 -8.1582209e-03
  3.7430846e-03  2.63504

### 🌍 4. GloVe using gensim (with pre-trained vectors)


In [None]:
# Load GloVe vectors straight from the original GloVe text file.
# That format has no leading "<vocab_size> <dimensions>" header line, which
# load_word2vec_format expects by default; no_header=True makes gensim infer
# both values from the file. If the file was already converted to word2vec
# format (i.e. it has that header line), remove no_header=True.
glove_file = r'C:\Users\terry\source\llm\glove\glove.6B.100d.txt'
model = KeyedVectors.load_word2vec_format( glove_file, binary=False, no_header=True,
	unicode_errors='ignore' )

# VectorStore for the word 'code'
vector = model[ 'code' ]
print( vector )


### 🤖 5. BERT / Transformer-based Embeddings using transformers + torch


In [33]:
# NOTE(review): BertTokenizer and BertModel are not imported anywhere in the
# visible notebook — presumably they come from the `transformers` package;
# confirm and add the import before running this cell.
tokenizer = BertTokenizer.from_pretrained( 'bert-base-uncased' )
model = BertModel.from_pretrained( 'bert-base-uncased' )

sentence = "Bro's code always works."
# return_tensors='pt' requests PyTorch tensors
inputs = tokenizer( sentence, return_tensors='pt' )
outputs = model( **inputs )

# Get the vector for [CLS] token (sentence embedding)
# Index 0 along the second axis selects the first token of every sequence
sentence_embedding = outputs.last_hidden_state[ :, 0, : ]
print( sentence_embedding.shape )


torch.Size([1, 768])


#### Clean Document

In [35]:
def clean_text( text: str ) -> str:
	"""

		Flatten extracted document text into one clean line.

		After '\\r\\n' and '\\r' line endings are normalized to '\\n', these
		rewrites run in order:
		  - Lines containing only a number are dropped
		  - Words hyphenated across a line break are re-joined
		  - Single line breaks become spaces
		  - Every whitespace run collapses into one space

		Parameters:
		-----------
		text : str
			The raw document text.

		Returns:
		--------
		str
			The cleaned text without leading or trailing whitespace.

	"""
	rewrites = (
		( r'\n\s*\d+\s*\n', '\n' ),
		( r'(\w+)-\n(\w+)', r'\1\2' ),
		( r'(?<!\n)\n(?![\n])', ' ' ),
		( r'\s+', ' ' ),
	)
	result = text.replace( '\r\n', '\n' ).replace( '\r', '\n' )
	for pattern, replacement in rewrites:
		result = re.sub( pattern, replacement, result )
	return result.strip( )




#  OpenAI Embedding
___

##### API key

In [None]:
# Create client
# `OpenAI` is the openai *module* here (this notebook does `import openai as
# OpenAI`), and a module cannot be called; the client class is OpenAI.OpenAI.
client = OpenAI.OpenAI( api_key=os.getenv( 'OPENAI_API_KEY' ) )

#### 1. Define embedding function

In [32]:
def embed_texts( texts, model='text-embedding-3-small', batch_size=10, sleep=1, max_retries=3 ):
	"""

		Embed a list of texts with the OpenAI embeddings endpoint, in batches.

		Parameters:
		-----------
		texts : list of str
			The texts to embed.

		model : str, optional (default='text-embedding-3-small')
			Name of the embedding model to request.

		batch_size : int, optional (default=10)
			Number of texts sent per request.

		sleep : float, optional (default=1)
			Seconds to wait after a failed request before trying again.

		max_retries : int, optional (default=3)
			Number of attempts made for each batch.

		Returns:
		--------
		list
			One entry per input text, in input order: the embedding, or None
			for every text of a batch that failed on all attempts.

	"""
	# Imported locally: at module level this notebook binds the package to the
	# name `OpenAI` (import openai as OpenAI), so `openai` may be unbound here.
	import openai

	embeddings = [ ]
	for i in range( 0, len( texts ), batch_size ):
		batch = texts[ i:i + batch_size ]
		for _attempt in range( max_retries ):
			try:
				response = openai.embeddings.create( input=batch, model=model )
			except Exception as e:
				print( f'Error at batch {i}: {e}' )
				# Back off before the next attempt to avoid rate limits
				time.sleep( sleep )
			else:
				embeddings.extend( item.embedding for item in response.data )
				break
		else:
			# Every attempt failed. Skipping the batch would shift all later
			# embeddings against `texts`, so placeholders keep them aligned.
			embeddings.extend( [ None ] * len( batch ) )

	return embeddings


#### 2. Embed chunks

In [None]:
# 2. Embed chunks
embeddings = embed_texts( chunks )

#### 3.  Create DataFrame

In [None]:
# 3. Create DataFrame
# A dict of named columns is required; `{ chunks, embeddings }` is a set
# literal and raises TypeError because lists are unhashable.
df_embeddings = pd.DataFrame( { 'chunk_words': chunks, 'embedding': embeddings } )


#### 3. Save


In [None]:
# 3. Save
df_embeddings.to_parquet( 'public_law_118_32_embeddings.parquet', index=False )


#### 4. Preview

In [None]:
# 4. Preview
df_embeddings.head( 2 )


### 3. Generate Embeddings
- Use a language model (e.g., OpenAI, HuggingFace) to create vector representations of each chunk_words.

In [None]:
# Imported here because the notebook's only other import of
# SentenceTransformer sits in a later cell.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer( 'all-MiniLM-L6-v2' )
embeddings = model.encode( chunks, show_progress_bar=True )


### 4. Create SQLite Database

- Design a table that links text chunks to their embeddings.

In [None]:
conn = sqlite3.connect( 'vectors.target_values' )
try:
	cursor = conn.cursor( )
	sql_create = '''
CREATE TABLE IF NOT EXISTS Law_Embeddings
(
    Id INTEGER PRIMARY KEY AUTOINCREMENT,
    Chunk_Tokens TEXT NOT NULL,
    Embedding BLOB NOT NULL
)
'''

	cursor.execute( sql_create )

	for chunk, vector in zip( chunks, embeddings ):
		# Each embedding is serialized with pickle and stored as a BLOB
		blob = pickle.dumps( vector )
		cursor.execute( 'INSERT INTO Law_Embeddings ( Chunk_Tokens, Embedding ) VALUES (?, ?)',
			(chunk, blob) )

	conn.commit( )
finally:
	# Release the database handle even when table creation or an insert fails
	conn.close( )


###  Retrieval (Vector Search in SQLite)

- You can perform semantic search by encoding a query and comparing via cosine similarity


In [None]:
def cosine_similarity( a, b ):
	"""

		Cosine similarity of two vectors.

		Parameters:
		-----------
		a, b : array-like
			Vectors accepted by np.dot and np.linalg.norm.

		Returns:
		--------
		float
			dot( a, b ) / ( |a| * |b| ), or 0.0 when either vector has zero length.

	"""
	denominator = np.linalg.norm( a ) * np.linalg.norm( b )
	if denominator == 0:
		# An all-zero vector has no direction; report "no similarity" instead
		# of dividing by zero, which yields nan and breaks sorting by score.
		return 0.0
	return np.dot( a, b ) / denominator

In [None]:
query = 'Appropriations for Department of Defense'
query_vec = model.encode( [ query ] )[ 0 ]

conn = sqlite3.connect( 'vectors.target_values' )
try:
	cursor = conn.cursor( )
	cursor.execute( 'SELECT Id, Chunk_Tokens, Embedding FROM Law_Embeddings' )

	results = [ ]
	# The stored text is bound to `stored_text`: naming it `chunk_text` at
	# module level would overwrite the chunk_text( ) function used by main( ).
	for chunk_id, stored_text, blob in cursor.fetchall( ):
		# SECURITY: pickle.loads can execute arbitrary code; only read
		# databases this notebook created itself.
		stored_vec = pickle.loads( blob )
		sim = cosine_similarity( query_vec, stored_vec )
		results.append( (sim, stored_text) )
finally:
	# Close the connection even if decoding or scoring a row fails
	conn.close( )

# Sort and get top N
top_matches = sorted( results, key=lambda x: x[ 0 ], reverse=True )[ :5 ]


#   Embedding-Pipeline Script
___

##### Load Dependencies



In [33]:
import re
import sqlite3
import numpy as np
from tqdm import tqdm
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter


#### Configuration

In [None]:
# Define paths
TEXT_FILE = 'PublicLaw_118-42.txt'  # input document read by main( )
DB_FILE = 'law_embeddings.target_values'  # SQLite file written by main( )
# NOTE(review): the three settings below are not referenced by any code visible
# in this notebook (get_embedding takes its default model from OPENAI_MODEL and
# main( ) calls chunk_text with its default size) — confirm whether they are
# still needed.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


#### Load and Clean Raw Text

In [None]:
def load_and_clean_text( filepath ):
	"""

		Read a text file and flatten it into a single line.

		The file is decoded as UTF-8 with undecodable bytes ignored. Runs of
		form feeds, runs of newlines, and runs of two or more whitespace
		characters are each replaced by one space, in that order, and the
		result is stripped.

		Parameters:
		-----------
		filepath : str
			Location of the file to read.

		Returns:
		--------
		str
			The flattened text.

	"""
	with open( filepath, 'r', encoding='utf-8', errors='ignore' ) as source:
		text = source.read( )

	# Basic normalization, one pattern after another
	for pattern in ( r'\f+', r'\n+', r'\s{2,}' ):
		text = re.sub( pattern, ' ', text )
	return text.strip( )


#### Generate Embeddings

In [None]:
# Default model for get_embedding. OPENAI_MODEL was referenced but never
# defined in this notebook; the value matches the default used by embed_texts.
OPENAI_MODEL = 'text-embedding-3-small'


def get_embedding( text, model=OPENAI_MODEL ):
	"""

		Request the embedding of a single text from the OpenAI API.

		Parameters:
		-----------
		text : str
			The text to embed.

		model : str, optional (default=OPENAI_MODEL)
			Name of the embedding model to request.

		Returns:
		--------
		list of float
			The embedding vector of the first (and only) input.

	"""
	# Same call shape as embed_texts: embeddings.create( ) returns an object
	# whose `.data` holds one item per input, each exposing `.embedding`.
	response = openai.embeddings.create( input=text, model=model )
	return response.data[ 0 ].embedding


def embed_chunks( chunks ):
	"""

		Embed every chunk with get_embedding, showing a tqdm progress bar.

		A chunk whose request raises is reported on stdout and represented by
		a 1536-element vector of zeros, so the result always has one entry
		per chunk.

		Parameters:
		-----------
		chunks : iterable of str
			The text chunks to embed.

		Returns:
		--------
		list
			One embedding (or zero-vector placeholder) per chunk, in order.

	"""
	vectors = [ ]
	for piece in tqdm( chunks, desc='EmbeddingRequest chunks via OpenAI' ):
		try:
			vector = get_embedding( piece )
		except Exception as e:
			print( f'Error embedding chunk_words: {e}' )
			vector = [ 0.0 ] * 1536  # Placeholder for failed requests
		vectors.append( vector )
	return vectors


##### Create SQLite DB



In [None]:
def create_and_populate_db( chunks, embeddings, db_path ):
	"""

		Store text chunks and their embeddings in a SQLite database.

		The Law_Embeddings table is created if it does not exist. Each chunk
		is inserted together with its embedding serialized by pickle. Chunks
		and embeddings are paired with zip, so extra items of the longer
		sequence are ignored.

		Parameters:
		-----------
		chunks : iterable of str
			The text chunks.

		embeddings : iterable
			One picklable embedding per chunk.

		db_path : str
			Path of the SQLite database file.

	"""
	sql_create = '''
    CREATE TABLE IF NOT EXISTS Law_Embeddings
    (
        Id INTEGER PRIMARY KEY AUTOINCREMENT,
        Chunk_Tokens TEXT NOT NULL,
        Embedding BLOB NOT NULL
    )
    '''
	sql_insert = 'INSERT INTO Law_Embeddings ( Chunk_Tokens, Embedding ) VALUES ( ?, ? )'
	rows = [ ( chunk, pickle.dumps( vector ) ) for chunk, vector in zip( chunks, embeddings ) ]

	conn = sqlite3.connect( db_path )
	try:
		# `with conn` commits on success and rolls back if a statement fails,
		# so a failed run never leaves a half-written batch behind.
		with conn:
			conn.execute( sql_create )
			conn.executemany( sql_insert, rows )
	finally:
		# The connection context manager does not close the connection itself
		conn.close( )


#### Script

In [None]:
# === MAIN ===
def main( ):
	"""

		Run the embedding pipeline end to end, announcing each step on stdout:
		load and clean TEXT_FILE, chunk it with chunk_text, embed the chunks
		with embed_chunks, and store chunks plus embeddings in DB_FILE.

	"""
	print( 'Step 1: Load and clean documents' )
	document = load_and_clean_text( TEXT_FILE )

	print( 'Step 2: Chunking documents' )
	chunks = chunk_text( document )
	print( f'Total chunks: {len( chunks )}' )

	print( 'Step 3: EmbeddingRequest with OpenAI API' )
	vectors = embed_chunks( chunks )

	print( 'Step 4: Saving to SQLite' )
	create_and_populate_db( chunks, vectors, DB_FILE )

	print( f'Pipeline complete. Embeddings stored in: {DB_FILE}' )


# Run the pipeline only when this code is executed as the main module
if __name__ == '__main__':
	main( )

#### Embeddings

In [194]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd


# === 1. Load Model ===
# You can try other models like 'all-MiniLM-L6-v2', 'all-mpnet-base-v2', or 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer( 'all-MiniLM-L6-v2' )


# === 2. Embed Chunks ===
def embed_with_sentence_transformers( texts, model ):
	"""

		Encode `texts` with the given model.

		Calls model.encode with the progress bar enabled and NumPy output
		requested, and returns its result unchanged.

		Parameters:
		-----------
		texts : list of str
			The texts to embed.

		model : object
			Any object exposing an `encode` method with the
			`show_progress_bar` and `convert_to_numpy` keywords.

	"""
	encoded = model.encode( texts, show_progress_bar=True, convert_to_numpy=True )
	return encoded


local_embeddings = embed_with_sentence_transformers( chunks, model )

# === 3. Save in a DataFrame ===
df_local = pd.DataFrame(
{
	'chunk_words': chunks,
	'embedding': list( local_embeddings )  # numpy arrays to a list for DataFrame compatibility
} )

# === 4. Save to Disk ===
df_local.to_parquet( 'public_law_118_32_local_embeddings.parquet', index=False )

# === 5. Preview ===
df_local.head( 2 )



KeyboardInterrupt: 

## Fine-Tuning

In [193]:
import json
import openai
import os
import pandas as pd
from pprint import pprint


In [None]:

client = openai.OpenAI(
	api_key=os.environ.get( 'OPENAI_API_KEY' ),
	organization='<org id>',
	project='<project id>',
)

In [37]:
# Read in the dataset we'll use for this task.
# This is the RecipesNLG dataset, which we've cleaned to only contain documents from www.cookbooks.com
recipe_df = pd.read_csv( r'C:\Users\terry\Desktop\cookbook_recipes_nlg_10k.csv' )
recipe_df.head( )

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,www.cookbooks.com,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,www.cookbooks.com,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,www.cookbooks.com,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,www.cookbooks.com,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,www.cookbooks.com,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [None]:
# System prompt shared by every fine-tuning example.
system_message = 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'


def create_user_message( row ):
	'''
	Build the user prompt for a single recipe.

	Parameters:
		row: mapping-like record (for example a DataFrame row) that can be
			indexed with 'title' and 'ingredients'.

	Returns:
		str: the recipe title and ingredients followed by an empty
			'Generic ingredients: ' slot.
	'''
	# The keys use double quotes because reusing the f-string's own quote
	# character inside a replacement field is a SyntaxError before Python 3.12.
	return f'Title: {row[ "title" ]}\n\nIngredients: {row[ "ingredients" ]}\n\nGeneric ingredients: '


def prepare_example_conversation( row ):
	'''
	Convert one recipe record into a chat-format training example.

	Parameters:
		row: mapping-like record; its 'NER' entry is used as the assistant
			reply and the record is passed to create_user_message for the
			user turn.

	Returns:
		dict: {'messages': [system turn, user turn, assistant turn]}, where
			each turn is a dict with 'role' and 'content' keys.
	'''
	system_turn = { 'role': 'system', 'content': system_message }
	user_turn = { 'role': 'user', 'content': create_user_message( row ) }
	assistant_turn = { 'role': 'assistant', 'content': row[ 'NER' ] }
	return { 'messages': [ system_turn, user_turn, assistant_turn ] }



In [None]:
# use the first 100 rows of the dataset for training
# .iloc slices are end-exclusive, so this selects exactly 100 rows; the
# label-based .loc[ 0:100 ] used before includes both endpoints and selected 101.
# The row labelled 100 is therefore unused: the validation cell starts at label 101.
training_df = recipe_df.iloc[ :100 ]

# apply the prepare_example_conversation function to each row of the training_df
training_data = training_df.apply( prepare_example_conversation, axis=1 ).tolist( )

# show the first few examples as a sanity check
for example in training_data[ :5 ]:
	print( example )

In [None]:
# Rows labelled 101 through 200 form the validation split
# (label slices with .loc include both endpoints).
validation_df = recipe_df.loc[ 101:200 ]
# Build one chat-format example per row, then collect them into a plain list.
validation_examples = validation_df.apply( prepare_example_conversation, axis=1 )
validation_data = validation_examples.tolist( )

In [None]:
def write_jsonl( data: List[ Dict ], filename: str ) -> None:
	'''
	Write records to a JSON Lines file.

	Parameters:
		data: the records to serialize; each one becomes a single line.
		filename: path of the file to create or overwrite.

	Returns:
		None
	'''
	with open( filename, 'w' ) as sink:
		# One json.dumps document per record, each terminated by a newline.
		sink.writelines( json.dumps( record ) + '\n' for record in data )

In [None]:
# Output locations for the two JSONL datasets.
training_file_name = 'tmp_recipe_finetune_training.jsonl'
validation_file_name = 'tmp_recipe_finetune_validation.jsonl'

# Serialize each split to its own file.
write_jsonl( data=training_data, filename=training_file_name )
write_jsonl( data=validation_data, filename=validation_file_name )

In [None]:
def upload_file( file_name: str, purpose: str ) -> str:
	'''
	Upload a local file through the module-level OpenAI client.

	Parameters:
		file_name: path of the file to upload; it is opened in binary mode.
		purpose: value forwarded as the 'purpose' argument of client.files.create.

	Returns:
		str: the 'id' attribute of the response returned by client.files.create.
	'''
	with open( file_name, 'rb' ) as handle:
		uploaded = client.files.create( file=handle, purpose=purpose )
	file_id = uploaded.id
	return file_id

In [None]:
# Upload both datasets with the 'fine-tune' purpose and keep the returned file ids.
training_file_id = upload_file( file_name=training_file_name, purpose='fine-tune' )
validation_file_id = upload_file( file_name=validation_file_name, purpose='fine-tune' )

In [None]:
# Base model to fine-tune. OpenAI model identifiers use the 'gpt-' prefix;
# 'openai-4o-mini-2024-07-18' is not a valid model name.
MODEL = 'gpt-4o-mini-2024-07-18'

# Start the fine-tuning job from the two uploaded files; 'suffix' is passed
# through to the API as given.
response = client.fine_tuning.jobs.create(
	training_file=training_file_id,
	validation_file=validation_file_id,
	model=MODEL,
	suffix='recipe-ner',
)

# Keep the job id so the job can be referenced later.
job_id = response.id

# Text Cleaning Pipeline

<module 'tigrr' from 'C:\\Users\\terry\\source\\repos\\Boo\\src\\tigrr.py'>

## Preprocessing

In [None]:
# Preview up to the first ten entries. Slicing stops early instead of raising
# IndexError when fewer than ten are available.
for entry in lines[ :10 ]:
	print( entry )

In [331]:
# Destination file inside the 'Chunked' folder, named after the current source file.
new = r'C:\Users\terry\Desktop\Text\Chunked'  + '\\' + filename

# One '{ "<index>" : "<text>" },' record per entry of lines, where the text is
# the entry's items joined by single spaces.
# NOTE(review): each record ends with a bare '\r' — confirm that is the
# intended line terminator.
# NOTE(review): the text is not escaped, so a double quote inside an entry
# will break the record — verify against the consumer of this file.
processed = [
	'{ ' + f'"{i}"' + ' : ' + '"' + ' '.join( c ) + '"' + ' },' + '\r'
	for i, c in enumerate( lines )
]

# The with-block closes the file even if a write fails.
with open( new, 'wt+' ) as folder:
	folder.writelines( processed )

In [195]:
# Import tigrr and reload it so the module is re-executed in this session.
from importlib import reload
import tigrr as tgr
reload( tgr )
# Import Text after the reload so the name is bound from the reloaded module.
from tigrr import Text

In [233]:
# Source and destination folders for the text-cleaning step.
# src_text / dest_cleaned point into the Budget\Guidance\Regulations tree;
# src / dest point into the Desktop\Test tree and are the pair passed to
# clean_files in a later cell.
src_text = r'C:\Users\terry\Desktop\Budget\Guidance\Regulations\Text'
src = r'C:\Users\terry\Desktop\Test\Text'
dest_cleaned = r'C:\Users\terry\Desktop\Budget\Guidance\Regulations\Cleaned'
dest = r'C:\Users\terry\Desktop\Test\Cleaned'

In [24]:
# Instantiate Text (imported from the tigrr module) with no arguments.
txtr = Text( )

In [435]:
def clean_files( src: str, dest: str ) -> None:
	'''
	Clean every file in a folder and write the result under the same name
	in another folder.

	Each file is read as UTF-8 (undecodable bytes are ignored) and split with
	split_sentences. Every sentence other than a single space is lower-cased
	and passed through remove_special and clean_space. The cleaned sentences
	are joined with single spaces and written to the destination file.

	Parameters:
		src: folder containing the files to clean.
		dest: folder that receives the cleaned files.

	Returns:
		None. Any exception (including a missing argument) is caught and
		reported with print, which also stops processing of remaining files.
	'''
	try:
		if src is None:
			raise ValueError( 'The argument "src" is required.' )
		if dest is None:
			raise ValueError( 'The argument "dest" is required.' )

		for name in os.listdir( src ):
			source_path = os.path.join( src, name )
			# Skip sub-folders: opening one would raise and abort the whole run.
			if not os.path.isfile( source_path ):
				continue

			# Context managers close both handles even when a step fails.
			with open( source_path, 'r', encoding='utf-8', errors='ignore' ) as reader:
				text = reader.read( )

			processed = [
				clean_space( remove_special( s.lower( ) ) )
				for s in split_sentences( text )
				if s != ' '
			]

			dest_path = os.path.join( dest, name )
			with open( dest_path, 'wt', encoding='utf-8', errors='ignore' ) as writer:
				writer.write( ' '.join( processed ) )
	except Exception as e:
		print( "The 'clean_files' function raised an exception:", e )

In [442]:
# Clean every file found in src and write the results into dest.
clean_files( src, dest )

In [152]:
# Excel workbooks holding the fine-tuning datasets.
a11 = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\a11.xlsx'
cfr31 = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\cfr31.xlsx'
fastbook = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\fastbook.xlsx'
redbook = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\redbook.xlsx'
ledger = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\ledger.xlsx'
# System prompt text. The language list previously read 'S L', a garbled 'SQL'.
# The literal ends with a newline because the closing quotes sit on their own line.
instructions = '''You are the most knowledgeable Budget Analyst in the federal government who provides detailed responses based on your vast knowledge of budget legislation, and federal appropriations. Your responses to questions about federal finance are complete, transparent, and very detailed using an academic format. Your vast knowledge of and experience in Data Science makes you the best Data Analyst in the world. You are also an expert programmer who is proficient in C#, Python, SQL, C++, JavaScript, and VBA. You are famous for the accuracy of your responses so you verify all your answers. This makes the quality of your code very high and it always works. Your responses are always accurate and complete! Your name is Bubba.
'''



In [None]:
import json


def _json_string( text ) -> str:
	'''Return text as a quoted, escaped JSON string literal.'''
	return json.dumps( str( text ), ensure_ascii=False )


# Fragments for assembling one chat-format JSONL line by concatenation.
# Every content value goes through _json_string so that quotes, backslashes
# and newlines (the instructions text ends with one) are escaped. Previously
# the system content was inserted without any quotes and Q / A were inserted
# raw, so the assembled line was not valid JSON.
system = '{"messages": [{' + f'"role": "system", "content": {_json_string( instructions )}' + '},{'
prologue =  system + f'"role": "user", "content": {_json_string( Q )}' + '},'
# NOTE(review): unlike 'answer', this fragment has no opening '{' — confirm
# how it is meant to be combined with the others.
question =  f'"role": "user", "content": {_json_string( Q )}' + '},'
answer = '{' + f'"role": "assistant", "content": {_json_string( A )} ' + '}]}'
end = ']}'

In [165]:
# Load the 'Training' sheet of the A-11 workbook.
xl_omba11 = pd.read_excel( a11, sheet_name='Training' )
names = [ 'ID', 'Item', 'Role', 'Content' ]
idx = xl_omba11.index
# Keep only the four named columns, on the sheet's original row index.
df_omba11 = pd.DataFrame( data=xl_omba11, columns=names, index=idx  )
# reset_index( ) turns the old row index into a leading column before 'ID'
# becomes the index. The positional .iloc[ r, 2 ] / .iloc[ r, 3 ] lookups in the
# preview loop depend on that layout — presumably
# [ index, Item, Role, Content ]; verify before reordering these calls.
df_omba11 = df_omba11.reset_index( ).set_index( 'ID' )
# Number of rows loaded from the sheet.
omb_rows = len( df_omba11  )

In [None]:
# Load the 'Training' sheet of the cfr31 workbook and record its row count.
xl_cfr31 = pd.read_excel( io=cfr31, sheet_name='Training' )
df_cfr31 = pd.DataFrame( data=xl_cfr31 )
cfr_rows = df_cfr31.shape[ 0 ]

In [168]:
# Print the first 25 records as '{ "<col 2>" ; "<col 3>" },' lines; in the
# captured output these positions hold the role and the message content.
for r in range( 25 ):
	role = df_omba11.iloc[ r, 2 ]
	content = df_omba11.iloc[ r, 3 ]
	print( '{' + f' "{role}" ; "{content}" ' + '},' )

{ "user" ; "What is the title and number of this OMB Circular?" },
{ "assistant" ; "The title is “Preparation, Submission, and Execution of the Budget” and its number is A–11." },
{ "user" ; "Which Executive Branch office issued this Circular, and when?" },
{ "assistant" ; "It was issued by the Executive Office of the President, Office of Management and Budget, in December 2019. " },
{ "user" ; "What is the primary subject covered by Circular A–11?" },
{ "assistant" ; " Circular A–11 provides guidance on the preparation, submission, and execution of the Federal budget. " },
{ "user" ; "Which high-level offices does the cover page explicitly name as responsible for Circular A–11?" },
{ "assistant" ; "The cover page specifically names the Executive Office of the President and the Office of Management and Budget as its issuers." },
{ "user" ; "Who issued the June 28, 2019 transmittal memorandum for Circular A–11, and what is its number?" },
{ "assistant" ; "It was issued by Russell T. Vou

In [None]:
# Load the 'Training' sheet of the redbook workbook and record its row count.
xl_redbook = pd.read_excel( io=redbook, sheet_name='Training' )
df_redbook = pd.DataFrame( data=xl_redbook )
redbook_rows = len( df_redbook.index )

In [None]:
# Load the 'Training' sheet of the ledger workbook and record its row count.
xl_ledger = pd.read_excel( io=ledger, sheet_name='Training' )
df_ledger = pd.DataFrame( data=xl_ledger )
ledger_rows = df_ledger.shape[ 0 ]