# NLTK
___


#### 📦 One-Time Setup (NLTK Resources)

##### Load Dependencies

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from collections import Counter
import html
import ipywidgets as widgets, IPython, platform, ipywidgets, jupyterlab
from importlib import reload
import io
from isort.format import remove_whitespace
import numpy as np
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import os
import pickle
from pathlib import Path
import re
import sqlite3
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string
import time
from textblob import Word, TextBlob
import unicodedata

In [3]:
import processing as pc
reload( pc )
from processing import Text


In [4]:
# Hard-coded local paths: one input document plus two working folders.
# NOTE(review): src/dst look like the inputs/outputs of the cleaning and
# chunking cells further down — confirm before running on another machine.
fp = r'C:\Users\terry\Desktop\Test\Text\Balanced Budget and Emergency Deficit Control Act of 1985.txt'
src = r'C:/Users/terry/Desktop/Test/Cleaned/'
dst = r'C:/Users/terry/Desktop/Test/Chunked/'
# Text is imported from the local `processing` module; its behavior is not shown in this notebook.
tx = Text( )

In [5]:
# Step-by-step cleaning chain: every call consumes the result of the previous one.
# The individual Text methods live in `processing` and are not visible here.
text = tx.load_text( fp )
collapsed = tx.collapse_whitespace( text )
compressed = tx.compress_whitespace( collapsed )
normalized = tx.normalize_text( compressed )
encoded = tx.remove_encodings( normalized )
special = tx.remove_special( encoded )
cleaned = tx.remove_fragments( special )
# Whitespace is compressed a second time after the removals above.
recompress = tx.compress_whitespace( cleaned )
# NOTE(review): the name suggests a DataFrame, but the return type of split_sentences is not shown — confirm.
dataframe = tx.split_sentences( recompress )

In [5]:
# Clean the file in one call, split it into sentences, then count word frequencies.
cleaned = tx.clean_file( fp )
sentences = tx.split_sentences( cleaned )
# Re-join the sentences with single spaces so they can be split into word tokens.
items = ' '.join( sentences )
# split( None ) splits on any run of whitespace and drops empty strings.
tokens = items.split( None )
frequency = tx.calculate_frequency_distribution( tokens )
# Bare expression: displayed as the cell output in the notebook.
frequency


Unnamed: 0_level_0,Word,Frequency
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,balanced,14
1,budget,369
2,emergency,20
3,deficit,79
4,control,38
...,...,...
1345,concurred,2
1346,others,1
1347,ments,1
1348,insisted,1


### 1.  Load File

In [None]:
# === Load Raw Text ===
file_path = '<url to file>'
_rawtext = ''


def load_text( file_path ):
	"""
	Read a UTF-8 text file and return its entire contents as one string.

	:param file_path: path of the file to read.
	:return: the decoded file contents.
	"""
	with open( file_path, 'r', encoding='utf-8' ) as handle:
		contents = handle.read( )
	return contents


### 🧮 1. Bag of Words (BoW) using CountVectorizer

In [None]:
	# Two-document toy corpus used to demonstrate raw term counts.
	corpus = [ 'Bro loves clean code.', 'Code is life.' ]
	vectorizer = CountVectorizer( )
	# Learn the vocabulary from the corpus and build the document-term matrix in one step.
	X = vectorizer.fit_transform( corpus )

	# Vocabulary terms, then the matrix converted to a dense array for display.
	print( vectorizer.get_feature_names_out( ) )
	print( X.toarray( ) )


### 📊 2. TF-IDF using TfidfVectorizer

In [None]:
	# Two-document toy corpus used to demonstrate TF-IDF weighting.
	corpus = [ 'Bro writes awesome code.', 'Code must be clean and clear.' ]
	vectorizer = TfidfVectorizer( )
	# Learn the vocabulary and compute the weighted document-term matrix in one step.
	X = vectorizer.fit_transform( corpus )

	# Vocabulary terms, then the matrix converted to a dense array for display.
	print( vectorizer.get_feature_names_out( ) )
	print( X.toarray( ) )


### 🧠 3. Word2Vec using gensim

In [None]:
	# NOTE(review): Word2Vec is not imported in any visible cell — presumably
	# `from gensim.models import Word2Vec`; confirm and add the import.
	# Pre-tokenized toy corpus: one list of tokens per sentence.
	sentences = [ [ 'bro', 'loves', 'python' ], [ 'clean', 'code', 'rocks' ] ]
	model = Word2Vec( sentences, vector_size=100, window=5, min_count=1, workers=4 )

	# Vector for the word 'bro'
	vector = model.wv[ 'bro' ]
	print( vector )


### 🌍 4. GloVe using gensim (with pre-trained vectors)


In [None]:
	# Load GloVe vectors (convert the .txt to word2vec format beforehand if needed).
	# NOTE(review): KeyedVectors is not imported in any visible cell — presumably
	# `from gensim.models import KeyedVectors`; confirm and add the import.
	# NOTE(review): the path points at what looks like a raw GloVe .txt file; confirm it
	# has already been converted, since this loader is given no header-related options.
	glove_file = r'C:\Users\terry\source\llm\glove\glove.6B.100d.txt'
	model = KeyedVectors.load_word2vec_format( glove_file, unicode_errors='ignore' )

	# Vector for the word 'code'
	vector = model[ 'code' ]
	print( vector )


### 🤖 5. BERT / Transformer-based Embeddings using transformers + torch


In [None]:
	# NOTE(review): BertTokenizer and BertModel are not imported in any visible cell —
	# presumably `from transformers import BertTokenizer, BertModel`; confirm.
	tokenizer = BertTokenizer.from_pretrained( 'bert-base-uncased' )
	model = BertModel.from_pretrained( 'bert-base-uncased' )

	sentence = "Bro's code always works."
	# return_tensors='pt' requests PyTorch tensors from the tokenizer.
	inputs = tokenizer( sentence, return_tensors='pt' )
	outputs = model( **inputs )

	# Take position 0 along the second axis of the last hidden state
	# (the [CLS] token) and use it as the sentence embedding.
	sentence_embedding = outputs.last_hidden_state[ :, 0, : ]
	print( sentence_embedding.shape )


#### Clean Document

In [None]:
def clean_text( text: str ) -> str:
	"""
	Flatten extracted document text into a single whitespace-normalized line.

	Line endings are unified to '\\n' first; the regex passes then run in order:
	drop lines holding only a number, re-join words hyphenated across a line
	break, turn single line breaks into spaces, and collapse every whitespace run.

	:param text: raw document text.
	:return: the cleaned text with leading/trailing whitespace removed.
	"""
	result = text.replace( '\r\n', '\n' ).replace( '\r', '\n' )
	# Order matters: each pass works on the output of the one before it.
	passes = (
		( r'\n\s*\d+\s*\n', '\n' ),
		( r'(\w+)-\n(\w+)', r'\1\2' ),
		( r'(?<!\n)\n(?![\n])', ' ' ),
		( r'\s+', ' ' ),
	)
	for pattern, replacement in passes:
		result = re.sub( pattern, replacement, result )
	return result.strip( )




#  OpenAI Embedding
___

##### API key

In [None]:
# Create client
# OpenAI is not imported by any earlier cell, so import it where it is first used.
from openai import OpenAI

# Pass the key at construction time (same pattern as the fine-tuning client
# further down) instead of patching the attribute after the client exists.
client = OpenAI( api_key=os.getenv( 'OPENAI_API_KEY' ) )

#### 1. Define embedding function

In [None]:
def embed_texts( texts, model='text-embedding-3-small', batch_size=10, sleep=1, max_retries=3 ):
	"""
	Embed a list of texts in batches and return one result per input text.

	A failed batch is retried up to `max_retries` times, sleeping `sleep`
	seconds between attempts. If every attempt fails, a None placeholder is
	stored for each text of that batch so the returned list always stays
	index-aligned with `texts` (the previous version silently dropped the
	batch, shifting every later embedding).

	:param texts: sequence of strings to embed.
	:param model: embedding model name passed to the API.
	:param batch_size: number of texts sent per request.
	:param sleep: seconds to wait between retry attempts.
	:param max_retries: attempts per batch before giving up (at least one is made).
	:return: list with len( texts ) entries; each is an embedding or None.
	"""
	embeddings = [ ]
	attempts = max( 1, max_retries )
	for start in range( 0, len( texts ), batch_size ):
		batch = texts[ start:start + batch_size ]
		batch_embeddings = None
		for attempt in range( 1, attempts + 1 ):
			try:
				response = openai.embeddings.create( input=batch, model=model )
				batch_embeddings = [ e.embedding for e in response.data ]
				break
			except Exception as e:
				print( f'Error at batch {start} (attempt {attempt}/{attempts}): {e}' )
				# Back off before the next attempt; no point sleeping after the last one.
				if attempt < attempts:
					time.sleep( sleep )

		if batch_embeddings is None:
			# Keep positions aligned with the inputs even when the batch could not be embedded.
			batch_embeddings = [ None ] * len( batch )
		embeddings.extend( batch_embeddings )

	return embeddings


#### 2. Embed chunks

In [None]:
# 2. Embed chunks
# NOTE(review): `chunks` is not defined in any visible cell — presumably the
# list of text chunks produced by an earlier chunking step; confirm.
embeddings = embed_texts( chunks )

#### 3.  Create DataFrame

In [None]:
# 3. Create DataFrame
# A dict of named columns is required here: the previous `{ chunks, embeddings }`
# was a set literal of two lists, which raises TypeError (lists are unhashable).
# Column names match the local-embeddings DataFrame built later in this notebook.
df_embeddings = pd.DataFrame( { 'chunk_words': chunks, 'embedding': embeddings } )


#### 3. Save


In [None]:
# Save the embeddings table as a Parquet file; index=False leaves the row index out of the file.
df_embeddings.to_parquet( 'public_law_118_32_embeddings.parquet', index=False )


#### 4. Preview

In [None]:
# Preview the first two rows (displayed as the cell output).
df_embeddings.head( 2 )


### 3. Generate Embeddings
- Use a language model (e.g., OpenAI, HuggingFace) to create a vector representation of each chunk.

In [None]:
# SentenceTransformer is only imported in a later cell, so import it here where it is first used.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer( 'all-MiniLM-L6-v2' )
# NOTE(review): `chunks` is not defined in any visible cell — confirm where it comes from.
embeddings = model.encode( chunks, show_progress_bar=True )


### 4. Create SQLite Database

- Design a table that links text chunks to their embeddings.

In [None]:
# Table linking each text chunk to its pickled embedding vector.
sql_create = '''
CREATE TABLE IF NOT EXISTS Law_Embeddings
(
    Id INTEGER PRIMARY KEY AUTOINCREMENT,
    Chunk_Tokens TEXT NOT NULL,
    Embedding BLOB NOT NULL
)
'''

conn = sqlite3.connect( 'vectors.target_values' )
try:
	cursor = conn.cursor( )
	cursor.execute( sql_create )

	for chunk, vector in zip( chunks, embeddings ):
		# Vectors are stored as pickled BLOBs; the retrieval cell unpickles them again.
		blob = pickle.dumps( vector )
		cursor.execute( 'INSERT INTO Law_Embeddings ( Chunk_Tokens, Embedding ) VALUES (?, ?)',
			(chunk, blob) )

	conn.commit( )
finally:
	# Always release the connection, even when an insert fails part-way through.
	conn.close( )


###  Retrieval (Vector Search in SQLite)

- You can perform semantic search by encoding a query and comparing via cosine similarity


In [None]:
def cosine_similarity( a, b ):
	"""
	Return the cosine similarity of two vectors.

	:param a: first vector (anything np.dot / np.linalg.norm accept).
	:param b: second vector with the same length as `a`.
	:return: dot( a, b ) / ( |a| * |b| ), or 0.0 when either vector has zero
		length — an all-zero vector has no direction, and dividing by its norm
		would yield nan plus a runtime warning.
	"""
	denominator = np.linalg.norm( a ) * np.linalg.norm( b )
	if denominator == 0:
		return 0.0
	return np.dot( a, b ) / denominator

In [None]:
# Encode the query with the same model that produced the stored embeddings.
query = 'Appropriations for Department of Defense'
query_vec = model.encode( [ query ] )[ 0 ]

conn = sqlite3.connect( 'vectors.target_values' )
try:
	cursor = conn.cursor( )
	cursor.execute( 'SELECT Id, Chunk_Tokens, Embedding FROM Law_Embeddings' )

	results = [ ]
	for row in cursor.fetchall( ):
		chunk_id, chunk_text, blob = row
		# SECURITY: pickle.loads can execute arbitrary code — only read
		# database files that this notebook created itself.
		stored_vec = pickle.loads( blob )
		sim = cosine_similarity( query_vec, stored_vec )
		results.append( (sim, chunk_text) )
finally:
	# The connection was previously left open after the query.
	conn.close( )

# Sort by similarity (highest first) and keep the top 5
top_matches = sorted( results, key=lambda x: x[ 0 ], reverse=True )[ :5 ]


#   Embedding-Pipeline Script
___

##### Load Dependencies



In [None]:
import re
import sqlite3
import numpy as np
from tqdm import tqdm
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter


#### Configuration

In [None]:
# Define paths and pipeline settings
TEXT_FILE = 'PublicLaw_118-42.txt'  # input document read by main( )
DB_FILE = 'law_embeddings.target_values'  # SQLite file written by main( )
# NOTE(review): EMBEDDING_MODEL, CHUNK_SIZE and CHUNK_OVERLAP are not referenced by any
# visible cell — presumably consumed by the chunking helper that is not shown; confirm.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


#### Load and Clean Raw Text

In [None]:
def load_and_clean_text( filepath ):
	"""
	Read a text file and flatten its layout whitespace.

	Undecodable bytes are ignored while reading. Runs of form feeds, runs of
	newlines, and runs of two or more whitespace characters are each replaced
	by a single space, in that order.

	:param filepath: path of the UTF-8 text file to read.
	:return: the normalized text.
	"""
	with open( filepath, 'r', encoding='utf-8', errors='ignore' ) as source:
		contents = source.read( )

	# Basic normalization: every pattern collapses to one space; order is significant.
	for pattern in ( r'\f+', r'\n+', r'\s{2,}' ):
		contents = re.sub( pattern, ' ', contents )
	return contents



#### Generate Embeddings

In [None]:
def get_embedding( text, model='text-embedding-3-small' ):
	"""
	Request the embedding vector for one piece of text from the OpenAI API.

	The default model was previously the undefined name OPENAI_MODEL; it now
	matches the default used by embed_texts in this notebook, whose 1536
	dimensions also match the placeholder size used by embed_chunks.

	:param text: the text to embed.
	:param model: OpenAI embedding model name.
	:return: the embedding as a list of floats.
	"""
	# The v1 SDK exposes embeddings via `embeddings.create`; the vector is an
	# attribute of the first item in the response's `data` list.
	response = openai.embeddings.create( input=text, model=model )
	return response.data[ 0 ].embedding


def embed_chunks( chunks ):
	"""
	Embed every chunk via get_embedding, showing a tqdm progress bar.

	A chunk whose request raises is reported on stdout and replaced by a
	1536-element zero vector, so the result has one entry per input chunk.

	:param chunks: iterable of text chunks.
	:return: list of embeddings in input order.
	"""
	placeholder_size = 1536
	vectors = [ ]
	progress = tqdm( chunks, desc='EmbeddingRequest chunks via OpenAI' )
	for piece in progress:
		try:
			vectors.append( get_embedding( piece ) )
		except Exception as e:
			print( f'Error embedding chunk_words: {e}' )
			# Placeholder for failed requests
			vectors.append( [ 0.0 ] * placeholder_size )
	return vectors


##### Create SQLite DB



In [None]:
def create_store( chunks, embeddings, db_path ):
	"""
	Persist chunks and their embeddings into the Law_Embeddings SQLite table.

	The table is created if it does not exist. Each embedding is pickled into
	a BLOB next to its chunk text. Pairs are formed with zip, so surplus items
	in the longer input are ignored.

	:param chunks: iterable of chunk texts.
	:param embeddings: iterable of embedding vectors, parallel to `chunks`.
	:param db_path: path of the SQLite database file.
	"""
	sql_create = '''
    CREATE TABLE IF NOT EXISTS Law_Embeddings
    (
        Id INTEGER PRIMARY KEY AUTOINCREMENT,
        Chunk_Tokens TEXT NOT NULL,
        Embedding BLOB NOT NULL
    )
    '''
	sql_insert = 'INSERT INTO Law_Embeddings ( Chunk_Tokens, Embedding ) VALUES ( ?, ? )'

	conn = sqlite3.connect( db_path )
	try:
		cursor = conn.cursor( )
		cursor.execute( sql_create )
		rows = ( (chunk, pickle.dumps( vector )) for chunk, vector in zip( chunks, embeddings ) )
		cursor.executemany( sql_insert, rows )
		conn.commit( )
	finally:
		# Close the connection even when an insert fails, so the file is not left locked.
		conn.close( )


#### Script

In [None]:
# === MAIN ===
def main( ):
	"""
	Run the pipeline end to end: load and clean TEXT_FILE, chunk it, embed the
	chunks, and store chunks plus embeddings in DB_FILE, printing each step.
	"""
	print( 'Step 1: Load and clean documents' )
	document = load_and_clean_text( TEXT_FILE )

	print( 'Step 2: Chunking documents' )
	pieces = chunk_text( document )
	total = len( pieces )
	print( f'Total chunks: {total}' )

	print( 'Step 3: EmbeddingRequest with OpenAI API' )
	vectors = embed_chunks( pieces )

	print( 'Step 4: Saving to SQLite' )
	create_store( pieces, vectors, DB_FILE )

	print( f'Pipeline complete. Embeddings stored in: {DB_FILE}' )


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
	main( )

#### Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd


# === 1. Load Model ===
# Other options to try: 'all-mpnet-base-v2' or 'multi-qa-MiniLM-L6-cos-v1'.
# The model is loaded by name via the sentence_transformers import at the top of this cell.
model = SentenceTransformer( 'all-MiniLM-L6-v2' )


# === 2. Embed Chunks ===
def embed_with_sentence_transformers( texts, model ):
	"""
	Encode `texts` with the given model, showing a progress bar and asking
	for NumPy output.

	:param texts: the texts to encode.
	:param model: object exposing an `encode` method.
	:return: whatever `model.encode` returns for these options.
	"""
	encode_options = { 'show_progress_bar': True, 'convert_to_numpy': True }
	vectors = model.encode( texts, **encode_options )
	return vectors


# NOTE(review): `chunks` is not defined in any visible cell — confirm where it comes from.
local_embeddings = embed_with_sentence_transformers( chunks, model )

# === 3. Save in a DataFrame ===
# One row per chunk: the chunk text next to its embedding vector.
df_local = pd.DataFrame(
{
	'chunk_words': chunks,
	'embedding': list( local_embeddings )  # numpy arrays to a list for DataFrame compatibility
} )

# === 4. Save to Disk ===
# index=False leaves the row index out of the Parquet file.
df_local.to_parquet( 'public_law_118_32_local_embeddings.parquet', index=False )

# === 5. Preview ===
# First two rows, displayed as the cell output.
df_local.head( 2 )



## Fine-Tuning

In [None]:
import json
import openai
import os
import pandas as pd
from pprint import pprint


In [None]:

# API client for the fine-tuning cells; the key is read from the OPENAI_API_KEY environment variable.
# The organization and project values are placeholders to fill in before running.
client = openai.OpenAI(
	api_key=os.environ.get( 'OPENAI_API_KEY' ),
	organization='<org id>',
	project='<project id>',
)

In [None]:
# Read in the dataset we'll use for this task.
# This is the RecipesNLG dataset, filtered to only contain documents from www.cookbooks.com
recipe_df = pd.read_csv( r'C:\Users\terry\Desktop\cookbook_recipes_nlg_10k.csv' )
# First five rows, displayed as the cell output.
recipe_df.head( )

In [None]:
# System prompt shared by every training / validation example.
system_message = 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'

def create_user_message( row ):
	"""
	Build the user prompt for one recipe.

	:param row: mapping with 'title' and 'ingredients' entries.
	:return: the prompt text ending in 'Generic ingredients: '.
	"""
	# The f-string is double-quoted so the single-quoted keys inside the braces
	# also parse on Python versions before 3.12.
	return f"Title: {row[ 'title' ]}\n\nIngredients: {row[ 'ingredients' ]}\n\nGeneric ingredients: "


def prepare_example_conversation( row ):
	"""
	Build one chat-format fine-tuning example from a recipe row.

	:param row: mapping with 'title', 'ingredients' and 'NER' entries.
	:return: dict with a 'messages' list holding the system, user and assistant
		messages in that order; the assistant content is row[ 'NER' ].
	"""
	return {
		'messages': [
			{
				'role': 'system',
				'content': system_message
			},
			{
				'role': 'user',
				'content': create_user_message( row )
			},
			{
				'role': 'assistant',
				'content': row[ 'NER' ]
			},
		]
	}



In [None]:
# Use rows labelled 0 through 100 of the dataset for training.
# NOTE: .loc slices include both endpoints, so this selects 101 rows, not 100.
training_df = recipe_df.loc[ 0:100 ]

# apply the prepare_example_conversation function to each row of the training_df
training_data = training_df.apply( prepare_example_conversation, axis=1 ).tolist( )

# Show the first five examples as a sanity check.
for example in training_data[ :5 ]:
	print( example )

In [None]:
# Rows labelled 101 through 200 (both endpoints included) form the validation set;
# the range starts right after the last training row.
validation_df = recipe_df.loc[ 101:200 ]
validation_data = validation_df.apply(
	prepare_example_conversation, axis=1 ).tolist( )

In [None]:
def write_jsonl( data: list[ dict ], filename: str ) -> None:
	"""
	Write `data` to `filename` in JSON Lines format (one JSON object per line).

	The annotation uses the builtin generics because `List` / `Dict` are never
	imported in this notebook, which made the original definition fail with a
	NameError.

	:param data: the records to serialize.
	:param filename: destination path; an existing file is overwritten.
	"""
	# Explicit encoding keeps the file independent of the platform default.
	with open( filename, 'w', encoding='utf-8' ) as out:
		out.writelines( json.dumps( kvp ) + '\n' for kvp in data )

In [None]:


# Write the validation examples to a JSONL file for upload.
# NOTE(review): no visible cell writes `training_data` to a file the same way — confirm
# the training file is produced somewhere before the fine-tuning job is created.
validation_file_name = 'tmp_recipe_finetune_validation.jsonl'
write_jsonl( validation_data, validation_file_name )

In [None]:
def upload_file( file_name: str, purpose: str ) -> str:
	"""
	Upload a local file through the module-level `client` and return its id.

	:param file_name: path of the file to upload; opened in binary mode.
	:param purpose: value forwarded as the `purpose` of the upload.
	:return: the `id` attribute of the API response.
	"""
	with open( file_name, 'rb' ) as handle:
		uploaded = client.files.create( file=handle, purpose=purpose )
	file_id = uploaded.id
	return file_id

In [None]:
# 'openai-4o-mini-2024-07-18' is not a model id; the fine-tunable snapshot is named with the 'gpt-' prefix.
MODEL = 'gpt-4o-mini-2024-07-18'

# The job needs uploaded file ids, but no visible cell writes the training file or
# defines training_file_id / validation_file_id, so do both here.
# NOTE(review): drop these four lines if the ids are already produced elsewhere.
training_file_name = 'tmp_recipe_finetune_training.jsonl'
write_jsonl( training_data, training_file_name )
training_file_id = upload_file( training_file_name, 'fine-tune' )
validation_file_id = upload_file( validation_file_name, 'fine-tune' )

response = client.fine_tuning.jobs.create(
	training_file=training_file_id,
	validation_file=validation_file_id,
	model=MODEL,
	suffix='recipe-ner',
)

# Keep the job id so the job can be referred to later.
job_id = response.id

# Text Cleaning Pipeline

## Preprocessing

In [None]:
# Preview the first ten entries of `lines` (raises IndexError if it holds fewer than ten).
# NOTE(review): `lines` is not defined in any visible cell — confirm where it comes from.
for i in range( 10 ):
	print( lines[ i ] )

In [None]:
# NOTE(review): `filename` and `lines` are not defined in any visible cell — confirm their source.
new = r'C:\Users\terry\Desktop\Text\Chunked'  + '\\' + filename

# Build every output line first so a failure here cannot leave a truncated,
# still-open file behind.
processed = [ ]
for i, c in enumerate( lines ):
	part = ' '.join( c )
	# NOTE(review): records end in a bare '\r' and `part` is not JSON-escaped — confirm
	# that the consumer of this file expects exactly this format.
	line = '{ ' + f'"{i}"' + ' : ' + '"' + part + '"' + ' },' + '\r'
	processed.append( line )

# The context manager closes the file even if a write fails.
with open( new, 'wt+' ) as folder:
	folder.writelines( processed )

## Clean Files

In [None]:
def clean_files( src: str, dest: str ) -> None:
	"""
	Clean every file in `src` and write the result under the same name in `dest`.

	Each file is split into sentences; every sentence other than a single
	space is lower-cased and passed through remove_special and clean_space,
	and the results are joined with spaces. Sub-directories are skipped.
	Any exception is reported on stdout instead of being raised.

	:param src: folder holding the input files.
	:param dest: folder receiving the cleaned files.
	"""
	try:
		if src is None:
			raise Exception( 'The argument "src" is required.' )
		elif dest is None:
			raise Exception( 'The argument "dest" is required.' )

		for filename in os.listdir( src ):
			source_path = os.path.join( src, filename )
			if not os.path.isfile( source_path ):
				continue

			# Context managers close both files; the handles used to be left open.
			with open( source_path, 'r', encoding='utf-8', errors='ignore' ) as reader:
				text = reader.read( )

			processed = [ ]
			for s in split_sentences( text ):
				if s != " ":
					processed.append( clean_space( remove_special( s.lower( ) ) ) )

			dest_path = os.path.join( dest, filename )
			with open( dest_path, 'wt', encoding='utf-8', errors='ignore' ) as writer:
				writer.write( ' '.join( processed ) )
	except Exception as e:
		print( "The 'clean_files' function raised an exception:", e )

## Clean Text Files

In [None]:
def clean_text_files( src: str, dest: str ) -> None:
	"""
	Normalize every file in `src` and write the result under the same name in `dest`.

	Newlines are replaced by spaces, the text is passed through normalize,
	each resulting line is lower-cased and passed through remove_special, and
	the joined result goes through clean_text before it is written.
	Sub-directories are skipped. Any exception is reported on stdout instead
	of being raised.

	:param src: folder holding the input files.
	:param dest: folder receiving the cleaned files.
	"""
	try:
		if src is None:
			raise Exception( 'The argument "src" is required.' )
		elif dest is None:
			raise Exception( 'The argument "dest" is required.' )

		# NOTE(review): ' ' is not in this list, so `s in keepers` below can never
		# change the outcome of the filter — confirm the intended condition.
		keepers = [ '$', 'in', '(', ')', '', 'the', '. ', ': ', '; ', 'and', 'but', 'be', 'was', 'what', 'for' ]
		for filename in os.listdir( src ):
			source_path = os.path.join( src, filename )
			if not os.path.isfile( source_path ):
				continue

			# Context managers close both files; the handles used to be left open.
			with open( source_path, 'r', encoding='utf-8', errors='ignore' ) as reader:
				text = reader.read( )

			collapse = text.replace( '\n', ' ' )
			normal = normalize( collapse )
			processed = [ ]
			for s in normal.splitlines( ):
				if s != " " or s in keepers:
					processed.append( remove_special( s.lower( ) ) )

			# Finish all processing before the destination is opened, so a failure
			# cannot leave an empty output file behind.
			final = clean_text( ' '.join( processed ) )
			dest_path = os.path.join( dest, filename )
			with open( dest_path, 'wt', encoding='utf-8', errors='ignore' ) as writer:
				writer.write( final )
	except Exception as e:
		# The message used to name 'clean_files', pointing at the wrong function.
		print( "The 'clean_text_files' function raised an exception:", e )



## Chunk Files

In [None]:
def chunk_files( src: str, dest: str ) -> None:
	"""
	Chunk every file in `src` and write one '.jsonl'-named file per input to `dest`.

	Each file is passed through split_sentences, remove_fragments and
	chunk_pages; every chunk becomes one '{"Line-<i>" : "<text>"},' record.
	Sub-directories are skipped. Any exception is reported on stdout instead
	of being raised.

	:param src: folder holding the input files.
	:param dest: folder receiving the chunked files.
	"""
	try:
		if src is None:
			raise Exception( 'The argument "src" is required.' )
		elif dest is None:
			raise Exception( 'The argument "dest" is required.' )

		for filename in os.listdir( src ):
			source_path = os.path.join( src, filename )
			if not os.path.isfile( source_path ):
				continue

			# Context managers close both files; the handles used to be left open.
			with open( source_path, 'r', encoding='utf-8', errors='ignore' ) as reader:
				text = reader.read( )

			sentences = split_sentences( text )
			fragments = remove_fragments( sentences )
			chunks = chunk_pages( fragments )
			processed = [ ]
			for i, c in enumerate( chunks ):
				words = ''.join( c ).strip( )
				# NOTE(review): records end in a bare '\r' and `words` is not JSON-escaped —
				# confirm that the consumer expects exactly this format.
				processed.append( '{' + f'"Line-{i}"' + ' : "' + words + '"},\r' )

			name = filename.replace( '.txt', '.jsonl' )
			dest_path = os.path.join( dest, name )
			with open( dest_path, 'wt', encoding='utf-8', errors='ignore' ) as writer:
				writer.writelines( processed )
	except Exception as e:
		print( "The 'chunk_files' function raised an exception:", e )


In [None]:
# The destination folder is defined as `dst` near the top of the notebook;
# `dest` is never assigned at module level and raised a NameError here.
chunk_files( src, dst )

# Fine-Tuning Pipeline

#### Datasets

In [None]:
# Define Datasets
# Hard-coded local Excel workbooks; the cells below read the 'Training' sheet of omb, cfr31,
# redbook and ledger. No visible cell reads `fastbook`.
omb = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\omb.xlsx'
cfr31 = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\cfr31.xlsx'
fastbook = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\fastbook.xlsx'
redbook = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\redbook.xlsx'
ledger = r'C:\Users\terry\Desktop\AI\fine-tuning\datasets\ledger.xlsx'

#### System Instructions

In [None]:
# Set System Instructions
# The language list previously read "S L", a garbled "SQL".
# NOTE: the text ends with a newline because the closing quotes sit on their own line.
instructions = '''You are the most knowledgeable Budget Analyst in the federal government who provides detailed responses based on your vast knowledge of budget legislation, and federal appropriations. Your responses to questions about federal finance are complete, transparent, and very detailed using an academic format. Your vast knowledge of and experience in Data Science makes you the best Data Analyst in the world. You are also an expert programmer who is proficient in C#, Python, SQL, C++, JavaScript, and VBA. You are famous for the accuracy of your responses so you verify all your answers. This makes the quality of your code very high and it always works. Your responses are always accurate and complete! Your name is Bubba.
'''

#### Messages


In [None]:
# Set Messages
# Hand-assembled fragments of a chat-format JSON record: an opening half that ends in '},{'
# is meant to be concatenated with `answer`, which closes the record.
# NOTE(review): `Q` and `A` are not defined in any visible cell — confirm where they come from.
# NOTE(review): values are interpolated without JSON escaping, and `instructions` ends with
# a newline, so the result may not be valid JSON — confirm before using these strings.
system = '{"messages":[{' + f'"role":"system", "content":"{instructions}"' + '},{'
initial = system + f' "role":"user", "content":"{Q}"' + '},{'
question = '{"messages":[{' + f'"role": "user",  "content":"{Q}"' + '},{'
answer = f' "role":"assistant", "content":"{A}"' + '}]}'


In [None]:
# Print JSON
# Print one user/assistant record for each of the first 25 rows of df_ledger.
# NOTE(review): df_ledger is only built in a later cell, where it ends up with three columns
# (positions 0-2); column position 3 would then be out of range — confirm the intended columns.

for r in range( 25 ):
	# Opening half: the user message, taken from column position 2.
	question = '{"messages":[{' + f'"role":"user", "content":"{df_ledger.iloc[ r, 2 ]}"' + '},{'
	# Closing half: the assistant message, taken from column position 3.
	answer = f'"role":"assistant", "content": "{df_ledger.iloc[ r, 3 ]}" ' + '}]}'
	record = question + answer
	print( record  )

#### A-11 Data

In [None]:
# Define A-11 Data
names = [ 'ID', 'Item', 'Role', 'Content' ]
xl_a11 = pd.read_excel( omb, sheet_name='Training' )
a11_idx = xl_a11.index
# Keep only the named columns and key the frame by 'ID'.
df_a11 = pd.DataFrame( data=xl_a11, columns=names ).set_index( 'ID' )
omb_rows = df_a11.shape[ 0 ]
# Row count, displayed as the cell output.
omb_rows

In [None]:
# View Dataframe (bare expression, displayed as the cell output)
df_a11

In [None]:
# Print JSON
# Emit one chat-format record per question/answer pair. A record is printed only
# when an 'A' row completes a pending 'Q' row; the old loop printed on every row,
# repeating the previous record (or failing on the first row).
# json.dumps escapes quotes and newlines in the content, keeping every line valid JSON.
pending_question = None
for _, row in df_a11.iterrows( ):
	message = { 'role': str( row[ 'Role' ] ), 'content': str( row[ 'Content' ] ) }
	if row[ 'Item' ] == 'Q':
		pending_question = message
	elif row[ 'Item' ] == 'A' and pending_question is not None:
		record = json.dumps( { 'messages': [ pending_question, message ] }, ensure_ascii=False )
		print( record )
		# An answer closes its question; a stray second answer is not paired with it again.
		pending_question = None

#### CFR Data

In [None]:
# Define CFR Data
names = [ 'ID', 'Item', 'Role', 'Content' ]
xl_cfr31 = pd.read_excel( cfr31, sheet_name='Training' )
cfr_index = xl_cfr31.index
# Keep only the named columns and key the frame by 'ID'.
df_cfr31 = pd.DataFrame( data=xl_cfr31, columns=names ).set_index( 'ID' )
cfr_rows = df_cfr31.shape[ 0 ]
# Row count, displayed as the cell output.
cfr_rows

In [None]:
# View Dataframe (bare expression, displayed as the cell output)
df_cfr31

In [None]:
# Print JSON
# Column position 1 holds the role and position 2 the content. A record is printed
# only when an assistant row completes a pending user row; the old loop built a
# record on every row, pairing each new question with a stale (or undefined) answer.
# json.dumps escapes quotes and newlines in the content, keeping every line valid JSON.
pending_question = None
for role, content in zip( df_cfr31.iloc[ :, 1 ], df_cfr31.iloc[ :, 2 ] ):
	message = { 'role': str( role ), 'content': str( content ) }
	if role.startswith( 'u' ):
		pending_question = message
	elif role.startswith( 'a' ) and pending_question is not None:
		record = json.dumps( { 'messages': [ pending_question, message ] }, ensure_ascii=False )
		print( record )
		# An answer closes its question; a stray second answer is not paired with it again.
		pending_question = None



#### Redbook Data

In [None]:
# Define Redbook Data
names = [ 'ID', 'Item', 'Role', 'Content' ]
xl_redbook = pd.read_excel( redbook, sheet_name='Training' )
red_inx = xl_redbook.index
# Keep only the named columns and key the frame by 'ID'.
df_redbook = pd.DataFrame( data=xl_redbook, columns=names ).set_index( 'ID' )
red_rows = df_redbook.shape[ 0 ]
# Row count, displayed as the cell output.
red_rows

In [None]:
# View Dataframe (bare expression, displayed as the cell output)
df_redbook

In [None]:
# Print JSON
# Column position 1 holds the role and position 2 the content. A record is printed
# only when an assistant row completes a pending user row; the old loop built a
# record on every row, pairing each new question with a stale (or undefined) answer.
# json.dumps escapes quotes and newlines in the content, keeping every line valid JSON.
pending_question = None
for role, content in zip( df_redbook.iloc[ :, 1 ], df_redbook.iloc[ :, 2 ] ):
	message = { 'role': str( role ), 'content': str( content ) }
	if role.startswith( 'u' ):
		pending_question = message
	elif role.startswith( 'a' ) and pending_question is not None:
		record = json.dumps( { 'messages': [ pending_question, message ] }, ensure_ascii=False )
		print( record )
		# An answer closes its question; a stray second answer is not paired with it again.
		pending_question = None

#### Ledger Data

In [None]:
# Define Ledger Data
names = [ 'ID', 'Item', 'Role', 'Content' ]
xl_ledger = pd.read_excel( ledger, sheet_name='Training' )
ldgr_inx = xl_ledger.index
# Keep only the named columns (same row index as the sheet) and key the frame by 'ID'.
df_ledger = pd.DataFrame( data=xl_ledger, columns=names, index=ldgr_inx ).set_index( 'ID' )
ledger_rows = df_ledger.shape[ 0 ]
# Row count, displayed as the cell output.
ledger_rows

In [None]:
# View Dataframe (bare expression, displayed as the cell output)
df_ledger

In [None]:
# List the column labels of df_ledger, one per line.
for i in df_ledger.columns:
	print( i )

In [None]:
# Print JSON
# Column position 1 holds the role and position 2 the content. The assistant branch
# used to test df_cfr31 instead of df_ledger (copy-paste slip), and a record was built
# on every row; now one is printed only when an assistant row completes a pending user row.
# json.dumps escapes quotes and newlines in the content, keeping every line valid JSON.
pending_question = None
for role, content in zip( df_ledger.iloc[ :, 1 ], df_ledger.iloc[ :, 2 ] ):
	message = { 'role': str( role ), 'content': str( content ) }
	if role.startswith( 'u' ):
		pending_question = message
	elif role.startswith( 'a' ) and pending_question is not None:
		record = json.dumps( { 'messages': [ pending_question, message ] }, ensure_ascii=False )
		print( record )
		# An answer closes its question; a stray second answer is not paired with it again.
		pending_question = None