# Reading and Cleaning Text

In [28]:
# pdf_file = "./doc/1706.03762.pdf"
pdf_file = "./doc/2005.11401.pdf"
text_file = "./doc/textfile.txt"
brian_pdf = "./doc/Brian's_Resume.pdf"
opt_workshop = "./doc/opt_workshop_umbc.pdf"

In [2]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

In [29]:
unclean_text = extract_text_from_pdf(opt_workshop)
unclean_text

'OPT\nA step-by-step guide\n\nWhat is OPT?\n▪Optional Practical Training\n▪Work permission for 1 year (with option to extend 2 years for STEM degrees)\n▪Purpose is to gain experience in your field\n▪One year of OPT per degree level\n▪Must be progressively higher (i.e. bachelor → master → doctoral)\n\nOPT Application Process\nSubmit through \nUSCIS website\n\nDon’t I need a job to apply for OPT?\nNO!\n▪Why not??\n▪USCIS can take more than three months to review your OPT application\n▪It would be WAY TOO LATE to apply if you waited to find a job first\n\nWhen can I apply?\n▪Apply for OPT during your last \nsemester at UMBC\n▪The earliest you can apply for OPT \nis 90 days before your I-20 end date\n▪The latest you can apply for OPT is \n60 days after your I-20 end date\nSpring grads\nFall grads\nEarliest date\nFeb 25\nSept 25\nI-20 end date\nMay 25\nDec 23\nLatest date\nJuly 24\nFeb 20\nApproximate OPT timeline by grad term\n\nAnother way to think about it\nI-20 end date\n90 days\n60-day

In [30]:
import re
import unicodedata

def clean_text_(text):
    # text = text.replace("\n", " ")  # Replace newlines with spaces
    text = re.sub(r'\s+', ' ', str(text))  # Remove extra spaces
    return text.strip()  # Trim leading and trailing spaces

def remove_special_chars(text):
    text = re.sub(r'[^a-zA-Z0-9.,!?\'" ]', '', text)  # Keep letters, numbers, and common punctuation
    return text

def fix_hyphenation(text):
    return re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)  # Removes hyphenation across lines

def normalize_unicode(text):
    return unicodedata.normalize("NFKD", text)

def remove_headers_footers(text):
    lines = text.split("\n")
    cleaned_lines = [line for line in lines if not re.match(r'(Page \d+|Confidential|Company Name)', line)]
    return " ".join(cleaned_lines)

def normalize_text(text):
    return " ".join(text.lower().split())  # Lowercase and remove extra spaces

def full_text_cleanup(text):
    """"
    Takes in unclean text and return cleaned text by applying a series of cleaning functions.
    
    """
    text = clean_text_(text)
    text = fix_hyphenation(text)
    text = remove_special_chars(text)
    text = normalize_unicode(text)
    text = remove_headers_footers(text)
    text = normalize_text(text)
    
    return text

In [31]:
clean_text = full_text_cleanup(unclean_text)
clean_text

'opt a stepbystep guide what is opt? optional practical training work permission for 1 year with option to extend 2 years for stem degrees purpose is to gain experience in your field one year of opt per degree level must be progressively higher i.e. bachelor master doctoral opt application process submit through uscis website dont i need a job to apply for opt? no! why not?? uscis can take more than three months to review your opt application it would be way too late to apply if you waited to find a job first when can i apply? apply for opt during your last semester at umbc the earliest you can apply for opt is 90 days before your i20 end date the latest you can apply for opt is 60 days after your i20 end date spring grads fall grads earliest date feb 25 sept 25 i20 end date may 25 dec 23 latest date july 24 feb 20 approximate opt timeline by grad term another way to think about it i20 end date 90 days 60day grace period earliest date you can submit application latest date you can subm

In [32]:
from langchain.text_splitter import NLTKTextSplitter
def split_text_into_sentences(text):
    """Splits text into sentences using NLTKTextSplitter."""
    text_splitter = NLTKTextSplitter()
    return text_splitter.split_text(text)

sentences = split_text_into_sentences(clean_text)
print(sentences)

['opt a stepbystep guide what is opt?\n\noptional practical training work permission for 1 year with option to extend 2 years for stem degrees purpose is to gain experience in your field one year of opt per degree level must be progressively higher i.e.\n\nbachelor master doctoral opt application process submit through uscis website dont i need a job to apply for opt?\n\nno!\n\nwhy not??\n\nuscis can take more than three months to review your opt application it would be way too late to apply if you waited to find a job first when can i apply?\n\napply for opt during your last semester at umbc the earliest you can apply for opt is 90 days before your i20 end date the latest you can apply for opt is 60 days after your i20 end date spring grads fall grads earliest date feb 25 sept 25 i20 end date may 25 dec 23 latest date july 24 feb 20 approximate opt timeline by grad term another way to think about it i20 end date 90 days 60day grace period earliest date you can submit application lates

In [35]:
# Using Langchain to extract text 
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import NLTKTextSplitter

def extract_text_langchain(pdf_path):
    loader = PyMuPDFLoader(pdf_path)
    documents = loader.load()
    return "\n".join([doc.page_content for doc in documents])


def lang_clean_text(text):
    # text = text.replace("\n", " ").strip()  
    text = full_text_cleanup(text)
    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
    return text_splitter.split_text(text)

def split_text_into_sentences(text):
    """Splits text into sentences using NLTKTextSplitter."""
    text_splitter = NLTKTextSplitter()
    sentences = text_splitter.split_text(text)
    cleaned_sentences = [sentence.replace("\n", " ") for sentence in sentences]
    return cleaned_sentences

lang_text = extract_text_langchain(opt_workshop)
lang_cleaned_text = lang_clean_text(lang_text)
lang_sentences = split_text_into_sentences(lang_cleaned_text[0])

lang_sentences

['opt a stepbystep guide what is opt?  optional practical training work permission for 1 year with option to extend 2 years for stem degrees purpose is to gain experience in your field one year of opt per degree level must be progressively higher i.e.  bachelor master doctoral opt application process submit through uscis website dont i need a job to apply for opt?  no!  why not??  uscis can take more than three months to review your opt application it would be way too late to apply if you waited to find a job first when can i apply?  apply for opt during your last semester at umbc the earliest you can apply for opt is 90 days before your i20 end date the latest you can apply for opt is 60 days after your i20 end date spring grads fall grads earliest date feb 25 sept 25 i20 end date may 25 dec 23 latest date july 24 feb 20 approximate opt timeline by grad term another way to think about it i20 end date 90 days 60day grace period earliest date you can submit application latest date you c

In [36]:
import chromadb
chroma_client = chromadb.Client()

sentence_chunks = lang_sentences

collection = chroma_client.get_or_create_collection(name="my_collection")
collection.add(
    documents=sentence_chunks,
    ids=[f"{i}" for i in range(len(sentence_chunks))]
)


Insert of existing embedding ID: 0
Add of existing embedding ID: 0


In [37]:
results = collection.query(
    query_texts=["How to apply opt at umbc"], # Chroma will embed this for you
    n_results=2 # how many results to return
)

print(results["documents"])

[['you need to think about this!  it is extremely risky to be in the us without health insurance.  see about options your employer might have, or consider the healthcare marketplaces established through the affordable care act obamacare continuing education you are free to take classes during opt but you may not begin a new degree program, or your opt will automatically terminate.  register for courses as a non degreeseeking or visiting student degreeseeking programs would require you to transfer your sevis record or change your education level, which automatically ends your opt approval got it.  lets do this.  ready to complete the opt application?  optional practical training application visit the isss portal isssportal.umbc.edu click the button on your user account, or from the isss portal homepage, find the opt application under f1 students employment optional practical training opt optional practical training application you must complete all sections of the application before cli

# LLM Intergration 

In [38]:
import requests
import json

def ask_ollama(query, context=None):

    context = "-".join(context) if context else "No context provided."
    
    prompt = f"""
            You are an advanced Retrieval-Augmented Generation (RAG) system designed to provide highly accurate and contextually relevant responses. Use *only* the information provided in the context below to generate your answer. Do not use any prior knowledge or external sources. If the context does not contain enough information to answer the question, explicitly state: "I cannot answer this question based on the provided information."

            ## Instructions:
            - Analyze the retrieved context carefully to extract the most relevant details.
            - Ensure that your answer is comprehensive, well-structured, and directly addresses the user's question.
            - If multiple pieces of evidence exist in the context, synthesize them for a cohesive response.
            - If the context is unclear, ambiguous, or conflicting, acknowledge this uncertainty in your response.
            - Do not assume or infer facts beyond what is stated in the provided context.

            ## Context:
            {context}

            ## Question:
            {query}

            ## Answer:
            """
    
    grok_prompt = f"""
        ### Prompt for RAG System

            **Instruction:**
            You are an AI designed to answer queries using a two-step process involving context retrieval and knowledge-based answering. Here's how you should proceed:

            1. **Context Retrieval (Step 1):**
            - **Context:** {context}
            - **Query:** {query}

            First, attempt to answer the query using the provided context. Look for relevant information within the context that directly relates to the query. If you can answer the query comprehensively using only this context, do so. If you cannot:

            2. **Knowledge-Based Answer (Step 2):**
            - If the context does not provide enough information to answer the query accurately, or if the query is not adequately addressed by the context, use your pre-existing knowledge to answer the query. 
            - Be clear that you are now using your knowledge by starting your response with "Based on my knowledge:".

            **Guidelines:**
            - **Accuracy:** Prioritize accuracy. If the context does not provide a clear answer and your knowledge is uncertain or outdated, acknowledge this by saying, "I'm not certain about this, but based on my knowledge:".
            - **Completeness:** If part of the query can be answered with context but not fully, use context for what you can and supplement with knowledge.
            - **Citations:** When answering from context, if possible, reference or quote directly from the context by using quotation marks or by specifying where in the context the answer was found (e.g., "According to the context...").
            - **Admit Limitations:** If neither the context nor your knowledge can provide an answer, admit this by saying, "I do not have enough information to answer this query adequately."

            **Example Response Formats:**

            - **From Context:** "The context states that the boiling point of water at sea level is 100°C."
            - **From Knowledge:** "Based on my knowledge, the average adult human body contains approximately 60% water."
            - **Mixed:** "From the context, we learn that the Eiffel Tower was completed in 1889. Based on my knowledge, it was designed by Gustave Eiffel."
            - **Admitting Limitation:** "I do not have enough information to answer this query adequately."

            **Proceed:**
            Now, attempt to answer the query provided:

            **Query:** {query}

            Your answer should be just explain of your understanding of the question. Dont list steps or any other things. Just explain the concept. Dont say Based on my knowledge or Based on the provided context, on your answer
            Just answer the question directly and in detail simple way
            DONT MENTION ABOUT THE CONTEXT OR THE QUESTION IN YOUR ANSWER
    
    """
    
    # Ollama local API endpoint
    OLLAMA_URL = "http://localhost:11434/api/generate"

    # Define the request payload
    payload = {
        "model": "llama3.2",  # Change this to the model you have installed
        "prompt": grok_prompt,
        "stream": False # Set to True if you want to stream responses
    }

    # Send the request
    response = requests.post(OLLAMA_URL, json=payload)

    # Parse and print response
    if response.status_code == 200:
        data = response.json()
        print(data["response"])
    else:
        print(f"Error: {response.status_code}, {response.text}")




In [42]:
query = ["What start date should I select as my starting date"]
response_query = collection.query(query_texts=query, n_results=1)
context = response_query["documents"][0]

response = ask_ollama(query, context)

You should select an opt start date that aligns with when you plan to begin your Optional Practical Training (OPT) period. This date should ideally be after your I-20 end date but before your OPT start date, allowing for a smooth transition into employment or further education without any visa issues.
