# Reading and Cleaning Text

In [1]:
# pdf_file = "./doc/1706.03762.pdf"
pdf_file = "./doc/2005.11401.pdf"
text_file = "./doc/textfile.txt"
brian_pdf = "./doc/Brian's_Resume.pdf"

In [2]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

In [3]:
unclean_text = extract_text_from_pdf(brian_pdf)
unclean_text

'Brian Temu\n# iq58974@umbc.edu ï in/brian-temu § github.com/iam-dante \x80 www.iam-brian.dev/\nEDUCATION\nUniversity of Maryland Baltimore County\nMay 2025\nMaster’s in Data Science\nMaryland\nUniversity of Dar es salaam\nNov 2019 – Oct 2022\nBachelor of Science in Computer Science\nDar es salaam\nSKILLS\nProgramming Languages: Python, JavaScript(TypeScript), C, C++, and SQL.\nMachine Learning: Pytorch, TensorFlow, MLX, Scikit-learn, Pandas, Numpy, Seaborn, and Matplotlib.\nAI/ML Skills: LLM fine-tuning, sentiment analysis, neural networks, and feature engineering.\nTools: ML flow, Visual Studio Code, Jupyter Notebook, Docker, Git, and Google Colab.\nCourses: Algorithms, Big Data, Database Management Systems, Machine Learning, and Artificial Intelligence.\nWORK EXPERIENCE\nInstitute of Genome Science, UMB\nMay 2024 – Aug 2024\nData Science Intern\nMaryland\n• Leveraged antiSMASH to analyze Bacteria Vaginosis gene clusters, uncovering critical biosynthetic patterns linked to\nrecurrent

In [4]:
import re
import unicodedata

def clean_text_(text):
    # text = text.replace("\n", " ")  # Replace newlines with spaces
    text = re.sub(r'\s+', ' ', str(text))  # Remove extra spaces
    return text.strip()  # Trim leading and trailing spaces

def remove_special_chars(text):
    text = re.sub(r'[^a-zA-Z0-9.,!?\'" ]', '', text)  # Keep letters, numbers, and common punctuation
    return text

def fix_hyphenation(text):
    return re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)  # Removes hyphenation across lines

def normalize_unicode(text):
    return unicodedata.normalize("NFKD", text)

def remove_headers_footers(text):
    lines = text.split("\n")
    cleaned_lines = [line for line in lines if not re.match(r'(Page \d+|Confidential|Company Name)', line)]
    return " ".join(cleaned_lines)

def normalize_text(text):
    return " ".join(text.lower().split())  # Lowercase and remove extra spaces

def full_text_cleanup(text):
    """"
    Takes in unclean text and return cleaned text by applying a series of cleaning functions.
    
    """
    text = clean_text_(text)
    text = fix_hyphenation(text)
    text = remove_special_chars(text)
    text = normalize_unicode(text)
    text = remove_headers_footers(text)
    text = normalize_text(text)
    
    return text

In [5]:
clean_text = full_text_cleanup(unclean_text)
clean_text

'brian temu iq58974umbc.edu inbriantemu github.comiamdante www.iambrian.dev education university of maryland baltimore county may 2025 masters in data science maryland university of dar es salaam nov 2019 oct 2022 bachelor of science in computer science dar es salaam skills programming languages python, javascripttypescript, c, c, and sql. machine learning pytorch, tensorflow, mlx, scikitlearn, pandas, numpy, seaborn, and matplotlib. aiml skills llm finetuning, sentiment analysis, neural networks, and feature engineering. tools ml flow, visual studio code, jupyter notebook, docker, git, and google colab. courses algorithms, big data, database management systems, machine learning, and artificial intelligence. work experience institute of genome science, umb may 2024 aug 2024 data science intern maryland leveraged antismash to analyze bacteria vaginosis gene clusters, uncovering critical biosynthetic patterns linked to recurrent bacteria vaginosis. applied statistical and bioinformatics 

In [6]:
from langchain.text_splitter import NLTKTextSplitter
def split_text_into_sentences(text):
    """Splits text into sentences using NLTKTextSplitter."""
    text_splitter = NLTKTextSplitter()
    return text_splitter.split_text(text)

sentences = split_text_into_sentences(clean_text)
print(sentences)

['brian temu iq58974umbc.edu inbriantemu github.comiamdante www.iambrian.dev education university of maryland baltimore county may 2025 masters in data science maryland university of dar es salaam nov 2019 oct 2022 bachelor of science in computer science dar es salaam skills programming languages python, javascripttypescript, c, c, and sql.\n\nmachine learning pytorch, tensorflow, mlx, scikitlearn, pandas, numpy, seaborn, and matplotlib.\n\naiml skills llm finetuning, sentiment analysis, neural networks, and feature engineering.\n\ntools ml flow, visual studio code, jupyter notebook, docker, git, and google colab.\n\ncourses algorithms, big data, database management systems, machine learning, and artificial intelligence.\n\nwork experience institute of genome science, umb may 2024 aug 2024 data science intern maryland leveraged antismash to analyze bacteria vaginosis gene clusters, uncovering critical biosynthetic patterns linked to recurrent bacteria vaginosis.\n\napplied statistical 

In [8]:
# Using Langchain to extract text 
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import NLTKTextSplitter

def extract_text_langchain(pdf_path):
    loader = PyMuPDFLoader(pdf_path)
    documents = loader.load()
    return "\n".join([doc.page_content for doc in documents])


def lang_clean_text(text):
    # text = text.replace("\n", " ").strip()  
    text = full_text_cleanup(text)
    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
    return text_splitter.split_text(text)

def split_text_into_sentences(text):
    """Splits text into sentences using NLTKTextSplitter."""
    text_splitter = NLTKTextSplitter()
    sentences = text_splitter.split_text(text)
    cleaned_sentences = [sentence.replace("\n", " ") for sentence in sentences]
    return cleaned_sentences

lang_text = extract_text_langchain(brian_pdf)
lang_cleaned_text = lang_clean_text(lang_text)
lang_sentences = split_text_into_sentences(lang_cleaned_text[0])

lang_sentences

['brian temu iq58974umbc.edu inbriantemu github.comiamdante www.iambrian.dev education university of maryland baltimore county may 2025 masters in data science maryland university of dar es salaam nov 2019 oct 2022 bachelor of science in computer science dar es salaam skills programming languages python, javascripttypescript, c, c, and sql.  machine learning pytorch, tensorflow, mlx, scikitlearn, pandas, numpy, seaborn, and matplotlib.  aiml skills llm finetuning, sentiment analysis, neural networks, and feature engineering.  tools ml flow, visual studio code, jupyter notebook, docker, git, and google colab.  courses algorithms, big data, database management systems, machine learning, and artificial intelligence.  work experience institute of genome science, umb may 2024 aug 2024 data science intern maryland leveraged antismash to analyze bacteria vaginosis gene clusters, uncovering critical biosynthetic patterns linked to recurrent bacteria vaginosis.  applied statistical and bioinfor

In [9]:
import chromadb
chroma_client = chromadb.Client()

sentence_chunks = lang_sentences

collection = chroma_client.get_or_create_collection(name="my_collection")
collection.add(
    documents=sentence_chunks,
    ids=[f"{i}" for i in range(len(sentence_chunks))]
)


In [14]:
results = collection.query(
    query_texts=["What are Brian Experiences after Graduating his Bachelors in Computer Science"], # Chroma will embed this for you
    n_results=2 # how many results to return
)

print(results["documents"])

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


[['brian temu iq58974umbc.edu inbriantemu github.comiamdante www.iambrian.dev education university of maryland baltimore county may 2025 masters in data science maryland university of dar es salaam nov 2019 oct 2022 bachelor of science in computer science dar es salaam skills programming languages python, javascripttypescript, c, c, and sql.  machine learning pytorch, tensorflow, mlx, scikitlearn, pandas, numpy, seaborn, and matplotlib.  aiml skills llm finetuning, sentiment analysis, neural networks, and feature engineering.  tools ml flow, visual studio code, jupyter notebook, docker, git, and google colab.  courses algorithms, big data, database management systems, machine learning, and artificial intelligence.  work experience institute of genome science, umb may 2024 aug 2024 data science intern maryland leveraged antismash to analyze bacteria vaginosis gene clusters, uncovering critical biosynthetic patterns linked to recurrent bacteria vaginosis.  applied statistical and bioinfo

# LLM Intergration 

In [11]:
import requests
import json

def ask_ollama(query, context=None):

    context = "-".join(context) if context else "No context provided."
    
    prompt = f"""
            You are an advanced Retrieval-Augmented Generation (RAG) system designed to provide highly accurate and contextually relevant responses. Use *only* the information provided in the context below to generate your answer. Do not use any prior knowledge or external sources. If the context does not contain enough information to answer the question, explicitly state: "I cannot answer this question based on the provided information."

            ## Instructions:
            - Analyze the retrieved context carefully to extract the most relevant details.
            - Ensure that your answer is comprehensive, well-structured, and directly addresses the user's question.
            - If multiple pieces of evidence exist in the context, synthesize them for a cohesive response.
            - If the context is unclear, ambiguous, or conflicting, acknowledge this uncertainty in your response.
            - Do not assume or infer facts beyond what is stated in the provided context.

            ## Context:
            {context}

            ## Question:
            {query}

            ## Answer:
            """
    
    grok_prompt = f"""
        ### Prompt for RAG System

            **Instruction:**
            You are an AI designed to answer queries using a two-step process involving context retrieval and knowledge-based answering. Here's how you should proceed:

            1. **Context Retrieval (Step 1):**
            - **Context:** {context}
            - **Query:** {query}

            First, attempt to answer the query using the provided context. Look for relevant information within the context that directly relates to the query. If you can answer the query comprehensively using only this context, do so. If you cannot:

            2. **Knowledge-Based Answer (Step 2):**
            - If the context does not provide enough information to answer the query accurately, or if the query is not adequately addressed by the context, use your pre-existing knowledge to answer the query. 
            - Be clear that you are now using your knowledge by starting your response with "Based on my knowledge:".

            **Guidelines:**
            - **Accuracy:** Prioritize accuracy. If the context does not provide a clear answer and your knowledge is uncertain or outdated, acknowledge this by saying, "I'm not certain about this, but based on my knowledge:".
            - **Completeness:** If part of the query can be answered with context but not fully, use context for what you can and supplement with knowledge.
            - **Citations:** When answering from context, if possible, reference or quote directly from the context by using quotation marks or by specifying where in the context the answer was found (e.g., "According to the context...").
            - **Admit Limitations:** If neither the context nor your knowledge can provide an answer, admit this by saying, "I do not have enough information to answer this query adequately."

            **Example Response Formats:**

            - **From Context:** "The context states that the boiling point of water at sea level is 100°C."
            - **From Knowledge:** "Based on my knowledge, the average adult human body contains approximately 60% water."
            - **Mixed:** "From the context, we learn that the Eiffel Tower was completed in 1889. Based on my knowledge, it was designed by Gustave Eiffel."
            - **Admitting Limitation:** "I do not have enough information to answer this query adequately."

            **Proceed:**
            Now, attempt to answer the query provided:

            **Query:** {query}

            Your answer should be just explain of your understanding of the question. Dont list steps or any other things. Just explain the concept. Dont say Based on my knowledge
    
    """
    
    # Ollama local API endpoint
    OLLAMA_URL = "http://localhost:11434/api/generate"

    # Define the request payload
    payload = {
        "model": "llama3.2",  # Change this to the model you have installed
        "prompt": grok_prompt,
        "stream": False  # Set to True if you want to stream responses
    }

    # Send the request
    response = requests.post(OLLAMA_URL, json=payload)

    # Parse and print response
    if response.status_code == 200:
        data = response.json()
        print(data["response"])
    else:
        print(f"Error: {response.status_code}, {response.text}")




In [13]:
query = ["What are Brian Experiences after Graduating his Bachelors in Computer Science"]
response_query = collection.query(query_texts=query, n_results=1)
context = response_query["documents"][0]

response = ask_ollama(query, context)

Based on the provided context, it appears that Brian Temu graduated with a Bachelor of Science in Computer Science from the University of Maryland, Baltimore County (UMBC) in May 2025. After graduating, Brian's work experience and skills suggest that he transitioned into a career in data science.

As a Data Science Intern at the Institute of Genome Science from May 2024 to August 2024, Brian worked on analyzing bacteria vaginosis gene clusters using antismash and applied statistical and bioinformatics methods to uncover genetic markers for targeted bacteria vaginosis research. This experience likely honed his skills in machine learning and data analysis.

Brian's role as a Machine Learning Engineer Intern at Tanzania Data Lab from July 2021 to September 2021 further showcased his expertise in collecting, cleaning, and transforming image data, ensuring top-quality training datasets that achieved optimal model performance. He also researched and evaluated machine learning algorithms, ach