In [49]:
import os
from dotenv import load_dotenv
import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.prompts import MessagesPlaceholder


In [50]:
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")

print("API key loaded")

API key loaded


#### Documents collections

In [51]:
documents = []

for pdf_path in glob.glob("documents/*.pdf"):  # adjust folder path
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    documents.extend(docs)

print(f"Loaded {len(documents)} pages from PDFs")


Loaded 7 pages from PDFs


#### Text Splitters

In [52]:
# Create splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
    length_function=len
)

# Split documents
chunks = text_splitter.split_documents(documents)

print(f"Split {len(documents)} documents into {len(chunks)} chunks")
for i, chunk in enumerate(chunks):
    print(f"\nChunk {i+1}: {chunk.page_content}")

Split 7 documents into 33 chunks

Chunk 1: AI & ML Projects
AI & ML Projects
AI & ML Projects (Combined)
AI & ML Projects (updated)
AI & ML Projects (Updated)
Project 1: World Population Forecast
- Objective: Forecast world population trends using demographic and
socio-economic data.

Chunk 2: - Methods: Time series modeling, ARIMA, Prophet, LSTM/Transformer sequence
models.
- Data sources: UN population estimates, World Bank indicators.
Project 2: Predicting Food Prices in Nigeria
- Objective: Build models to predict food price movements in Nigerian markets.

Chunk 3: - Methods: Regression models, tree-based models (XGBoost/LightGBM), temporal
models.
- Features: Supply indicators, weather, inflation, transport costs, seasonality.
Project 3: Indian Credit Card / Car Price Predicting

Chunk 4: - Objective: Predict credit card default risk and car price/value estimation in Indian
markets.
- Methods: Classification for credit risk; regression for car price prediction using
features like 

#### Embeddings

In [53]:
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=api_key
)

# Test embedding
test_embedding = embeddings.embed_query("What is RAG?")
print(f"Embedding dimension: {len(test_embedding)}")
print(f"First 5 values: {test_embedding[:5]}")

Embedding dimension: 1536
First 5 values: [0.0006281227106228471, 0.02569717727601528, 0.007161187008023262, 0.03336399793624878, -0.031968604773283005]


#### Vector Store
# Create vector store from documents

In [54]:
vectorstore = Chroma.from_documents(
    chunks,
    embeddings,
    collection_name="my_info_collection",
    persist_directory="./chroma_db"
)

In [55]:
# Test retriver
query = "Technical skills"

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

results = retriever.invoke(query)
results


[Document(metadata={'moddate': '2025-12-14T18:59:52+01:00', 'keywords': '', 'title': '(anonymous)', 'page_label': '1', 'trapped': '/False', 'total_pages': 1, 'creator': '(unspecified)', 'subject': '(unspecified)', 'source': 'documents\\Professional Resume.pdf', 'page': 0, 'author': '(anonymous)', 'creationdate': '2025-12-14T18:59:52+01:00', 'producer': 'ReportLab PDF Library - (opensource)'}, page_content='natural language processing, and data-driven decision-making. Experienced in designing,\ntraining, and deploying machine learning models.\nEducation\nM.Sc. Artificial Intelligence; B.Sc. Applied Mathematics\nTechnical Skills\n- Programming: Python, SQL, JavaScript'),
 Document(metadata={'trapped': '/False', 'source': 'documents\\Professional Resume.pdf', 'page': 0, 'creationdate': '2025-12-14T18:59:52+01:00', 'subject': '(unspecified)', 'author': '(anonymous)', 'moddate': '2025-12-14T18:59:52+01:00', 'total_pages': 1, 'producer': 'ReportLab PDF Library - (opensource)', 'keywords': ''

#### Conversational Rag

In [56]:
# Create LLM
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=api_key
)

# Create retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2}
)

# prompt
prompt = ChatPromptTemplate.from_template("""
You are an AI assistant answering questions about Kehinde Akindele using the provided documents.

Use ONLY the context below to answer the question.
If the answer is not in the context, say "I don't know."

<context>
{context}
</context>

Question: {question}

Answer in clear sentences.
At the end, list the sources you used as bullet points.
""")

# format documents
def format_docs(docs):
    return "\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
        for doc in docs
    )

# RAG chain Using LCEL
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)


In [57]:
query = "What AI projects has Kehinde worked on?"
response = rag_chain.invoke(query)
print(response)

Kehinde Akindele has worked on AI projects that involve building intelligent systems combining predictive modeling. Specific details about the AI projects are not provided in the context.

Sources:
- documents\Professional Resume.pdf


#### Conversational RAG

In [59]:
# Store for chat histories
chat_store = {}

def get_session_history(session_id: str):
    if session_id not in chat_store:
        chat_store[session_id] = InMemoryChatMessageHistory()
    return chat_store[session_id]

# Create conversational prompt
conv_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an AI assistant answering questions about Kehinde Akindele using the provided documents. Use ONLY the context below to answer the question. If the answer is not in the context, say I don't know."),
    MessagesPlaceholder(variable_name="chat_history"),
    ("system", "Answer in clear sentences. At the end, list the sources you used as bullet points."),
    ("human", "Context: {context}\n\nQuestion: {question}")
])

# Build base chain
conv_chain_base = (
    RunnableParallel(
        context=lambda x: format_docs(retriever.invoke(x["question"])),
        question=lambda x: x["question"],
        chat_history=lambda x: x.get("chat_history", [])
    )
    | conv_prompt
    | llm
    | StrOutputParser()
)

# Wrap with message history
conv_chain = RunnableWithMessageHistory(
    conv_chain_base,
    get_session_history,
    input_messages_key="question",
    history_messages_key="chat_history"
)



**Questions**

In [60]:
# First question
response = conv_chain.invoke(
    {"question": "What projects has Kehinde worked on?"},
    config={"configurable": {"session_id": "user_1"}}
)
print("Response 1:\n", response)

# Follow-up question
response2 = conv_chain.invoke(
    {"question": "Which of those involve RAG systems?"},
    config={"configurable": {"session_id": "user_1"}}
)

print("\nResponse 2:\n", response2)

Response 1:
 Kehinde Akindele has worked on projects involving building intelligent systems that combine predictive modeling.

Response 2:
 Kehinde Akindele has worked on projects involving RAG systems in building predictive models for sales forecasting in e-commerce and developing AI-powered chatbots for document-based question answering. 

Sources:
- Professional Resume.pdf
