In [2]:
import getpass
import os

# Only prompt interactively when the environment holds no (non-empty) OpenAI
# key, so the secret never has to be typed into the notebook source.
if os.environ.get("OPENAI_API_KEY", "") == "":
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

# Chat model handle; the `generate` step defined later in the notebook calls it.
llm = init_chat_model(model="gpt-4o", model_provider="openai")

In [3]:
#%pip install -qU langchain-openai

In [4]:

from langchain_openai import OpenAIEmbeddings

# Embedding model handle; later cells pass it to both the in-memory vector
# store and the FAISS index.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [5]:
#%pip install -qU langchain-core


In [6]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store backed by the `embeddings` model created above.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [18]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict





# Fetch the source pages, keeping only HTML elements whose CSS class is one of
# the three post-related names below.
# NOTE(review): the same class filter is applied to both URLs — confirm the
# second site actually uses these class names, otherwise its document may be empty.
post_only = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
loader = WebBaseLoader(
    web_paths=(
        "https://lilianweng.github.io/posts/2023-06-23-agent/",
        "https://www.greatyellow.earth/",
    ),
    bs_kwargs={"parse_only": post_only},
)
docs = loader.load()

# Break the loaded pages into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Add the chunks to the vector store; the return value is deliberately discarded.
_ = vector_store.add_documents(documents=all_splits)

# Define prompt for question-answering
# N.B. for non-US LangSmith endpoints, you may need to specify
# api_url="https://api.smith.langchain.com" in hub.pull.
#prompt = hub.pull("rlm/rag-prompt")

from langchain.prompts import PromptTemplate

# Question-answering template with two placeholders: {question} and {context}.
# It is assembled from adjacent literals so the leading newline and the trailing
# spaces after each placeholder are visible rather than hidden in a block string.
RAG_TEMPLATE = (
    "\n"
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)

prompt = PromptTemplate.from_template(RAG_TEMPLATE)


# Define state for application
class State(TypedDict):
    """State schema handed to `StateGraph` and read/written by the graph steps."""
    # The user's question; read by both `retrieve` and `generate`.
    question: str
    # Documents produced by `retrieve` and consumed by `generate`.
    context: List[Document]
    # Answer text produced by `generate`.
    answer: str


# Define application steps
def retrieve(state: State):
    """Look up documents similar to the question in the vector store.

    Reads ``state["question"]`` and returns a partial state update that holds
    the search results under the ``"context"`` key.
    """
    matches = vector_store.similarity_search(state["question"])
    return dict(context=matches)


def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the page content of every document in ``state["context"]`` with blank
    lines, fills the prompt with that text and the question, sends the result
    to the chat model, and returns the reply's content under ``"answer"``.
    """
    passages = [doc.page_content for doc in state["context"]]
    prompt_value = prompt.invoke(
        {"question": state["question"], "context": "\n\n".join(passages)}
    )
    reply = llm.invoke(prompt_value)
    return {"answer": reply.content}


# Wire the two steps into a linear graph (START -> retrieve -> generate) and compile it.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [19]:
docs

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistake

In [29]:
# Run the retrieve -> generate graph on one question and print only the answer text.
# NOTE(review): the saved answer discusses material that the web-page indexing cell
# in this notebook does not obviously load — presumably the vector store held other
# documents when this ran; confirm with Restart Kernel -> Run All.
response = graph.invoke({"question": "explain the differences between a type 1, 2, and 3 model"})
print(response["answer"])

Type 1 models support the concept of “passthrough,” where acceptable publisher simulations are used with minor transformations. Type 2 models, known as Customer id OHE models, one-hot encode the `pca_id` column and exclude `client_id` and `campaign_id`, making them suitable for inferences where those columns weren't present in prior data. Type 3 models, or Campaign id OHE models, one-hot encode `campaign_id` and `client_id` along with others, allowing them to generate inferences for campaigns present in the previous week's training set.


In [14]:
# Run the graph on one question and print only the answer text.
response = graph.invoke({"question": "Who were the people resonabily for buidling and developing sight?"})
print(response["answer"])

The people responsible for building and developing SIGHT were primarily from the Optimization team at Marin Software, with contributions from authors and editors like Aniket Bharati and Kevin Dipasupil.


In [29]:
# Run the graph on one question and print only the answer text.
# The input carries only this question (no earlier turns), so "his" is resolved
# solely by whatever documents the retriever returns.
response = graph.invoke({"question": "What was his biggest achievement?"})
print(response["answer"])

His biggest achievement was leading the development of the 'Mother of All Models' (MoAM) which improved prediction accuracy by 40-60% and increased coverage from 4% to 100% across all ad types and publishers.


In [31]:
# Run the graph on one question and print only the answer text.
response = graph.invoke({"question": "how old is joe?"})
print(response["answer"])

I don't know how old Joe is based on the provided context.


In [34]:
# Run the graph on one question and print only the answer text.
response = graph.invoke({"question": "how many years experience does joe have in software engineeeing?"})
print(response["answer"])

Joe Southin has over 19 years of experience in software engineering and related fields, starting his career in 2001 at TDK Corporation and continuing through his role as Director of Engineering at Marin Software from 2017 to the present.


In [15]:
import getpass
import os

# SECURITY: a LangSmith API key used to be hard-coded in this cell. Treat that
# key as leaked and rotate it. The value is now taken from the environment, or
# requested interactively, so it never lands in the saved notebook.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter API key for LangSmith: ")
os.environ["LANGCHAIN_PROJECT"] = "default"


In [15]:
%pip install -qU pypdf

Note: you may need to restart the kernel to use updated packages.


In [7]:
from langchain_community.document_loaders import PyPDFLoader

file_path = (
    "/Users/jsouthin/Documents/Joe Southin - CV 2025 (A4).pdf"
)

loader = PyPDFLoader(file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [17]:
# Show the first loaded page: its metadata, a blank line, then the extracted text.
# NOTE(review): the saved output of this cell contains personal contact
# details — clear outputs before sharing or committing the notebook.
first_page = pages[0]
print(f"{first_page.metadata}\n")
print(first_page.page_content)

{'producer': 'Skia/PDF m135', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', 'creationdate': '2025-05-02T20:27:58+00:00', 'title': 'Joe Southin - CV 2025 ML Variant (A4) - Google Docs', 'moddate': '2025-05-02T20:27:58+00:00', 'source': '/Users/jsouthin/Documents/Joe Southin - CV 2025 (A4).pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}

JOE SOUTHIN 
 DATA SCIENCE & ENGINEERING LEADER 
 DRIVING SCALABLE INNOVATION & TEAM EXCELLENCE 
 Clophill, Beds, UK  joe.southin@gmail.com  +44 (0)7917 903642  linkedin.com/in/joesouthin 
 PROFESSIONAL SUMMARY 
 Machine learning and data science leader with a proven record of delivering scalable predictive systems and 
 measurable business impact. Experienced in building and leading global teams, architecting ML pipelines in enterprise 
 software, and applying advanced statistical techniques to solve high-value problems in marketing, forecasting, and 
 customer a

In [18]:
pages

[Document(metadata={'producer': 'Skia/PDF m135', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', 'creationdate': '2025-05-02T20:27:58+00:00', 'title': 'Joe Southin - CV 2025 ML Variant (A4) - Google Docs', 'moddate': '2025-05-02T20:27:58+00:00', 'source': '/Users/jsouthin/Documents/Joe Southin - CV 2025 (A4).pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content="JOE SOUTHIN \n DATA SCIENCE & ENGINEERING LEADER \n DRIVING SCALABLE INNOVATION & TEAM EXCELLENCE \n Clophill, Beds, UK  joe.southin@gmail.com  +44 (0)7917 903642  linkedin.com/in/joesouthin \n PROFESSIONAL SUMMARY \n Machine learning and data science leader with a proven record of delivering scalable predictive systems and \n measurable business impact. Experienced in building and leading global teams, architecting ML pipelines in enterprise \n software, and applying advanced statistical techniques to solve high-value problems in 

In [19]:
docs

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistake

In [35]:
%pip install langchain google-api-python-client google-auth


Collecting google-api-python-client
  Downloading google_api_python_client-2.169.0-py3-none-any.whl.metadata (6.7 kB)
Collecting google-auth
  Downloading google_auth-2.40.1-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting httplib2<1.0.0,>=0.19.0 (from google-api-python-client)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5 (from google-api-python-client)
  Downloading google_api_core-2.24.2-py3-none-any.whl.metadata (3.0 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client)
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting cachetools<6.0,>=2.0.0 (from google-auth)
  Downloading cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyasn1-modules>=0.2.1 (from google-auth)
  Downloadin

In [11]:
import os
from pathlib import Path

from langchain_community.document_loaders import GoogleDriveLoader

# The service-account key is a credential file on the local disk. Its location
# can be supplied through GOOGLE_SERVICE_ACCOUNT_KEY; the previous hard-coded
# path stays as the default so existing runs behave the same.
service_account_key = Path(
    os.environ.get(
        "GOOGLE_SERVICE_ACCOUNT_KEY",
        "/Users/jsouthin/Downloads/lively-tensor-432422-c0-407d22e805d2.json",
    )
)

loader = GoogleDriveLoader(
    document_ids=["1xCW-ZiquUxwBLpMTn9kL6WrvPlfeBYpDuicQhiIR81w"],
    service_account_key=service_account_key,
)

# NOTE: this rebinds `docs`, replacing the web pages loaded by the earlier
# WebBaseLoader cell; any later cell that reads `docs` sees the Drive document.
docs = loader.load()

# Preview each loaded document: metadata plus the first 300 characters of text.
for doc in docs:
    print(doc.metadata)
    print(doc.page_content[:300])


{'source': 'https://docs.google.com/document/d/1xCW-ZiquUxwBLpMTn9kL6WrvPlfeBYpDuicQhiIR81w/edit', 'title': 'Ultimate guide_ Marin Sight', 'when': '2025-05-20T18:51:34.707Z'}
﻿Document Control
Overview
Problem statement
Feature description
Links
Abbreviations / Definitions
Phase 1 - MVP - Python scripts on appops machine
Prototype version
Production version
Flow diagram
Budget Optimizer Jobs Flow overview
[WIP] Feature description (verbose)
Job wise breakdown


In [11]:
%pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

Collecting google-auth-oauthlib
  Downloading google_auth_oauthlib-1.2.2-py3-none-any.whl.metadata (2.7 kB)
Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib)
  Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib)
  Downloading oauthlib-3.2.2-py3-none-any.whl.metadata (7.5 kB)
Downloading google_auth_oauthlib-1.2.2-py3-none-any.whl (19 kB)
Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl (24 kB)
Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)
Installing collected packages: oauthlib, requests-oauthlib, google-auth-oauthlib
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [google-auth-oauthlib]1/3[0m [requests-oauthlib]
[1A[2KSuccessfully installed google-auth-oauthlib-1.2.2 oauthlib-3.2.2 requests-oauthlib-2.0.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install -U langchain-google-community


Collecting langchain-google-community
  Downloading langchain_google_community-2.0.7-py3-none-any.whl.metadata (3.5 kB)
Collecting google-cloud-core<3.0.0,>=2.4.2 (from langchain-google-community)
  Downloading google_cloud_core-2.4.3-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting grpcio<2.0.0,>=1.70.0 (from langchain-google-community)
  Downloading grpcio-1.71.0-cp311-cp311-macosx_10_14_universal2.whl.metadata (3.8 kB)
Downloading langchain_google_community-2.0.7-py3-none-any.whl (99 kB)
Downloading google_cloud_core-2.4.3-py2.py3-none-any.whl (29 kB)
Downloading grpcio-1.71.0-cp311-cp311-macosx_10_14_universal2.whl (11.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m01[0m
[?25hInstalling collected packages: grpcio, google-cloud-core, langchain-google-community
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [langchain-google-co

In [16]:
# Import FAISS from langchain_community, the package the class actually lives in
# (see the object repr shown later in this notebook); the `langchain.vectorstores`
# path is a deprecated re-export.
from langchain_community.vectorstores import FAISS

In [20]:
# Build a FAISS index from the currently loaded `docs` using the shared
# embedding model, then persist it to the relative folder "faiss_index".
faiss_index = FAISS.from_documents(documents=docs, embedding=embeddings)
faiss_index.save_local(folder_path="faiss_index")

In [19]:
%pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-macosx_14_0_arm64.whl (3.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m1 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Note: you may need to restart the kernel to use updated packages.


In [22]:
# Reload the index from the "faiss_index" folder written by `save_local` in this notebook.
# SECURITY: `allow_dangerous_deserialization=True` opts in to a load path the
# library itself labels dangerous; only use it on index folders you created
# yourself, never on files from an untrusted source.
faiss_index = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [23]:
faiss_index

<langchain_community.vectorstores.faiss.FAISS at 0x117505650>

In [30]:
pip install -U langchain langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [32]:
import getpass
import os

# SECURITY: LangSmith and OpenAI API keys used to be pasted into this cell in
# plain text. Treat both as leaked and rotate them. Keys are now read from the
# environment or requested interactively, so they never land in the notebook.
#
# The original lines were dotenv-style assignments: `true` is not a Python name
# (NameError), and plain Python variables do not populate the process
# environment, so the settings are written to os.environ instead.
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://eu.api.smith.langchain.com"

# Prompt only for keys that are not already present (same pattern as the
# OpenAI key cell at the top of the notebook).
for _var, _label in (("LANGSMITH_API_KEY", "LangSmith"), ("OPENAI_API_KEY", "OpenAI")):
    if not os.environ.get(_var):
        os.environ[_var] = getpass.getpass(f"Enter API key for {_label}: ")