# Step 1: Install the Required Libraries

In [1]:
!pip install openai torch langchain PyMuPDF langchain_community sentence-transformers chromadb langchain-openai

Collecting openai
  Downloading openai-1.35.13-py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting langchain
  Downloading langchain-0.2.7-py3-none-any.whl (983 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.6/983.6 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_community
  Downloading langchain_community-0.2.7-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [8]:
import os
from langchain_openai import ChatOpenAI

# Do not hardcode the key, and do not overwrite one that is already exported:
# assigning "" here would clobber a valid OPENAI_API_KEY coming from the
# environment. Prompt only when the key is missing so the secret is never
# stored in the notebook source or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed on this path

    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

def load_model_openai(model_name):
    """Build a ``ChatOpenAI`` client from a name of the form ``"openai/<model-id>"``.

    Args:
        model_name: Provider-prefixed model name, e.g. ``"openai/gpt-4o"``.
            The ``openai/`` prefix is matched case-insensitively; the model ID
            is the text after the last ``/``.

    Returns:
        A ``ChatOpenAI`` instance configured with ``temperature=0.2`` and the
        key read from ``OPENAI_API_KEY``, or ``None`` if the model could not be
        loaded. Failures are never raised to the caller: the reason is printed
        and ``None`` is returned, so callers must check the result.
    """
    try:
        # Reject unsupported names explicitly; previously they fell off the end
        # of the function and returned None without any explanation.
        if not model_name.lower().startswith('openai/'):
            raise ValueError("Unsupported model name; expected the form 'openai/<model-id>'.")

        model_id = model_name.split("/")[-1]  # Extract the model ID from the name
        if not model_id:
            raise ValueError("Model ID is missing after the 'openai/' prefix.")
        print('OpenAI model: ', model_id)

        openai_api_key = os.getenv('OPENAI_API_KEY')
        if not openai_api_key:
            raise ValueError("OpenAI API key not found in environment variables.")
        return ChatOpenAI(model=model_id, temperature=0.2, api_key=openai_api_key)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading model '{model_name}': {e}")
        return None

# Smoke test: build the GPT-4o chat client and show how it was configured
# (prints "model: None" if loading failed).
llm = load_model_openai("openai/gpt-4o")
print(f"model: {llm}")

OpenAI model:  gpt-4o
model: client=<openai.resources.chat.completions.Completions object at 0x79a81a1015d0> async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x79a7ef1c8c10> model_name='gpt-4o' temperature=0.2 openai_api_key=SecretStr('**********') openai_proxy=''


# Step 2: Extract Text from PDF

In [10]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        One string holding the ``get_text()`` output of all pages back to back
        (an empty string when the document has no pages).
    """
    # The context manager closes the document even if extraction fails
    # part-way through; previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying it per page.
        return "".join(page.get_text() for page in doc)

pdf_text = extract_text_from_pdf("1980-oxford.pdf")

# Show only a short preview: printing the whole article bloats the saved
# notebook and buries the cells that follow. The full text stays in pdf_text.
PREVIEW_CHARS = 1000
print(f"Extracted {len(pdf_text):,} characters. Preview:\n")
print(pdf_text[:PREVIEW_CHARS])

Life & Arts
Why 1980s Oxford holds the key to Britain’s ruling class
From Partygate to Brexit, many of today’s political dramas can be traced back to the
leading players’ student days
6/15/24, 1:04 PM
Why 1980s Oxford holds the key to Britain’s ruling class
https://www.ft.com/content/2fa1e436-a5c7-43b1-9e5a-b1e1b43b8c3a
1/16
APRIL 14 2022
There’s a probably apocryphal saying that is attributed to Napoleon: “To understand the man, you
have to know what was happening in the world when he was 20.”
That’s how I try to understand the people now running the UK. Specifically, I imagine them as they
were at the University of Oxford, preparing for power. I see Boris Johnson as the tousle-headed
president of the Oxford Union debating society, in 1986. Michael Gove, today Johnson’s right-hand
man in cabinet, was the Union’s most incisive debater, despite his stock joke, “As I was telling my
Filipino manservant this morning . . . ” Their future political rivals, from David Cameron to Keir
Starmer,

# Step 3: PDF RAG with Hugging Face Embeddings, OpenAI, and LangChain

In [14]:
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.chains.question_answering import load_qa_chain

# Initialize the sentence-embedding model used to index and query the PDF text
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a vector store for retrieval.
# Use a dedicated collection: without a persist directory, Chroma stores created
# in the same kernel appear to share one in-memory client, so every store left
# on the default collection would mix its documents with the others (the CSV
# store built later in this notebook, for instance).
vector_store = Chroma(collection_name="pdf_article", embedding_function=embeddings)

# A fixed id makes this cell safe to re-run: the text is upserted rather than
# added a second time.
# NOTE(review): the article is indexed as a single document, so the retriever
# can only ever return the full text — consider splitting it into chunks.
vector_store.add_texts([pdf_text], ids=["pdf_article-0"])

# Define the prompt template; {context} receives the retrieved text and
# {question} the user's query
prompt_template = PromptTemplate(
    template="Use the following context to answer the question: {context}\nQuestion: {question}\nAnswer:",
    input_variables=["context", "question"]
)

# Load the OpenAI model
llm = load_model_openai("openai/gpt-4o")

# load_model_openai reports failures by returning None, so fail fast here
if llm is None:
    raise ValueError("Failed to load the language model.")

# Create the QA chain ("stuff" = all retrieved documents go into one prompt)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    chain_type_kwargs={"prompt": prompt_template}
)

# Ask a question.
# Chain.run() is deprecated; invoke() takes the question under RetrievalQA's
# "query" key and returns the answer under "result".
question = "What is the main topic of the PDF?"
answer = qa_chain.invoke({"query": question})["result"]
print(answer)


  warn_deprecated(


OpenAI model:  gpt-4o
The main topic of the PDF is the influence of the University of Oxford in the 1980s on the current British ruling class. It explores how the student days of key political figures, particularly those involved in Brexit and other significant political events, shaped their ideologies and careers, ultimately leading to their prominent roles in the UK government today. The document also discusses the concept of "chumocracy" and the lifelong connections formed at Oxford that have impacted British politics and governance.


# Step 4: CSV RAG

In [15]:
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI

# Read the CSV file
df = pd.read_csv('incident-log.csv')

# Initialize the embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load the DataFrame: one document per row, with the incident description as
# the page content
loader = DataFrameLoader(df, page_content_column="IncidentDescription")
documents = loader.load()

# Split the documents
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Create a vector store for retrieval.
# Use a dedicated collection: without a persist directory, Chroma stores created
# in the same kernel appear to share one in-memory client, so a store left on
# the default collection would also return documents indexed by other cells
# (for example the PDF text). Stable ids make re-running this cell overwrite
# the entries instead of inserting duplicates.
vector_store = Chroma.from_documents(
    texts,
    embedding=embeddings,
    ids=[f"incident-{i}" for i in range(len(texts))],
    collection_name="incident_log",
)

# Define the prompt template
prompt_template = PromptTemplate(
    template="Use the following context to answer the question about gas station incidents: {context}\nQuestion: {question}\nAnswer:",
    input_variables=["context", "question"]
)

# Load the OpenAI model (replace with your preferred model)
llm = OpenAI()

# Create the QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    chain_type_kwargs={"prompt": prompt_template}
)

# Ask a question.
# Chain.run() is deprecated; invoke() takes the question under RetrievalQA's
# "query" key and returns the answer under "result".
question = "What are the most common types of incidents at the gas station?"
answer = qa_chain.invoke({"query": question})["result"]
print(answer)

  warn_deprecated(


 The most common types of incidents at the gas station include fuel tank contamination, gas station logo light out, gas station roof damage, and debris in fuel.


# Step 5: Similarity

In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter

# Load the incident log into a DataFrame
df = pd.read_csv("incident-log.csv")

# Build the sentence-embedding model used for every similarity lookup below
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Thin wrapper around the module-level embedding model
def get_embeddings(texts):
    """Embed a list of texts with the global ``embeddings`` model.

    Args:
        texts: List of strings to embed.

    Returns:
        Whatever ``embeddings.embed_documents`` returns for ``texts``
        (presumably one vector per input text — confirm against the model).
    """
    vectors = embeddings.embed_documents(texts)
    return vectors

# Embed every logged incident description once, up front, so each lookup only
# has to embed the new incident
incident_embeddings = get_embeddings(
    df['IncidentDescription'].to_list()
)

# Nearest-neighbour lookup of a new incident against the logged incidents
def find_similar_incident(new_incident):
    """Find the logged incident closest to ``new_incident`` by cosine similarity.

    Args:
        new_incident: Free-text description of the incident to look up.

    Returns:
        A ``(IncidentDescription, SolutionType)`` tuple taken from the row of
        the global ``df`` whose precomputed embedding in ``incident_embeddings``
        scores highest against the embedding of ``new_incident``.
    """
    # Embed the query; get_embeddings works on lists, so wrap and unwrap
    query_vector = get_embeddings([new_incident])[0]

    # One similarity score per logged incident (first row of the 1 x N result)
    scores = cosine_similarity([query_vector], incident_embeddings)[0]

    # Row position of the best score selects the matching log entry
    best_match = df.iloc[np.argmax(scores)]
    return best_match['IncidentDescription'], best_match['SolutionType']

# Example: look up the closest logged incident for a new complaint
new_incident = "Customer complaining about slow fuel pump"
similar_incident, solution_type = find_similar_incident(new_incident)

# Report the match and the solution type recorded for it
print(f"New Incident: {new_incident}")
print(f"Most Similar Incident in Log: {similar_incident}")
print(f"Recommended Solution Type: {solution_type}")

# Only 'TechnicianDispatched' leads to a dispatch; any other solution type
# falls back to phone support
print(
    "Recommendation: Dispatch a technician."
    if solution_type == 'TechnicianDispatched'
    else "Recommendation: Try to solve the issue over the phone."
)



New Incident: Customer complaining about slow fuel pump
Most Similar Incident in Log: Fuel truck late
Recommended Solution Type: TechnicianDispatched
Recommendation: Dispatch a technician.
