# Install necessary packages
we will use langchain and and google generative ai sdk

In [None]:
!pip install langchain --quiet
!pip install langchain-google-genai --quiet
!pip install --upgrade --quiet  langchain-core langchain-community
!pip install --upgrade langchain --quiet
!pip install -U langchain-text-splitters --quiet
!pip install pypdf --quiet
# Assuming an embedding model is also set up via LangChain or another library

In [20]:
!pip install -q -U google-generativeai==0.7.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/163.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m153.6/163.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.1/163.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/717.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/717.3 kB[0m [31m22.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.3/717.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h

### add your google ai api key in the secrets in colab

In [22]:
# Used to securely store your API key
from google.colab import userdata

In [24]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

# for embedding model
genai.configure(api_key=GOOGLE_API_KEY)

In [56]:
import getpass
import os

# for llm
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# Load PDF and generate embeddings

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import format_document
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI, HarmBlockThreshold, HarmCategory

import numpy as np
import json

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)

In [4]:
# Load PDF
loader = PyPDFLoader("/content/drive/MyDrive/Capstone Project/dataviz_1.pdf")
pages = loader.load_and_split()

# Split text into manageable chunks
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)
document_chunks = text_splitter.split_documents(pages)

check the page content of first chunks

In [16]:
document_chunks[20].page_content

'Length\n20Incorrect bar graph on left \nand correct one on the right . \nThe chart on the right shows the \nchange when the axis starts at \nzero, which looks less dramatic \nfor differences.'

### test the embedding algorithm

In [25]:
import google.generativeai as genai

result = genai.embed_content(
    model="models/embedding-001",
    content="What is the meaning of life?",
    task_type="retrieval_document",
    title="Embedding of single string")

# 1 input > 1 vector output
print(str(result['embedding'])[:50], '... TRIMMED]')

[-0.003216741, -0.013358698, -0.017649598, -0.0091 ... TRIMMED]


In [28]:
len(result['embedding'])

768

### retrieve the embeddings of the pdf contents, chunk to chunk

In [33]:
# Generate embeddings
def generate_embeddings(text_chunks):
    return genai.embed_content(
    model="models/embedding-001",
    content=text_chunks,
    task_type="retrieval_document")['embedding']

embeddings = generate_embeddings([chunk.page_content for chunk in document_chunks])

In [43]:
print("total dim of embeddings from the whoe pdf (len of chunks, 768 vectors)")
len(embeddings), len(embeddings[0])

total dim of embeddings from the whoe pdf (len of chunks, 768 vectors)


(60, 768)

# Define a function to retrieve relevant sections

use **cosine similarity to retrive the top n similar chunks** from the pdf based on the query

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_relevant_sections(query, embeddings, chunks, top_k=3):
    query_embedding = generate_embeddings([query])[0]  # Assuming single query
    scores = cosine_similarity([query_embedding], embeddings)[0]
    top_indices = np.argsort(scores)[-top_k:]
    return [chunks[i] for i in top_indices]


# Invoke the pipeline on a sample query

In [41]:
# Sample query
query = "Describe the main visualization techniques used in the data."

# Retrieve relevant sections
relevant_sections = retrieve_relevant_sections(query, embeddings, [chunk.page_content for chunk in document_chunks], top_k=5)

In [59]:
for i, rs in enumerate(relevant_sections):
  print("Relevant section", i)
  print(rs)

Relevant section 0
Boxplot on a
normal 
distribution
50Credit: https ://towardsdatascience.com/
Relevant section 1
Symbol plots
41
Relevant section 2
Boxplot
48
Credit: https ://towardsdatascience.com/Fence
Relevant section 3
Boxplot –
Notations
49Credit: https ://towardsdatascience.com/
Relevant section 4
33
Viewed separately, the visualization 
components aren’t that useful because 
they are just bits of geometry floating in 
an empty space without context. 
However, when we put the components 
togethe r, you get a complete visualization 
worth looking at.


# Setup LLM and create prompt

list the available gemini models

In [60]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-pro-exp-0827
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-exp-0827
models/gemini-1.5-flash-8b-exp-0827


we'll use the latest gemini-1.5-pro

In [79]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest",
                             safety_settings={
                                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                                # HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                                }, temperature=0.7, top_p=0.85)

prompt = ChatPromptTemplate.from_template("Analyze the content following: {text} and answer the question: {question} \
                                            give the output in json format: {{'output': <output>\}}")

build a llm chain and invoke the prompts

In [80]:
llm_chain = (
              # Extract data from documents and add to the key "text"
              {
                "text": lambda content: "\n\n".join(s for s in content),
                "question": lambda question: question
                }
              # Prompt for Gemini
              | prompt
              # Gemini function
              | llm
              # Output parser
              | StrOutputParser()
            )

### Output

In [85]:
output = llm_chain.invoke({"text": relevant_sections, "question": query})

In [98]:
def get_json_output(response):
    response = response.strip("`json JSON").replace('\n','').replace('\t','')
    return json.loads(response)

In [102]:
get_json_output(output)['output']

"The provided text mentions several visualization techniques, primarily focusing on **boxplots**:* **Boxplot:** This is the central visualization method highlighted, emphasized by its repetition and detailed breakdown across different text segments. * **Symbol plots:** While briefly mentioned, the exact nature of these plots isn't detailed. They could represent various things like scatter plots, dot plots, etc., depending on the data and context.* **Notations:** This refers to the use of labels, titles, axis descriptions, and other textual elements within the visualizations to provide context and clarity.**The text emphasizes the importance of combining these elements.** Individual components like boxes, whiskers, or symbols are less meaningful in isolation.  A complete visualization combines these techniques to represent data distribution, outliers, and central tendencies effectively. "